summaryrefslogtreecommitdiffstats
path: root/fs/nfs
diff options
context:
space:
mode:
authorAnton Arapov <anton@redhat.com>2012-04-16 10:05:28 +0200
committerAnton Arapov <anton@redhat.com>2012-04-16 10:05:28 +0200
commitb4b6116a13633898cf868f2f103c96a90c4c20f8 (patch)
tree93d1b7e2cfcdf473d8d4ff3ad141fa864f8491f6 /fs/nfs
parentedd4be777c953e5faafc80d091d3084b4343f5d3 (diff)
downloadkernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.gz
kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.xz
kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.zip
fedora kernel: d9aad82f3319f3cfd1aebc01234254ef0c37ad84v3.3.2-1
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/Kconfig134
-rw-r--r--fs/nfs/Makefile26
-rw-r--r--fs/nfs/blocklayout/Makefile5
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1084
-rw-r--r--fs/nfs/blocklayout/blocklayout.h211
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c391
-rw-r--r--fs/nfs/blocklayout/blocklayoutdm.c111
-rw-r--r--fs/nfs/blocklayout/extents.c909
-rw-r--r--fs/nfs/cache_lib.c140
-rw-r--r--fs/nfs/cache_lib.h27
-rw-r--r--fs/nfs/callback.c403
-rw-r--r--fs/nfs/callback.h213
-rw-r--r--fs/nfs/callback_proc.c576
-rw-r--r--fs/nfs/callback_xdr.c997
-rw-r--r--fs/nfs/client.c2019
-rw-r--r--fs/nfs/delegation.c716
-rw-r--r--fs/nfs/delegation.h80
-rw-r--r--fs/nfs/dir.c2349
-rw-r--r--fs/nfs/direct.c1039
-rw-r--r--fs/nfs/dns_resolve.c372
-rw-r--r--fs/nfs/dns_resolve.h26
-rw-r--r--fs/nfs/file.c899
-rw-r--r--fs/nfs/fscache-index.c337
-rw-r--r--fs/nfs/fscache.c535
-rw-r--r--fs/nfs/fscache.h222
-rw-r--r--fs/nfs/getroot.c267
-rw-r--r--fs/nfs/idmap.c847
-rw-r--r--fs/nfs/inode.c1660
-rw-r--r--fs/nfs/internal.h463
-rw-r--r--fs/nfs/iostat.h71
-rw-r--r--fs/nfs/mount_clnt.c518
-rw-r--r--fs/nfs/namespace.c371
-rw-r--r--fs/nfs/nfs2xdr.c1157
-rw-r--r--fs/nfs/nfs3acl.c440
-rw-r--r--fs/nfs/nfs3proc.c891
-rw-r--r--fs/nfs/nfs3xdr.c2498
-rw-r--r--fs/nfs/nfs4_fs.h369
-rw-r--r--fs/nfs/nfs4filelayout.c952
-rw-r--r--fs/nfs/nfs4filelayout.h118
-rw-r--r--fs/nfs/nfs4filelayoutdev.c860
-rw-r--r--fs/nfs/nfs4namespace.c265
-rw-r--r--fs/nfs/nfs4proc.c6336
-rw-r--r--fs/nfs/nfs4renewd.c136
-rw-r--r--fs/nfs/nfs4state.c1850
-rw-r--r--fs/nfs/nfs4xdr.c7125
-rw-r--r--fs/nfs/nfsroot.c309
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c604
-rw-r--r--fs/nfs/objlayout/objlayout.c653
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c415
-rw-r--r--fs/nfs/pagelist.c505
-rw-r--r--fs/nfs/pnfs.c1554
-rw-r--r--fs/nfs/pnfs.h434
-rw-r--r--fs/nfs/pnfs_dev.c276
-rw-r--r--fs/nfs/proc.c747
-rw-r--r--fs/nfs/read.c712
-rw-r--r--fs/nfs/super.c3085
-rw-r--r--fs/nfs/symlink.c78
-rw-r--r--fs/nfs/sysctl.c90
-rw-r--r--fs/nfs/unlink.c597
-rw-r--r--fs/nfs/write.c1758
62 files changed, 53024 insertions, 0 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 00000000000..021d2cf6938
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,134 @@
+config NFS_FS
+ tristate "NFS client support"
+ depends on INET && FILE_LOCKING
+ select LOCKD
+ select SUNRPC
+ select NFS_ACL_SUPPORT if NFS_V3_ACL
+ help
+ Choose Y here if you want to access files residing on other
+ computers using Sun's Network File System protocol. To compile
+ this file system support as a module, choose M here: the module
+ will be called nfs.
+
+ To mount file systems exported by NFS servers, you also need to
+ install the user space mount.nfs command which can be found in
+ the Linux nfs-utils package, available from http://linux-nfs.org/.
+ Information about using the mount command is available in the
+ mount(8) man page. More detail about the Linux NFS client
+ implementation is available via the nfs(5) man page.
+
+ Below you can choose which versions of the NFS protocol are
+ available in the kernel to mount NFS servers. Support for NFS
+ version 2 (RFC 1094) is always available when NFS_FS is selected.
+
+ To configure a system which mounts its root file system via NFS
+ at boot time, say Y here, select "Kernel level IP
+ autoconfiguration" in the NETWORK menu, and select "Root file
+ system on NFS" below. You cannot compile this file system as a
+ module in this case.
+
+ If unsure, say N.
+
+config NFS_V3
+ bool "NFS client support for NFS version 3"
+ depends on NFS_FS
+ help
+ This option enables support for version 3 of the NFS protocol
+ (RFC 1813) in the kernel's NFS client.
+
+ If unsure, say Y.
+
+config NFS_V3_ACL
+ bool "NFS client support for the NFSv3 ACL protocol extension"
+ depends on NFS_V3
+ help
+ Some NFS servers support an auxiliary NFSv3 ACL protocol that
+ Sun added to Solaris but never became an official part of the
+ NFS version 3 protocol. This protocol extension allows
+ applications on NFS clients to manipulate POSIX Access Control
+ Lists on files residing on NFS servers. NFS servers enforce
+ ACLs on local files whether this protocol is available or not.
+
+ Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+ protocol extension and you want your NFS client to allow
+ applications to access and modify ACLs on files on the server.
+
+ Most NFS servers don't support the Solaris NFSv3 ACL protocol
+ extension. You can choose N here or specify the "noacl" mount
+ option to prevent your NFS client from trying to use the NFSv3
+ ACL protocol.
+
+ If unsure, say N.
+
+config NFS_V4
+ bool "NFS client support for NFS version 4"
+ depends on NFS_FS
+ select SUNRPC_GSS
+ help
+ This option enables support for version 4 of the NFS protocol
+ (RFC 3530) in the kernel's NFS client.
+
+ To mount NFS servers using NFSv4, you also need to install user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
+
+ If unsure, say Y.
+
+config NFS_V4_1
+ bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
+ depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+ select SUNRPC_BACKCHANNEL
+ select PNFS_FILE_LAYOUT
+ help
+ This option enables support for minor version 1 of the NFSv4 protocol
+ (RFC 5661) in the kernel's NFS client.
+
+ If unsure, say N.
+
+config PNFS_FILE_LAYOUT
+ tristate
+
+config PNFS_BLOCK
+ tristate
+ depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM
+ default m
+
+config PNFS_OBJLAYOUT
+ tristate
+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+ default m
+
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your system to mount its root file system via NFS,
+ choose Y here. This is common practice for managing systems
+ without local permanent storage. For details, read
+ <file:Documentation/filesystems/nfs/nfsroot.txt>.
+
+ Most people say N here.
+
+config NFS_FSCACHE
+ bool "Provide NFS client caching support"
+ depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+ help
+ Say Y here if you want NFS data to be cached locally on disc through
+ the general filesystem cache manager.
+
+config NFS_USE_LEGACY_DNS
+ bool "Use the legacy NFS DNS resolver"
+ depends on NFS_V4
+ help
+ The kernel now provides a method for translating a host name into an
+ IP address. Select Y here if you would rather use your own DNS
+ resolver script.
+
+ If unsure, say N.
+
+config NFS_USE_KERNEL_DNS
+ bool
+ depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+ select DNS_RESOLVER
+ select KEYS
+ default y
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
new file mode 100644
index 00000000000..b58613d0abb
--- /dev/null
+++ b/fs/nfs/Makefile
@@ -0,0 +1,26 @@
+#
+# Makefile for the Linux nfs filesystem routines.
+#
+
+obj-$(CONFIG_NFS_FS) += nfs.o
+
+nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
+ direct.o pagelist.o proc.o read.o symlink.o unlink.o \
+ write.o namespace.o mount_clnt.o \
+ dns_resolve.o cache_lib.o
+nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
+nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
+nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
+nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
+ delegation.o idmap.o \
+ callback.o callback_xdr.o callback_proc.o \
+ nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
+nfs-$(CONFIG_SYSCTL) += sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 00000000000..d5815505c02
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 00000000000..48cfac31f64
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1084 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/bio.h> /* struct bio */
+#include <linux/buffer_head.h> /* various write calls */
+#include <linux/prefetch.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+/* File-scope state with external linkage.
+ * NOTE(review): neither object is referenced in the part of this file
+ * visible here; presumably they back the device upcall pipe and the
+ * waiters on it -- confirm against the rest of the driver.
+ */
+struct dentry *bl_device_pipe;
+wait_queue_head_t bl_wq;
+
+/* Debug helper: emit the address of @page and a selection of its page
+ * flags through dprintk(), one flag per line.
+ */
+static void print_page(struct page *page)
+{
+ dprintk("PRINTPAGE page %p\n", page);
+ dprintk(" PagePrivate %d\n", PagePrivate(page));
+ dprintk(" PageUptodate %d\n", PageUptodate(page));
+ dprintk(" PageError %d\n", PageError(page));
+ dprintk(" PageDirty %d\n", PageDirty(page));
+ dprintk(" PageReferenced %d\n", PageReferenced(page));
+ dprintk(" PageLocked %d\n", PageLocked(page));
+ dprintk(" PageWriteback %d\n", PageWriteback(page));
+ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
+ dprintk("\n");
+}
+
+/* Decide whether the page at sector @isect, which lies in extent @be,
+ * has no backing data and therefore must be initialized by the caller.
+ *
+ * Returns 1 for a NONE_DATA extent, 0 for any extent that is neither
+ * NONE_DATA nor INVALID_DATA, and for an INVALID_DATA extent 1 only when
+ * the sector has not yet been marked initialized in be->be_inval.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+	switch (be->be_state) {
+	case PNFS_BLOCK_NONE_DATA:
+		return 1;
+	case PNFS_BLOCK_INVALID_DATA:
+		return !bl_is_sector_init(be->be_inval, isect);
+	default:
+		return 0;
+	}
+}
+
+/* Report whether page data covered by extent @be may be written to disk.
+ * Only READWRITE_DATA and INVALID_DATA extents qualify.  @isect is not
+ * consulted.
+ *
+ * Returns 1 when writable, 0 otherwise.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+	switch (be->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+	case PNFS_BLOCK_INVALID_DATA:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* The data we are handed might be spread across several bios. We need
+ * to track when the last one is finished.
+ *
+ * One parallel_io is shared by all bios of a single read or write call.
+ * Each submitted bio holds a reference; when the last reference is
+ * dropped, destroy_parallel() invokes pnfs_callback and frees the object.
+ */
+struct parallel_io {
+ /* Reference count; initialized by alloc_parallel() */
+ struct kref refcnt;
+ /* Completion hook, called as pnfs_callback(data, bse_count); set by
+  * the caller after alloc_parallel(), which leaves it uninitialized */
+ void (*pnfs_callback) (void *data, int num_se);
+ /* Opaque argument for pnfs_callback (the nfs_read_data/nfs_write_data) */
+ void *data;
+ /* Number of short extents pushed on behalf of this I/O (write path) */
+ int bse_count;
+};
+
+/* Allocate a parallel_io tracker bound to @data.
+ *
+ * The reference count is initialized and bse_count is zeroed.  The
+ * pnfs_callback member is NOT set here; the caller must assign it before
+ * the last reference can be dropped.
+ *
+ * Returns the new tracker, or NULL if the GFP_NOFS allocation fails.
+ */
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+	struct parallel_io *par = kmalloc(sizeof(*par), GFP_NOFS);
+
+	if (!par)
+		return NULL;
+
+	par->data = data;
+	kref_init(&par->refcnt);
+	par->bse_count = 0;
+	return par;
+}
+
+/* Take an additional reference on tracker @p. */
+static inline void get_parallel(struct parallel_io *p)
+{
+ kref_get(&p->refcnt);
+}
+
+/* kref release function for struct parallel_io: runs the completion
+ * callback with the stored data pointer and short-extent count, then
+ * frees the tracker.
+ */
+static void destroy_parallel(struct kref *kref)
+{
+ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+ dprintk("%s enter\n", __func__);
+ p->pnfs_callback(p->data, p->bse_count);
+ kfree(p);
+}
+
+/* Drop one reference on tracker @p; the final put triggers
+ * destroy_parallel(), which calls the completion callback.
+ */
+static inline void put_parallel(struct parallel_io *p)
+{
+ kref_put(&p->refcnt, destroy_parallel);
+}
+
+/* Submit @bio, if there is one, for a @rw (READ or WRITE) transfer.
+ *
+ * A reference on the parallel_io stored in bio->bi_private is taken
+ * before submission.  A NULL @bio is accepted and ignored.
+ *
+ * Always returns NULL so that callers can write
+ * "bio = bl_submit_bio(rw, bio)" to reset their pointer.
+ */
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+	if (!bio)
+		return NULL;
+
+	get_parallel(bio->bi_private);
+	dprintk("%s submitting %s bio %u@%llu\n", __func__,
+		rw == READ ? "read" : "write",
+		bio->bi_size, (unsigned long long)bio->bi_sector);
+	submit_bio(rw, bio);
+
+	return NULL;
+}
+
+/* Allocate a bio with room for up to @npg pages (capped at
+ * BIO_MAX_PAGES) and point it at the device location that corresponds
+ * to file sector @isect within extent @be.
+ *
+ * If the first allocation fails and the task has PF_MEMALLOC set, the
+ * request is retried with the page count halved each time until it
+ * succeeds or the count reaches zero.
+ *
+ * @end_io and @par are stored as the bio's completion handler and
+ * private data.  Returns the bio, or NULL if allocation failed.
+ */
+static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
+				     struct pnfs_block_extent *be,
+				     void (*end_io)(struct bio *, int err),
+				     struct parallel_io *par)
+{
+	struct bio *bio;
+
+	npg = min(npg, BIO_MAX_PAGES);
+	bio = bio_alloc(GFP_NOIO, npg);
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		/* Fall back to progressively smaller bios */
+		for (npg /= 2; npg && !bio; npg /= 2)
+			bio = bio_alloc(GFP_NOIO, npg);
+	}
+	if (!bio)
+		return NULL;
+
+	/* Translate the file sector into a sector on the backing device */
+	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = end_io;
+	bio->bi_private = par;
+	return bio;
+}
+
+/* Append one full page to @bio, allocating a bio first when @bio is
+ * NULL.  When the current bio cannot take the page, it is submitted and
+ * the page is added to a freshly allocated bio instead.
+ *
+ * @npg is the page-count hint for a new bio; @isect, @be, @end_io and
+ * @par are forwarded to bl_alloc_init_bio().
+ *
+ * Returns the bio now holding the page, or ERR_PTR(-ENOMEM) if a bio
+ * could not be allocated.
+ */
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+				      sector_t isect, struct page *page,
+				      struct pnfs_block_extent *be,
+				      void (*end_io)(struct bio *, int err),
+				      struct parallel_io *par)
+{
+	for (;;) {
+		if (!bio) {
+			bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+			if (!bio)
+				return ERR_PTR(-ENOMEM);
+		}
+		if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) >=
+		    PAGE_CACHE_SIZE)
+			return bio;
+
+		/* Current bio is full: send it off and start a new one */
+		bio = bl_submit_bio(rw, bio);
+	}
+}
+
+/* This is basically copied from mpage_end_io_read
+ *
+ * bio completion handler for reads.  Walks the bio's pages from last to
+ * first and marks each one uptodate when the bio succeeded.  On failure,
+ * records -EIO in rdata->pnfs_error (unless an error is already set) and
+ * flags the layout segment as failed.  Finally drops the bio and the
+ * reference on the parallel_io that was taken at submission.
+ * The @err argument is not used; success is read from BIO_UPTODATE.
+ */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+ struct parallel_io *par = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ /* Step to the previous vector and prefetch its page flags */
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ if (uptodate)
+ SetPageUptodate(page);
+ } while (bvec >= bio->bi_io_vec);
+ if (!uptodate) {
+ if (!rdata->pnfs_error)
+ rdata->pnfs_error = -EIO;
+ pnfs_set_lo_fail(rdata->lseg);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
+/* Work function queued by bl_end_par_io_read(): recovers the
+ * nfs_read_data that embeds @work (via its rpc_task) and hands it to
+ * pnfs_ld_read_done().
+ */
+static void bl_read_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+ pnfs_ld_read_done(rdata);
+}
+
+/* parallel_io completion callback for reads, run when the last reference
+ * on the tracker is dropped.  Copies pnfs_error into the task status and
+ * defers the rest of the completion to bl_read_cleanup() on a workqueue.
+ * The second argument (short-extent count) is not used for reads.
+ */
+static void
+bl_end_par_io_read(void *data, int unused)
+{
+ struct nfs_read_data *rdata = data;
+
+ rdata->task.tk_status = rdata->pnfs_error;
+ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&rdata->task.u.tk_work);
+}
+
+/* Read the pages described by @rdata directly from the block devices
+ * named by the layout's extents.
+ *
+ * For each page the covering extent is looked up; pages that fall in a
+ * hole with no copy-on-write source are zero-filled in memory, all other
+ * pages are queued in bios (read from the COW extent when the page is a
+ * hole and such an extent exists).
+ *
+ * Returns PNFS_NOT_ATTEMPTED only when the parallel_io tracker cannot be
+ * allocated.  Otherwise returns PNFS_ATTEMPTED; failures are reported in
+ * rdata->pnfs_error and completion is delivered through
+ * bl_end_par_io_read() once the last tracker reference is dropped.
+ */
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata)
+{
+ int i, hole;
+ struct bio *bio = NULL;
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par;
+ loff_t f_offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct page **pages = rdata->args.pages;
+ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+ rdata->npages, f_offset, count);
+
+ par = alloc_parallel(rdata);
+ if (!par)
+ goto use_mds;
+ par->pnfs_callback = bl_end_par_io_read;
+ /* At this point, we can no longer jump to use_mds */
+
+ isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+ /* Code assumes extents are page-aligned */
+ for (i = pg_index; i < rdata->npages; i++) {
+ if (!extent_length) {
+ /* We've used up the previous extent */
+ bl_put_extent(be);
+ bl_put_extent(cow_read);
+ bio = bl_submit_bio(READ, bio);
+ /* Get the next one */
+ be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+ isect, &cow_read);
+ if (!be) {
+ rdata->pnfs_error = -EIO;
+ goto out;
+ }
+ /* Sectors left in this extent starting at isect */
+ extent_length = be->be_length -
+ (isect - be->be_f_offset);
+ if (cow_read) {
+ /* Do not run past the end of the COW extent either */
+ sector_t cow_length = cow_read->be_length -
+ (isect - cow_read->be_f_offset);
+ extent_length = min(extent_length, cow_length);
+ }
+ }
+ hole = is_hole(be, isect);
+ if (hole && !cow_read) {
+ bio = bl_submit_bio(READ, bio);
+ /* Fill hole w/ zeroes w/o accessing device */
+ dprintk("%s Zeroing page for hole\n", __func__);
+ zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ print_page(pages[i]);
+ SetPageUptodate(pages[i]);
+ } else {
+ struct pnfs_block_extent *be_read;
+
+ be_read = (hole && cow_read) ? cow_read : be;
+ bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+ isect, pages[i], be_read,
+ bl_end_io_read, par);
+ if (IS_ERR(bio)) {
+ rdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+ }
+ isect += PAGE_CACHE_SECTORS;
+ extent_length -= PAGE_CACHE_SECTORS;
+ }
+ /* Clamp the reported byte count at end of file */
+ if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+ rdata->res.eof = 1;
+ rdata->res.count = rdata->inode->i_size - f_offset;
+ } else {
+ rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+ }
+out:
+ bl_put_extent(be);
+ bl_put_extent(cow_read);
+ bl_submit_bio(READ, bio);
+ /* Drop the initial reference; completion fires once all bios finish */
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+
+ use_mds:
+ dprintk("Giving up and using normal NFS\n");
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* Record that the byte range [@offset, @offset + @count) of layout @bl
+ * has been written.
+ *
+ * The range is rounded outward to page boundaries and converted to
+ * sectors.  For every extent overlapping it that is in INVALID_DATA
+ * state, one short extent is popped from the extent's be_inval and
+ * passed to bl_mark_for_commit() together with the overlapping range.
+ * Extents in other states are skipped.
+ *
+ * BUGs if no extent covers a sector in the range or if no short extent
+ * is available.  A @count of zero is a no-op.
+ */
+static void mark_extents_written(struct pnfs_block_layout *bl,
+ __u64 offset, __u32 count)
+{
+ sector_t isect, end;
+ struct pnfs_block_extent *be;
+ struct pnfs_block_short_extent *se;
+
+ dprintk("%s(%llu, %u)\n", __func__, offset, count);
+ if (count == 0)
+ return;
+ isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
+ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
+ end >>= SECTOR_SHIFT;
+ while (isect < end) {
+ sector_t len;
+ be = bl_find_get_extent(bl, isect, NULL);
+ BUG_ON(!be); /* FIXME */
+ /* Portion of [isect, end) that lies inside this extent */
+ len = min(end, be->be_f_offset + be->be_length) - isect;
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ se = bl_pop_one_short_extent(be->be_inval);
+ BUG_ON(!se);
+ bl_mark_for_commit(be, isect, len, se);
+ }
+ isect += len;
+ bl_put_extent(be);
+ }
+}
+
+/* bio completion handler for the zeroing pages that bl_write_pagelist()
+ * adds around the caller's data.  For every page in the bio it ends
+ * writeback and drops the page reference.  On failure it records -EIO in
+ * wdata->pnfs_error (unless already set) and marks the layout segment as
+ * failed.  Finally drops the bio and the tracker reference taken at
+ * submission.  @err is not used; success is read from BIO_UPTODATE.
+ */
+static void bl_end_io_write_zero(struct bio *bio, int err)
+{
+ struct parallel_io *par = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ /* This is the zeroing page we added */
+ end_page_writeback(page);
+ page_cache_release(page);
+ } while (bvec >= bio->bi_io_vec);
+
+ if (unlikely(!uptodate)) {
+ if (!wdata->pnfs_error)
+ wdata->pnfs_error = -EIO;
+ pnfs_set_lo_fail(wdata->lseg);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
+/* bio completion handler for the caller's data pages.  The pages
+ * themselves are not touched here.  On failure it records -EIO in
+ * wdata->pnfs_error (unless already set) and marks the layout segment as
+ * failed; then it drops the bio and the tracker reference taken at
+ * submission.  @err is not used; success is read from BIO_UPTODATE.
+ */
+static void bl_end_io_write(struct bio *bio, int err)
+{
+ struct parallel_io *par = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+
+ if (!uptodate) {
+ if (!wdata->pnfs_error)
+ wdata->pnfs_error = -EIO;
+ pnfs_set_lo_fail(wdata->lseg);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
+/* Work function scheduled by bl_end_par_io_write().
+ * Recovers the nfs_write_data that embeds @work (via its rpc_task).  If
+ * the write finished without error, it marks the written range's extents
+ * for LAYOUTCOMMIT.  In every case it then calls pnfs_ld_write_done().
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+ if (likely(!wdata->pnfs_error)) {
+ /* Marks for LAYOUTCOMMIT */
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+ wdata->args.offset, wdata->args.count);
+ }
+ pnfs_ld_write_done(wdata);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes
+ *
+ * @data is the nfs_write_data; @num_se is the number of short extents
+ * that were pushed for this write.  On error those short extents are
+ * freed again.  The task status is set from pnfs_error, the verifier is
+ * marked NFS_FILE_SYNC, and the remaining completion work is deferred to
+ * bl_write_cleanup() on a workqueue.
+ */
+static void bl_end_par_io_write(void *data, int num_se)
+{
+ struct nfs_write_data *wdata = data;
+
+ if (unlikely(wdata->pnfs_error)) {
+ bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+ num_se);
+ }
+
+ wdata->task.tk_status = wdata->pnfs_error;
+ wdata->verf.committed = NFS_FILE_SYNC;
+ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
+ schedule_work(&wdata->task.u.tk_work);
+}
+
+/* FIXME STUB - should mark the intersection of layout and page as bad so
+ * that it is not used again.  Currently does nothing.
+ */
+static void mark_bad_read(void)
+{
+}
+
+/*
+ * map_block: map a requested I/O block (isect) into an offset in the LVM
+ * block_device
+ *
+ * Marks @bh as mapped, points it at the extent's device and computes its
+ * block number from the device sector (isect translated through the
+ * extent's file and volume offsets), scaled from sectors to the device
+ * inode's block size.
+ */
+static void
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
+{
+ dprintk("%s enter be=%p\n", __func__, be);
+
+ set_buffer_mapped(bh);
+ bh->b_bdev = be->be_mdev;
+ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+ (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
+
+ dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
+ __func__, (unsigned long long)isect, (long)bh->b_blocknr,
+ bh->b_size);
+ return;
+}
+
+/* Given an unmapped page, zero it or read in page for COW, page is locked
+ * by caller.
+ *
+ * With no @cow_read extent the page is zero-filled and marked uptodate.
+ * Otherwise a temporary buffer head is attached to the page, mapped onto
+ * @cow_read and read synchronously; the page is marked uptodate only if
+ * the read succeeds.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer head cannot be allocated,
+ * or the error from bh_submit_read().  BUGs if the page is already
+ * uptodate.
+ *
+ * Note that this function drops a reference on @cow_read on every path,
+ * including the error paths.
+ */
+static int
+init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+{
+ struct buffer_head *bh = NULL;
+ int ret = 0;
+ sector_t isect;
+
+ dprintk("%s enter, %p\n", __func__, page);
+ BUG_ON(PageUptodate(page));
+ if (!cow_read) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ goto cleanup;
+ }
+
+ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+ if (!bh) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
+ /* File sector of the first byte of this page */
+ isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
+ map_block(bh, isect, cow_read);
+ if (!bh_uptodate_or_lock(bh))
+ ret = bh_submit_read(bh);
+ if (ret)
+ goto cleanup;
+ SetPageUptodate(page);
+
+cleanup:
+ bl_put_extent(cow_read);
+ if (bh)
+ free_buffer_head(bh);
+ if (ret) {
+ /* Need to mark layout with bad read...should now
+ * just use nfs4 for reads and writes.
+ */
+ mark_bad_read();
+ }
+ return ret;
+}
+
+/* Find or create the page cache page at @index of @inode and prepare it
+ * to be written out as a zeroing (padding) page.
+ *
+ * Returns:
+ *   - the page, with writeback set, unlocked, and a reference held, when
+ *     the caller should write it out;
+ *   - NULL when the page is dirty or already under writeback, i.e. some
+ *     other path writes it and the caller should skip it;
+ *   - ERR_PTR(-ENOMEM) if the page cannot be allocated, or ERR_PTR(err)
+ *     if a page that was not uptodate could not be initialized (zeroed,
+ *     or read from @cow_read).
+ *
+ * A page whose initialization failed must never be written out: its
+ * contents are undefined and would overwrite valid data on disk.
+ */
+static struct page *
+bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
+			 struct pnfs_block_extent *cow_read)
+{
+	struct page *page;
+	int locked = 0;
+	int ret;
+
+	page = find_get_page(inode->i_mapping, index);
+	if (page)
+		goto check_page;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (unlikely(!page)) {
+		dprintk("%s oom\n", __func__);
+		return ERR_PTR(-ENOMEM);
+	}
+	/* find_or_create_page() returns the page locked */
+	locked = 1;
+
+check_page:
+	/* PageDirty: Other will write this out
+	 * PageWriteback: Other is writing this out
+	 * PageUptodate: It was read before
+	 */
+	if (PageDirty(page) || PageWriteback(page)) {
+		print_page(page);
+		if (locked)
+			unlock_page(page);
+		page_cache_release(page);
+		return NULL;
+	}
+
+	if (!locked) {
+		/* Re-check the flags now that we hold the page lock */
+		lock_page(page);
+		locked = 1;
+		goto check_page;
+	}
+	if (!PageUptodate(page)) {
+		/* New page, readin or zero it */
+		ret = init_page_for_write(page, cow_read);
+		if (unlikely(ret)) {
+			dprintk("%s init page failed with err %d\n",
+				__func__, ret);
+			unlock_page(page);
+			page_cache_release(page);
+			return ERR_PTR(ret);
+		}
+	}
+	set_page_writeback(page);
+	unlock_page(page);
+
+	return page;
+}
+
+/* Write the pages described by @wdata directly to the block devices
+ * named by the layout's extents.  @sync is not used in this function.
+ *
+ * The work is done in three phases:
+ *   1. If the first page lies in an INVALID_DATA extent, the pages that
+ *      precede it in the same server block are zero-filled (or COW-read)
+ *      and written out ("fill_invalid_ext", entered with last == 0).
+ *   2. The caller's pages are queued in bios, moving to the next extent
+ *      whenever the current one is exhausted.
+ *   3. If the final extent is INVALID_DATA and the data does not end on
+ *      a block boundary, the same fill_invalid_ext loop is re-entered
+ *      with last == 1 to pad the remainder of the block.
+ *
+ * Returns PNFS_NOT_ATTEMPTED when the tracker cannot be allocated, when
+ * no writable extent covers the first sector, or when the first short
+ * extent cannot be reserved; in that case the tracker is freed directly
+ * and no completion callback runs.  Otherwise returns PNFS_ATTEMPTED,
+ * with failures reported in wdata->pnfs_error and completion delivered
+ * through bl_end_par_io_write().
+ *
+ * NOTE(review): cow_read is never released in this function; the only
+ * visible release is inside init_page_for_write(), once per zeroing page
+ * it initializes.  The reference count looks unbalanced when zero or
+ * more than one page is initialized -- confirm against
+ * bl_find_get_extent().
+ */
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_write_data *wdata, int sync)
+{
+ int i, ret, npg_zero, pg_index, last = 0;
+ struct bio *bio = NULL;
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ sector_t isect, last_isect = 0, extent_length = 0;
+ struct parallel_io *par;
+ loff_t offset = wdata->args.offset;
+ size_t count = wdata->args.count;
+ struct page **pages = wdata->args.pages;
+ struct page *page;
+ pgoff_t index;
+ u64 temp;
+ int npg_per_block =
+ NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
+
+ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+ /* At this point, wdata->pages is a (sequential) list of nfs_pages.
+ * We want to write each, and if there is an error set pnfs_error
+ * to have it redone using nfs.
+ */
+ par = alloc_parallel(wdata);
+ if (!par)
+ goto out_mds;
+ par->pnfs_callback = bl_end_par_io_write;
+ /* At this point, have to be more careful with error handling */
+
+ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+ be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+ if (!be || !is_writable(be, isect)) {
+ dprintk("%s no matching extents!\n", __func__);
+ goto out_mds;
+ }
+
+ /* First page inside INVALID extent */
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (likely(!bl_push_one_short_extent(be->be_inval)))
+ par->bse_count++;
+ else
+ goto out_mds;
+ /* Number of pages between the block start and the first data page */
+ temp = offset >> PAGE_CACHE_SHIFT;
+ npg_zero = do_div(temp, npg_per_block);
+ /* Rewind isect to the start of the enclosing block */
+ isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
+ (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+ extent_length = be->be_length - (isect - be->be_f_offset);
+
+fill_invalid_ext:
+ dprintk("%s need to zero %d pages\n", __func__, npg_zero);
+ for (;npg_zero > 0; npg_zero--) {
+ if (bl_is_sector_init(be->be_inval, isect)) {
+ dprintk("isect %llu already init\n",
+ (unsigned long long)isect);
+ goto next_page;
+ }
+ /* page ref released in bl_end_io_write_zero */
+ index = isect >> PAGE_CACHE_SECTOR_SHIFT;
+ dprintk("%s zero %dth page: index %lu isect %llu\n",
+ __func__, npg_zero, index,
+ (unsigned long long)isect);
+ page = bl_find_get_zeroing_page(wdata->inode, index,
+ cow_read);
+ if (unlikely(IS_ERR(page))) {
+ wdata->pnfs_error = PTR_ERR(page);
+ goto out;
+ } else if (page == NULL)
+ goto next_page;
+
+ ret = bl_mark_sectors_init(be->be_inval, isect,
+ PAGE_CACHE_SECTORS);
+ if (unlikely(ret)) {
+ dprintk("%s bl_mark_sectors_init fail %d\n",
+ __func__, ret);
+ end_page_writeback(page);
+ page_cache_release(page);
+ wdata->pnfs_error = ret;
+ goto out;
+ }
+ if (likely(!bl_push_one_short_extent(be->be_inval)))
+ par->bse_count++;
+ else {
+ end_page_writeback(page);
+ page_cache_release(page);
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+ /* FIXME: This should be done in bi_end_io */
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+ page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE);
+
+ bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
+ isect, page, be,
+ bl_end_io_write_zero, par);
+ if (IS_ERR(bio)) {
+ wdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+next_page:
+ isect += PAGE_CACHE_SECTORS;
+ extent_length -= PAGE_CACHE_SECTORS;
+ }
+ /* Second pass (trailing pad) is complete */
+ if (last)
+ goto write_done;
+ }
+ bio = bl_submit_bio(WRITE, bio);
+
+ /* Middle pages */
+ pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+ for (i = pg_index; i < wdata->npages; i++) {
+ if (!extent_length) {
+ /* We've used up the previous extent */
+ bl_put_extent(be);
+ bio = bl_submit_bio(WRITE, bio);
+ /* Get the next one */
+ be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+ isect, NULL);
+ if (!be || !is_writable(be, isect)) {
+ wdata->pnfs_error = -EINVAL;
+ goto out;
+ }
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (likely(!bl_push_one_short_extent(
+ be->be_inval)))
+ par->bse_count++;
+ else {
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+ }
+ extent_length = be->be_length -
+ (isect - be->be_f_offset);
+ }
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ ret = bl_mark_sectors_init(be->be_inval, isect,
+ PAGE_CACHE_SECTORS);
+ if (unlikely(ret)) {
+ dprintk("%s bl_mark_sectors_init fail %d\n",
+ __func__, ret);
+ wdata->pnfs_error = ret;
+ goto out;
+ }
+ }
+ bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+ isect, pages[i], be,
+ bl_end_io_write, par);
+ if (IS_ERR(bio)) {
+ wdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
+ goto out;
+ }
+ isect += PAGE_CACHE_SECTORS;
+ last_isect = isect;
+ extent_length -= PAGE_CACHE_SECTORS;
+ }
+
+ /* Last page inside INVALID extent */
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ bio = bl_submit_bio(WRITE, bio);
+ /* Pages remaining until the end of the current block */
+ temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
+ npg_zero = npg_per_block - do_div(temp, npg_per_block);
+ if (npg_zero < npg_per_block) {
+ last = 1;
+ goto fill_invalid_ext;
+ }
+ }
+
+write_done:
+ /* Bytes covered by data pages, capped at the requested count */
+ wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
+ if (count < wdata->res.count) {
+ wdata->res.count = count;
+ }
+out:
+ bl_put_extent(be);
+ bl_submit_bio(WRITE, bio);
+ /* Drop the initial reference; completion fires once all bios finish */
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+out_mds:
+ bl_put_extent(be);
+ kfree(par);
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* FIXME - range ignored
+ *
+ * Unlink every extent from all EXTENT_LISTS lists of @bl and drop the
+ * list's reference on each, under bl_ext_lock.  @range is currently not
+ * consulted: all extents are released regardless of it.
+ */
+static void
+release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
+{
+ int i;
+ struct pnfs_block_extent *be;
+
+ spin_lock(&bl->bl_ext_lock);
+ for (i = 0; i < EXTENT_LISTS; i++) {
+ while (!list_empty(&bl->bl_extents[i])) {
+ be = list_first_entry(&bl->bl_extents[i],
+ struct pnfs_block_extent,
+ be_node);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ }
+ }
+ spin_unlock(&bl->bl_ext_lock);
+}
+
+/* Free everything hanging off @marks: each tracking node on the
+ * im_tree.mtt_stub list and each short extent on the im_extents list.
+ * No lock is taken here.
+ */
+static void
+release_inval_marks(struct pnfs_inval_markings *marks)
+{
+ struct pnfs_inval_tracking *pos, *temp;
+ struct pnfs_block_short_extent *se, *stemp;
+
+ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
+ list_del(&pos->it_link);
+ kfree(pos);
+ }
+
+ list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
+ list_del(&se->bse_node);
+ kfree(se);
+ }
+ return;
+}
+
+/* Tear down the block layout that embeds @lo: release all extents and
+ * invalid-data markings, then free the layout structure itself.
+ */
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+
+ dprintk("%s enter\n", __func__);
+ release_extents(bl, NULL);
+ release_inval_marks(&bl->bl_inval);
+ kfree(bl);
+}
+
+/* Allocate and initialize a zeroed block layout for @inode using
+ * @gfp_flags.  The extent lists, commit lists and lock are initialized,
+ * and the block size is taken from the server's pnfs_blksize converted
+ * from bytes to sectors.
+ *
+ * Returns a pointer to the embedded generic layout header, or NULL if
+ * the allocation fails.
+ */
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ struct pnfs_block_layout *bl;
+
+ dprintk("%s enter\n", __func__);
+ bl = kzalloc(sizeof(*bl), gfp_flags);
+ if (!bl)
+ return NULL;
+ spin_lock_init(&bl->bl_ext_lock);
+ INIT_LIST_HEAD(&bl->bl_extents[0]);
+ INIT_LIST_HEAD(&bl->bl_extents[1]);
+ INIT_LIST_HEAD(&bl->bl_commit);
+ INIT_LIST_HEAD(&bl->bl_committing);
+ bl->bl_count = 0;
+ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
+ BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+ return &bl->bl_layout;
+}
+
+/* Free a layout segment allocated by bl_alloc_lseg().  Only the segment
+ * structure is freed; extents are not touched here.
+ */
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ dprintk("%s enter\n", __func__);
+ kfree(lseg);
+}
+
+/* Allocate a layout segment for a LAYOUTGET reply.
+ *
+ * The segment itself carries no block-layout data: the reply in @lgr is
+ * processed by nfs4_blk_process_layoutget(), which stores the result
+ * layout wide on @lo so that extents can be merged correctly.
+ *
+ * Returns the new zeroed segment, ERR_PTR(-ENOMEM) if it cannot be
+ * allocated, or ERR_PTR(status) if processing the reply fails.
+ */
+static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+						 struct nfs4_layoutget_res *lgr,
+						 gfp_t gfp_flags)
+{
+	struct pnfs_layout_segment *seg;
+	int err;
+
+	dprintk("%s enter\n", __func__);
+	seg = kzalloc(sizeof(*seg), gfp_flags);
+	if (!seg)
+		return ERR_PTR(-ENOMEM);
+
+	err = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+	if (!err)
+		return seg;
+
+	/* A plain kfree suffices rather than the full-blown bl_free_lseg,
+	 * since on error extents were not touched.
+	 */
+	kfree(seg);
+	return ERR_PTR(err);
+}
+
+/* Layout-driver hook: encode the block layout's LAYOUTCOMMIT update for
+ * @lo into @xdr by delegating to encode_pnfs_block_layoutupdate().
+ */
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg)
+{
+ dprintk("%s enter\n", __func__);
+ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+}
+
+/* Layout-driver hook run after a LAYOUTCOMMIT: looks up the layout of
+ * the inode named in @lcdata and passes it, the call arguments and the
+ * result status to clean_pnfs_block_layoutupdate().
+ */
+static void
+bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
+{
+ struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
+
+ dprintk("%s enter\n", __func__);
+ clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
+}
+
+/* Release a block mount id: free every block device on its bm_devlist
+ * and then the structure itself.  A NULL @mid is accepted and ignored.
+ */
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+	struct pnfs_block_dev *cur, *next;
+
+	if (!mid)
+		return;
+
+	/* No need to take bm_lock as we are last user freeing bm_devlist */
+	list_for_each_entry_safe(cur, next, &mid->bm_devlist, bm_node) {
+		list_del(&cur->bm_node);
+		bl_free_block_dev(cur);
+	}
+	kfree(mid);
+}
+
+/* Issue GETDEVICEINFO for d_id and decode the reply; returns a pnfs_block_dev
+ * or an ERR_PTR().  Mostly copied from the filelayout's get_device_info
+ * function; much of this should be at the generic pnfs level. */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+			struct nfs4_deviceid *d_id)
+{
+	struct pnfs_device *dev;
+	struct pnfs_block_dev *rv;
+	u32 max_resp_sz;
+	int max_pages;
+	struct page **pages = NULL;
+	int i, rc;
+
+	/* Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+	dprintk("%s max_resp_sz %u max_pages %d\n",
+		__func__, max_resp_sz, max_pages);
+
+	dev = kmalloc(sizeof(*dev), GFP_NOFS);
+	if (!dev) {
+		dprintk("%s kmalloc failed\n", __func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
+	if (pages == NULL) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_NOFS);
+		if (!pages[i]) {
+			rv = ERR_PTR(-ENOMEM);
+			goto out_free;
+		}
+	}
+
+	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+	dev->layout_type = LAYOUT_BLOCK_VOLUME;
+	dev->pages = pages;
+	dev->pgbase = 0;
+	dev->pglen = PAGE_SIZE * max_pages;
+	dev->mincount = 0;
+
+	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+	rc = nfs4_proc_getdeviceinfo(server, dev);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc) {
+		rv = ERR_PTR(rc);
+		goto out_free;
+	}
+
+	rv = nfs4_blk_decode_device(server, dev);
+ out_free:
+	/* pages[] is zeroed and filled in order: stop at the first empty slot */
+	for (i = 0; i < max_pages && pages[i]; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+	kfree(dev);
+	return rv;
+}
+
+/* Build the per-mount device list by walking GETDEVICELIST/GETDEVICEINFO */
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+	struct block_mount_id *mount_id = NULL;
+	struct pnfs_devicelist *devlist = NULL;
+	struct pnfs_block_dev *blkdev;
+	int status, i;
+
+	dprintk("%s enter\n", __func__);
+
+	if (server->pnfs_blksize == 0) {
+		dprintk("%s Server did not return blksize\n", __func__);
+		return -EINVAL;
+	}
+	mount_id = kzalloc(sizeof(*mount_id), GFP_NOFS);
+	if (mount_id == NULL) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	/* Initialize nfs4 block layout mount id */
+	spin_lock_init(&mount_id->bm_lock);
+	INIT_LIST_HEAD(&mount_id->bm_devlist);
+
+	devlist = kmalloc(sizeof(*devlist), GFP_NOFS);
+	if (devlist == NULL) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	/* Keep asking until the server flags the end of the device list */
+	for (devlist->eof = 0; !devlist->eof; ) {
+		status = nfs4_proc_getdevicelist(server, fh, devlist);
+		if (status != 0)
+			goto out_error;
+		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+			__func__, devlist->num_devs, devlist->eof);
+		for (i = 0; i < devlist->num_devs; i++) {
+			blkdev = nfs4_blk_get_deviceinfo(server, fh,
+							 &devlist->dev_id[i]);
+			if (IS_ERR(blkdev)) {
+				status = PTR_ERR(blkdev);
+				goto out_error;
+			}
+			spin_lock(&mount_id->bm_lock);
+			list_add(&blkdev->bm_node, &mount_id->bm_devlist);
+			spin_unlock(&mount_id->bm_lock);
+		}
+	}
+	dprintk("%s SUCCESS\n", __func__);
+	server->pnfs_ld_data = mount_id;
+	goto out_return;
+
+ out_error:
+	/* Drops every device gathered so far along with the mount id */
+	free_blk_mountid(mount_id);
+ out_return:
+	kfree(devlist);
+	return status;
+}
+
+/* Free the block_mount_id that bl_set_layoutdriver stored in
+ * server->pnfs_ld_data, together with every device on its list. */
+static int
+bl_clear_layoutdriver(struct nfs_server *server)
+{
+	dprintk("%s enter\n", __func__);
+	free_blk_mountid(server->pnfs_ld_data);
+	dprintk("%s RETURNS\n", __func__);
+	return 0;
+}
+
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+ .pg_init = pnfs_generic_pg_init_read,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+};
+
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+ .pg_init = pnfs_generic_pg_init_write,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+};
+
+static struct pnfs_layoutdriver_type blocklayout_type = {
+ .id = LAYOUT_BLOCK_VOLUME,
+ .name = "LAYOUT_BLOCK_VOLUME",
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = bl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .encode_layoutcommit = bl_encode_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .clear_layoutdriver = bl_clear_layoutdriver,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+};
+
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+/* Module init: register the layout driver, then create the "blocklayout"
+ * pipe under NFS_PIPE_DIRNAME in rpc_pipefs.  Returns 0 or a negative errno. */
+static int __init nfs4blocklayout_init(void)
+{
+	struct vfsmount *rpc_mnt;
+	struct path pipe_dir;
+	int err;
+
+	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+	err = pnfs_register_layoutdriver(&blocklayout_type);
+	if (err != 0)
+		return err;
+
+	init_waitqueue_head(&bl_wq);
+
+	rpc_mnt = rpc_get_mount();
+	if (IS_ERR(rpc_mnt)) {
+		err = PTR_ERR(rpc_mnt);
+		goto out_remove;
+	}
+
+	err = vfs_path_lookup(rpc_mnt->mnt_root,
+			      rpc_mnt,
+			      NFS_PIPE_DIRNAME, 0, &pipe_dir);
+	if (err != 0)
+		goto out_putrpc;
+
+	bl_device_pipe = rpc_mkpipe(pipe_dir.dentry, "blocklayout", NULL,
+				    &bl_upcall_ops, 0);
+	path_put(&pipe_dir);
+	if (!IS_ERR(bl_device_pipe))
+		return 0;
+	err = PTR_ERR(bl_device_pipe);
+
+	/* Unwind in reverse order of setup */
+out_putrpc:
+	rpc_put_mount();
+out_remove:
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+	return err;
+}
+
+static void __exit nfs4blocklayout_exit(void)
+{
+ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+ __func__);
+
+ pnfs_unregister_layoutdriver(&blocklayout_type); /* undo the registration done at init */
+ rpc_unlink(bl_device_pipe); /* remove the "blocklayout" pipe created at init */
+ rpc_put_mount(); /* drop the rpc_pipefs mount reference taken at init */
+}
+
+MODULE_ALIAS("nfs-layouttype4-3");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 00000000000..e31a2df28e7
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,211 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "../pnfs.h"
+
+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+
+struct block_mount_id {
+ spinlock_t bm_lock; /* protects list */
+ struct list_head bm_devlist; /* holds pnfs_block_dev */
+};
+
+struct pnfs_block_dev {
+ struct list_head bm_node;
+ struct nfs4_deviceid bm_mdevid; /* associated devid */
+ struct block_device *bm_mdev; /* meta device itself */
+};
+
+enum exstate4 {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
+};
+
+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+
+struct my_tree {
+ sector_t mtt_step_size; /* Internal sector alignment */
+ struct list_head mtt_stub; /* Should be a radix tree */
+};
+
+struct pnfs_inval_markings {
+ spinlock_t im_lock;
+ struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
+ sector_t im_block_size; /* Server blocksize in sectors */
+ struct list_head im_extents; /* Short extents for INVAL->RW conversion */
+};
+
+struct pnfs_inval_tracking {
+ struct list_head it_link;
+ int it_sector;
+ int it_tags;
+};
+
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+ struct kref be_refcnt;
+ struct list_head be_node; /* link into lseg list */
+ struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
+ struct block_device *be_mdev;
+ sector_t be_f_offset; /* the starting offset in the file */
+ sector_t be_length; /* the size of the extent */
+ sector_t be_v_offset; /* the starting offset in the volume */
+ enum exstate4 be_state; /* the state of this extent */
+ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+
+/* Shortened extent used by LAYOUTCOMMIT */
+struct pnfs_block_short_extent {
+ struct list_head bse_node;
+ struct nfs4_deviceid bse_devid;
+ struct block_device *bse_mdev;
+ sector_t bse_f_offset; /* the starting offset in the file */
+ sector_t bse_length; /* the size of the extent */
+};
+
+/* Reset marks to empty; step size is the smaller of a page and blocksize */
+static inline void
+BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
+{
+	marks->im_block_size = blocksize;
+	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, blocksize);
+	spin_lock_init(&marks->im_lock);
+	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+	INIT_LIST_HEAD(&marks->im_extents);
+}
+
+enum extentclass4 {
+ RW_EXTENT = 0, /* READWRITE and INVAL */
+ RO_EXTENT = 1, /* READ and NONE */
+ EXTENT_LISTS = 2, /* number of classes; sizes bl_extents[] */
+};
+
+/* Map an extent state to the class (bl_extents[] index) that holds it */
+static inline int bl_choose_list(enum exstate4 state)
+{
+	if (state != PNFS_BLOCK_READ_DATA && state != PNFS_BLOCK_NONE_DATA)
+		return RW_EXTENT;
+	return RO_EXTENT;
+}
+
+struct pnfs_block_layout {
+ struct pnfs_layout_hdr bl_layout;
+ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+ spinlock_t bl_ext_lock; /* Protects list manipulation */
+ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
+ struct list_head bl_commit; /* Needs layout commit */
+ struct list_head bl_committing; /* Layout committing */
+ unsigned int bl_count; /* entries in bl_commit */
+ sector_t bl_blocksize; /* Server blocksize in sectors */
+};
+
+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
+
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+ return BLK_LO2EXT(lseg->pls_layout);
+}
+
+struct bl_dev_msg {
+ int32_t status; /* a BL_DEVICE_REQUEST_* code; replies are checked against BL_DEVICE_REQUEST_PROC */
+ uint32_t major, minor; /* device number, combined with MKDEV() / split with MAJOR()/MINOR() */
+};
+
+struct bl_msg_hdr {
+ u8 type; /* BL_DEVICE_MOUNT or BL_DEVICE_UMOUNT */
+ u16 totallen; /* NOTE(review): senders in this driver store the payload length only (header excluded) - confirm against the daemon */
+};
+
+extern struct dentry *bl_device_pipe;
+extern wait_queue_head_t bl_wq;
+
+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
+
+/* blocklayoutdev.c */
+ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
+struct block_device *nfs4_blkdev_get(dev_t dev);
+int nfs4_blkdev_put(struct block_device *bdev);
+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
+ struct pnfs_device *dev);
+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+
+/* blocklayoutdm.c */
+void bl_free_block_dev(struct pnfs_block_dev *bdev);
+
+/* extents.c */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent **cow_read);
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+ sector_t offset, sector_t length);
+void bl_put_extent(struct pnfs_block_extent *be);
+struct pnfs_block_extent *bl_alloc_extent(void);
+int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg);
+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ const struct nfs4_layoutcommit_args *arg,
+ int status);
+int bl_add_merge_extent(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+int bl_mark_for_commit(struct pnfs_block_extent *be,
+ sector_t offset, sector_t length,
+ struct pnfs_block_short_extent *new);
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 00000000000..d08ba9107fd
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,391 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
+ * Device operations for the pnfs nfs4 block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h> /* __bread */
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+	uint64_t bytes;	/* value as decoded from the stream, before shifting */
+
+	*rp = xdr_decode_hyper(*rp, &bytes);
+	if ((bytes & 0x1ff) == 0) {	/* low nine bits clear: aligned */
+		*sp = bytes >> SECTOR_SHIFT;
+		return 0;
+	}
+	printk(KERN_WARNING "%s: sector not aligned\n", __func__);
+	return -1;
+}
+
+/* Open a block_device by device number (FMODE_READ).
+ * Failure is reported as NULL, never as an ERR_PTR. */
+struct block_device *nfs4_blkdev_get(dev_t dev)
+{
+	struct block_device *bdev;
+
+	dprintk("%s enter\n", __func__);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (!IS_ERR(bdev))
+		return bdev;
+
+	dprintk("%s failed to open device : %ld\n",
+		__func__, PTR_ERR(bdev));
+	return NULL;
+}
+
+/*
+ * Release the block device (FMODE_READ); returns the result of blkdev_put()
+ */
+int nfs4_blkdev_put(struct block_device *bdev)
+{
+ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
+ MINOR(bdev->bd_dev));
+ return blkdev_put(bdev, FMODE_READ);
+}
+
+static struct bl_dev_msg bl_mount_reply;
+
+/* Pipe write handler: copy the daemon's bl_dev_msg into bl_mount_reply and
+ * wake whoever sleeps on bl_wq.  Returns mlen, -EINVAL or -EFAULT. */
+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+			 size_t mlen)
+{
+	if (mlen != sizeof(bl_mount_reply))
+		return -EINVAL;
+	if (copy_from_user(&bl_mount_reply, src, mlen))
+		return -EFAULT;
+
+	wake_up(&bl_wq);
+	return mlen;
+}
+
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ if (msg->errno >= 0)
+ return;
+ wake_up(&bl_wq);
+}
+
+/* Pass the device address held in dev->pages (dev->mincount bytes) to the
+ * userspace daemon, wait for its reply and open the block device it names.
+ * Returns a newly allocated pnfs_block_dev, or an ERR_PTR() on failure.
+ */
+struct pnfs_block_dev *
+nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev)
+{
+	struct pnfs_block_dev *rv;
+	struct block_device *bd = NULL;
+	struct rpc_pipe_msg msg;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_MOUNT,
+		.totallen = dev->mincount,
+	};
+	uint8_t *dataptr;
+	DECLARE_WAITQUEUE(wq, current);
+	struct bl_dev_msg *reply = &bl_mount_reply;
+	int offset, len, i, rc;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+		dev->mincount);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
+	if (!msg.data) {
+		rv = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+	dataptr = (uint8_t *) msg.data;
+	len = dev->mincount;
+	offset = sizeof(bl_msg);
+	for (i = 0; len > 0; i++) {
+		memcpy(&dataptr[offset], page_address(dev->pages[i]),
+		       len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		offset += PAGE_CACHE_SIZE;
+	}
+	msg.len = sizeof(bl_msg) + dev->mincount;
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	add_wait_queue(&bl_wq, &wq);
+	rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg);
+	if (rc < 0) {
+		remove_wait_queue(&bl_wq, &wq);
+		rv = ERR_PTR(rc);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		dprintk("%s failed to open device: %d\n",
+			__func__, reply->status);
+		rv = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
+	/* nfs4_blkdev_get() signals failure with NULL rather than an ERR_PTR */
+	if (IS_ERR_OR_NULL(bd)) {
+		rc = bd ? PTR_ERR(bd) : -ENODEV;
+		dprintk("%s failed to open device : %d\n", __func__, rc);
+		rv = ERR_PTR(rc);
+		goto out;
+	}
+
+	rv = kzalloc(sizeof(*rv), GFP_NOFS);
+	if (!rv) {
+		nfs4_blkdev_put(bd);	/* do not leak the open device */
+		rv = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	rv->bm_mdev = bd;
+	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
+	dprintk("%s Created device %s with bd_block_size %u\n",
+		__func__, bd->bd_disk->disk_name, bd->bd_block_size);
+
+out:
+	kfree(msg.data);
+	return rv;
+}
+
+/* Map deviceid returned by the server to constructed block_device; NULL when
+ * no entry on this mount's device list carries that id. */
+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
+					    struct nfs4_deviceid *id)
+{
+	struct block_mount_id *mount_id;
+	struct block_device *found = NULL;
+	struct pnfs_block_dev *cur;
+
+	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
+	mount_id = BLK_ID(lo);
+	spin_lock(&mount_id->bm_lock);
+	list_for_each_entry(cur, &mount_id->bm_devlist, bm_node) {
+		if (memcmp(id->data, cur->bm_mdevid.data,
+			   NFS4_DEVICEID4_SIZE) != 0)
+			continue;
+		found = cur->bm_mdev;
+		break;
+	}
+	spin_unlock(&mount_id->bm_lock);
+	dprintk("%s returning %p\n", __func__, found);
+	return found;
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.  Returns 0 and advances the cursors in lv, or -EIO.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+ struct layout_verification *lv)
+{
+ if (lv->mode == IOMODE_READ) { /* read layouts: no RW/INVAL extents, and each must start at lv->start */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA)
+ return -EIO;
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ }
+ /* lv->mode == IOMODE_RW */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ if (lv->cowread > lv->start) /* COW read coverage may not extend into a READWRITE extent */
+ return -EIO;
+ lv->start += be->be_length;
+ lv->inval = lv->start;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length; /* lv->inval is left alone, so it keeps marking where INVAL coverage began */
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+ if (be->be_f_offset > lv->start)
+ return -EIO;
+ if (be->be_f_offset < lv->inval)
+ return -EIO;
+ if (be->be_f_offset < lv->cowread)
+ return -EIO;
+ /* It looks like you might want to min this with lv->start,
+ * but you really don't.
+ */
+ lv->inval = lv->inval + be->be_length; /* NOTE(review): advances by the length, not to the extent's end - confirm intended */
+ lv->cowread = be->be_f_offset + be->be_length;
+ return 0;
+ } else
+ return -EIO;
+}
+
+/* XDR decode pnfs_block_layout4 in lgr and merge its extents into lo's layout */
+int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int i, status = -EIO;
+	uint32_t count;
+	struct pnfs_block_extent *be = NULL, *save;
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	__be32 *p;
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> SECTOR_SHIFT,
+		.inval = lgr->range.offset >> SECTOR_SHIFT,
+		.cowread = lgr->range.offset >> SECTOR_SHIFT,
+	};
+	LIST_HEAD(extents);
+
+	dprintk("---> %s\n", __func__);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		return -ENOMEM;
+
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err;
+
+	count = be32_to_cpup(p++);
+
+	dprintk("%s enter, number of extents %i\n", __func__, count);
+	/* count comes off the wire: don't let the size below wrap in 32 bits */
+	if (count > UINT_MAX / (28 + NFS4_DEVICEID4_SIZE))
+		goto out_err;
+	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
+	if (unlikely(!p))
+		goto out_err;
+
+	/* Decode individual extents, putting them in temporary staging area
+	 * until whole layout is decoded to make error recovery easier.
+	 */
+	for (i = 0; i < count; i++) {
+		be = bl_alloc_extent();
+		if (!be) {
+			status = -ENOMEM;
+			goto out_err;
+		}
+		memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+		be->be_mdev = translate_devid(lo, &be->be_devid);
+		if (!be->be_mdev)
+			goto out_err;
+
+		/* The next three values are read in as bytes,
+		 * but stored as 512-byte sector lengths */
+		if (decode_sector_number(&p, &be->be_f_offset) < 0)
+			goto out_err;
+		if (decode_sector_number(&p, &be->be_length) < 0)
+			goto out_err;
+		if (decode_sector_number(&p, &be->be_v_offset) < 0)
+			goto out_err;
+		be->be_state = be32_to_cpup(p++);
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+			be->be_inval = &bl->bl_inval;
+		if (verify_extent(be, &lv)) {
+			dprintk("%s verify failed\n", __func__);
+			goto out_err;
+		}
+		list_add_tail(&be->be_node, &extents);
+	}
+	if (lgr->range.offset + lgr->range.length !=
+			lv.start << SECTOR_SHIFT) {
+		dprintk("%s Final length mismatch\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		be = NULL;
+		goto out_err;
+	}
+	/* Extents decoded properly, now try to merge them in to existing
+	 * layout extents. */
+	spin_lock(&bl->bl_ext_lock);
+	list_for_each_entry_safe(be, save, &extents, be_node) {
+		list_del(&be->be_node);
+		status = bl_add_merge_extent(bl, be);
+		if (status) {
+			spin_unlock(&bl->bl_ext_lock);
+			/* This is a fairly catastrophic error, as the
+			 * entire layout extent lists are now corrupted.
+			 * We should have some way to distinguish this.
+			 */
+			be = NULL;
+			goto out_err;
+		}
+	}
+	spin_unlock(&bl->bl_ext_lock);
+	status = 0;
+ out:
+	__free_page(scratch);
+	dprintk("%s returns %i\n", __func__, status);
+	return status;
+
+ out_err:
+	bl_put_extent(be);
+	while (!list_empty(&extents)) {
+		be = list_first_entry(&extents, struct pnfs_block_extent,
+				      be_node);
+		list_del(&be->be_node);
+		bl_put_extent(be);
+	}
+	goto out;
+}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 00000000000..d055c755807
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdm.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Fred Isaman <iisaman@umich.edu>
+ * Andy Adamson <andros@citi.umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
+#include <linux/sched.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Ask the userspace daemon, over the blocklayout pipe, to remove device dev */
+static void dev_remove(dev_t dev)
+{
+	struct rpc_pipe_msg msg;
+	struct bl_dev_msg bl_umount_request;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_UMOUNT,
+		.totallen = sizeof(bl_umount_request),
+	};
+	DECLARE_WAITQUEUE(wq, current);
+
+	dprintk("Entering %s\n", __func__);
+
+	memset(&msg, 0, sizeof(msg));
+	/* Must hold the header plus the request copied in right behind it */
+	msg.data = kzalloc(sizeof(bl_msg) + bl_msg.totallen, GFP_NOFS);
+	if (!msg.data)
+		goto out;
+
+	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
+	bl_umount_request.major = MAJOR(dev);
+	bl_umount_request.minor = MINOR(dev);
+
+	memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+	memcpy((uint8_t *)msg.data + sizeof(bl_msg), &bl_umount_request, sizeof(bl_umount_request));
+	msg.len = sizeof(bl_msg) + bl_msg.totallen;
+
+	add_wait_queue(&bl_wq, &wq);
+	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+		remove_wait_queue(&bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&bl_wq, &wq);
+
+out:
+	kfree(msg.data);
+}
+
+/* Release meta device: put our reference, then ask userspace to remove it.
+ * The device number is read up front because the block_device must not be
+ * dereferenced once our reference has been dropped. */
+static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
+	dev_t devno = bdev->bm_mdev->bd_dev;
+	int rv;
+
+	dprintk("%s Releasing\n", __func__);
+	rv = nfs4_blkdev_put(bdev->bm_mdev);
+	if (rv)
+		printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", __func__, rv);
+
+	dev_remove(devno);
+}
+
+/* Free bdev, first releasing its meta device if one is attached; NULL is ok */
+void bl_free_block_dev(struct pnfs_block_dev *bdev)
+{
+	if (bdev == NULL)
+		return;
+	if (bdev->bm_mdev != NULL) {
+		dprintk("%s Removing DM device: %d:%d\n", __func__,
+			MAJOR(bdev->bm_mdev->bd_dev),
+			MINOR(bdev->bm_mdev->bd_dev));
+		nfs4_blk_metadev_release(bdev);
+	}
+	kfree(bdev);
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 00000000000..1abac09f7cd
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,909 @@
+/*
+ * linux/fs/nfs/blocklayout/extents.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@citi.umich.edu>
+ * Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include "blocklayout.h"
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Bit numbers */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN 1
+#define EXTENT_IN_COMMIT 2
+#define INTERNAL_EXISTS MY_MAX_TAGS
+#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
+
+/* Round s down to the nearest multiple of base. */
+static inline sector_t normalize(sector_t s, int base)
+{
+	sector_t quotient = s;	/* do_div() overwrites its first argument */
+	sector_t remainder = do_div(quotient, base);
+
+	return s - remainder;
+}
+
+/* Round s up to the nearest multiple of base. */
+static inline sector_t normalize_up(sector_t s, int base)
+{
+	sector_t bumped = s + base - 1;
+
+	return normalize(bumped, base);
+}
+
+/* Complete stub using list while determine API wanted */
+
+/*
+ * Look up the entry whose sector is exactly s.
+ * Returns its tag bits (masked with INTERNAL_MASK), or -ENOENT if there is
+ * no such entry.
+ */
+static int32_t _find_entry(struct my_tree *tree, u64 s)
+{
+	struct pnfs_inval_tracking *cur;
+
+	dprintk("%s(%llu) enter\n", __func__, s);
+	/* Scan from the tail; stop at the first entry below s. */
+	list_for_each_entry_reverse(cur, &tree->mtt_stub, it_link) {
+		if (cur->it_sector < s)
+			break;
+		if (cur->it_sector == s)
+			return cur->it_tags & INTERNAL_MASK;
+	}
+	return -ENOENT;
+}
+
+/*
+ * Returns 1 if the entry covering sector s (rounded down to the tree's
+ * step size) exists and has bit @tag set, 0 otherwise.
+ */
+static inline
+int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
+{
+	int32_t found;
+
+	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+	found = _find_entry(tree, normalize(s, tree->mtt_step_size));
+	return found >= 0 && (found & (1 << tag)) != 0;
+}
+
+/* Creates entry with tag, or if entry already exists, unions tag to it.
+ * A newly created entry uses the memory passed in storage; if an entry has
+ * to be created and storage is NULL, nothing is added and -ENOMEM is
+ * returned.
+ * Returns number of entries added (0 or 1), or negative on error.
+ */
+static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
+		      struct pnfs_inval_tracking *storage)
+{
+	int found = 0;
+	struct pnfs_inval_tracking *pos;
+
+	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
+	/*
+	 * Scan from the tail for either the entry at s or the first entry
+	 * below it.  If the scan runs off the list, &pos->it_link is the
+	 * list head itself, which is still a valid insertion point.
+	 */
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector > s)
+			continue;
+		else if (pos->it_sector == s) {
+			found = 1;
+			break;
+		} else
+			break;
+	}
+	if (found) {
+		pos->it_tags |= (1 << tag);
+		return 0;
+	}
+
+	/* A new entry is needed but the caller gave us nothing to build it in */
+	if (!storage)
+		return -ENOMEM;
+
+	storage->it_sector = s;
+	storage->it_tags = (1 << tag);
+	list_add(&storage->it_link, &pos->it_link);
+	return 1;
+}
+
+/* XXXX Really want option to not create */
+/* Sets tag on every step-sized entry from normalize(s) up to s + length.
+ * No storage is handed to _add_entry, so any non-zero result from it
+ * (an entry was missing) is reported as -ENOMEM.
+ */
+static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
+{
+	u64 cur;
+
+	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
+	cur = normalize(s, tree->mtt_step_size);
+	while (cur < s + length) {
+		if (_add_entry(tree, cur, tag, NULL) != 0)
+			return -ENOMEM;
+		cur += tree->mtt_step_size;
+	}
+	return 0;
+}
+
+/* Ensure that future operations on given range of tree will not malloc.
+ *
+ * Allocates one tracking entry per step in the (step-aligned) range outside
+ * the lock, then inserts them with the INTERNAL_EXISTS tag under im_lock.
+ * Entries that turn out to exist already leave their preallocated storage
+ * unused; that storage is freed before returning.
+ * Returns 0 on success, -ENOMEM if any allocation fails.
+ */
+static int _preload_range(struct pnfs_inval_markings *marks,
+			  u64 offset, u64 length)
+{
+	u64 start, end, s;
+	int count, i, used = 0, status = -ENOMEM;
+	struct pnfs_inval_tracking **storage;
+	struct my_tree *tree = &marks->im_tree;
+
+	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
+	start = normalize(offset, tree->mtt_step_size);
+	end = normalize_up(offset + length, tree->mtt_step_size);
+	count = (int)(end - start) / (int)tree->mtt_step_size;
+
+	/* Pre-malloc what memory we might need.  kcalloc checks the
+	 * count * size multiplication for overflow and zeroes the array,
+	 * so the cleanup loop below never sees an uninitialized slot.
+	 */
+	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
+	if (!storage)
+		return -ENOMEM;
+	for (i = 0; i < count; i++) {
+		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
+				     GFP_NOFS);
+		if (!storage[i])
+			goto out_cleanup;
+	}
+
+	spin_lock_bh(&marks->im_lock);
+	/* storage[used] is consumed only when _add_entry returns 1 */
+	for (s = start; s < end; s += tree->mtt_step_size)
+		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
+	spin_unlock_bh(&marks->im_lock);
+
+	status = 0;
+
+ out_cleanup:
+	/* Free whatever was allocated but not linked into the tree */
+	for (i = used; i < count; i++) {
+		if (!storage[i])
+			break;
+		kfree(storage[i]);
+	}
+	kfree(storage);
+	return status;
+}
+
+/* We are relying on page lock to serialize this */
+/* Returns 1 if isect carries the EXTENT_INITIALIZED tag, 0 otherwise. */
+int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
+{
+	int initialized;
+
+	spin_lock_bh(&marks->im_lock);
+	initialized = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
+	spin_unlock_bh(&marks->im_lock);
+
+	return initialized;
+}
+
+/* Assume start, end already sector aligned */
+/*
+ * Returns 1 if every step-sized entry covering [start, end) exists and has
+ * bit @tag set, 0 otherwise.
+ *
+ * The list is walked from the tail.  The first entry below end must sit at
+ * exactly end - step; each following entry must sit exactly one step lower
+ * than the previous one, until the entry at (or just above) start is seen.
+ */
+static int
+_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
+{
+	struct pnfs_inval_tracking *pos;
+	u64 expect = 0;
+	/*
+	 * Explicit "top entry matched" flag.  Using expect == 0 for this
+	 * made a range starting at sector 0 and spanning more than one
+	 * step look unstarted again once expect legitimately reached 0.
+	 */
+	int started = 0;
+
+	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector >= end)
+			continue;
+		if (!started) {
+			if (pos->it_sector != end - tree->mtt_step_size)
+				return 0;
+			started = 1;
+		} else if (pos->it_sector != expect) {
+			return 0;
+		}
+		if (!(pos->it_tags & (1 << tag)))
+			return 0;
+		/*
+		 * Done if this was the lowest entry needed.  The first test
+		 * guards the subtraction below against wrapping.
+		 */
+		if (pos->it_sector < tree->mtt_step_size ||
+		    pos->it_sector - tree->mtt_step_size < start)
+			return 1;
+		expect = pos->it_sector - tree->mtt_step_size;
+	}
+	return 0;
+}
+
+/*
+ * Locked wrapper around _range_has_tag(): returns 1 if the whole of
+ * [start, end) is tagged EXTENT_WRITTEN, 0 otherwise.
+ */
+static int is_range_written(struct pnfs_inval_markings *marks,
+			    sector_t start, sector_t end)
+{
+	int written;
+
+	spin_lock_bh(&marks->im_lock);
+	written = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
+	spin_unlock_bh(&marks->im_lock);
+
+	return written;
+}
+
+/* Marks sectors in [offset, offset+length) as having been initialized.
+ * All lengths are step-aligned, where step is min(pagesize, blocksize).
+ * Currently assumes offset is page-aligned
+ *
+ * Returns 0 on success, -ENOMEM on any failure.
+ */
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+			 sector_t offset, sector_t length)
+{
+	sector_t start, end;
+	int status = -ENOMEM;
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n",
+		__func__, (u64)offset, (u64)length);
+
+	/* Preallocate entries for the block-aligned range, outside the lock */
+	start = normalize(offset, marks->im_block_size);
+	end = normalize_up(offset + length, marks->im_block_size);
+	if (_preload_range(marks, start, end - start))
+		return -ENOMEM;
+
+	spin_lock_bh(&marks->im_lock);
+	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length) == 0)
+		status = 0;
+	spin_unlock_bh(&marks->im_lock);
+
+	return status;
+}
+
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ * Returns whatever _set_range() returns.
+ */
+static int mark_written_sectors(struct pnfs_inval_markings *marks,
+				sector_t offset, sector_t length)
+{
+	int err;
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+		(u64)offset, (u64)length);
+
+	spin_lock_bh(&marks->im_lock);
+	err = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
+	spin_unlock_bh(&marks->im_lock);
+
+	return err;
+}
+
+/* Debug dump of one short extent; a NULL pointer prints only the address. */
+static void print_short_extent(struct pnfs_block_short_extent *be)
+{
+	dprintk("PRINT SHORT EXTENT extent %p\n", be);
+	if (!be)
+		return;
+
+	dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
+	dprintk(" be_length %llu\n", (u64)be->bse_length);
+}
+
+/*
+ * Inside the ifdebug(FACILITY) guard, dump every short extent on @list and
+ * print a notice if the number of entries seen differs from @count.
+ */
+static void print_clist(struct list_head *list, unsigned int count)
+{
+	struct pnfs_block_short_extent *cur;
+	unsigned int seen = 0;
+
+	ifdebug(FACILITY) {
+		printk(KERN_DEBUG "****************\n");
+		printk(KERN_DEBUG "Extent list looks like:\n");
+		list_for_each_entry(cur, list, bse_node) {
+			seen += 1;
+			print_short_extent(cur);
+		}
+		if (seen != count)
+			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
+		printk(KERN_DEBUG "****************\n");
+	}
+}
+
+/* Note: In theory, we should do more checking that devid's match between
+ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
+ */
+/* Note this is very similar to bl_add_merge_extent */
+/*
+ * Inserts new into bl->bl_commit, merging it with entries it overlaps or
+ * abuts when they share the same bse_mdev.  bl->bl_count is kept in step
+ * with the number of list entries.  new is freed here if an existing entry
+ * already covers its whole range.
+ * The callers visible in this file take bl->bl_ext_lock around the call.
+ */
+static void add_to_commitlist(struct pnfs_block_layout *bl,
+ struct pnfs_block_short_extent *new)
+{
+ struct list_head *clist = &bl->bl_commit;
+ struct pnfs_block_short_extent *old, *save;
+ /* end is kept current as new grows to the right in the second pass */
+ sector_t end = new->bse_f_offset + new->bse_length;
+
+ dprintk("%s enter\n", __func__);
+ print_short_extent(new);
+ print_clist(clist, bl->bl_count);
+ /* Count new up front; undone below if new is dropped */
+ bl->bl_count++;
+ /* Scan for proper place to insert, extending new to the left
+ * as much as possible.
+ */
+ list_for_each_entry_safe(old, save, clist, bse_node) {
+ /* new starts before old: insert in front of old */
+ if (new->bse_f_offset < old->bse_f_offset)
+ break;
+ if (end <= old->bse_f_offset + old->bse_length) {
+ /* Range is already in list */
+ bl->bl_count--;
+ kfree(new);
+ return;
+ } else if (new->bse_f_offset <=
+ old->bse_f_offset + old->bse_length) {
+ /* new overlaps or abuts existing be */
+ if (new->bse_mdev == old->bse_mdev) {
+ /* extend new to fully replace old */
+ /* new grows leftwards only, so end stays unchanged */
+ new->bse_length += new->bse_f_offset -
+ old->bse_f_offset;
+ new->bse_f_offset = old->bse_f_offset;
+ list_del(&old->bse_node);
+ bl->bl_count--;
+ kfree(old);
+ }
+ }
+ }
+ /* Note that if we never hit the above break, old will not point to a
+ * valid extent. However, in that case &old->bse_node==list.
+ */
+ /* list_add_tail() places new immediately before old */
+ list_add_tail(&new->bse_node, &old->bse_node);
+ /* Scan forward for overlaps. If we find any, extend new and
+ * remove the overlapped extent.
+ */
+ old = list_prepare_entry(new, clist, bse_node);
+ list_for_each_entry_safe_continue(old, save, clist, bse_node) {
+ /* Gap between new and old: nothing further can overlap */
+ if (end < old->bse_f_offset)
+ break;
+ /* new overlaps or abuts old */
+ if (new->bse_mdev == old->bse_mdev) {
+ if (end < old->bse_f_offset + old->bse_length) {
+ /* extend new to fully cover old */
+ end = old->bse_f_offset + old->bse_length;
+ new->bse_length = end - new->bse_f_offset;
+ }
+ list_del(&old->bse_node);
+ bl->bl_count--;
+ kfree(old);
+ }
+ }
+ dprintk("%s: after merging\n", __func__);
+ print_clist(clist, bl->bl_count);
+}
+
+/* Note the range described by offset, length is guaranteed to be contained
+ * within be.
+ * new will be freed, either by this function or add_to_commitlist if they
+ * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
+ *
+ * Tags [offset, offset+length) as written, then queues for commit the
+ * largest block-aligned range around it that is entirely written.
+ * Always returns 0.
+ */
+int bl_mark_for_commit(struct pnfs_block_extent *be,
+ sector_t offset, sector_t length,
+ struct pnfs_block_short_extent *new)
+{
+ sector_t new_end, end = offset + length;
+ /* be_inval points at the bl_inval member embedded in the layout */
+ struct pnfs_block_layout *bl = container_of(be->be_inval,
+ struct pnfs_block_layout,
+ bl_inval);
+
+ /* NOTE(review): the return value is ignored here - confirm intended */
+ mark_written_sectors(be->be_inval, offset, length);
+ /* We want to add the range to commit list, but it must be
+ * block-normalized, and verified that the normalized range has
+ * been entirely written to disk.
+ */
+ new->bse_f_offset = offset;
+ offset = normalize(offset, bl->bl_blocksize);
+ if (offset < new->bse_f_offset) {
+ /* Start is mid-block: take the whole block only if its head is
+ * already written, otherwise start at the next block boundary.
+ */
+ if (is_range_written(be->be_inval, offset, new->bse_f_offset))
+ new->bse_f_offset = offset;
+ else
+ new->bse_f_offset = offset + bl->bl_blocksize;
+ }
+ new_end = normalize_up(end, bl->bl_blocksize);
+ if (end < new_end) {
+ /* Same treatment for the tail of the last block */
+ if (is_range_written(be->be_inval, end, new_end))
+ end = new_end;
+ else
+ end = new_end - bl->bl_blocksize;
+ }
+ /* No complete block is fully written yet: nothing to commit */
+ if (end <= new->bse_f_offset) {
+ kfree(new);
+ return 0;
+ }
+ new->bse_length = end - new->bse_f_offset;
+ new->bse_devid = be->be_devid;
+ new->bse_mdev = be->be_mdev;
+
+ spin_lock(&bl->bl_ext_lock);
+ add_to_commitlist(bl, new);
+ spin_unlock(&bl->bl_ext_lock);
+ return 0;
+}
+
+/* Debug dump of one extent; a NULL pointer prints only the address. */
+static void print_bl_extent(struct pnfs_block_extent *be)
+{
+	dprintk("PRINT EXTENT extent %p\n", be);
+	if (!be)
+		return;
+
+	dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
+	dprintk(" be_length %llu\n", (u64)be->be_length);
+	dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
+	dprintk(" be_state %d\n", be->be_state);
+}
+
+/* kref release callback: frees the extent that embeds @kref. */
+static void
+destroy_extent(struct kref *kref)
+{
+	struct pnfs_block_extent *be =
+		container_of(kref, struct pnfs_block_extent, be_refcnt);
+
+	dprintk("%s be=%p\n", __func__, be);
+	kfree(be);
+}
+
+/*
+ * Drop one reference on @be; destroy_extent() runs when the last one goes.
+ * A NULL @be is accepted and ignored.
+ */
+void
+bl_put_extent(struct pnfs_block_extent *be)
+{
+	if (!be)
+		return;
+
+	dprintk("%s enter %p (%i)\n", __func__, be,
+		atomic_read(&be->be_refcnt.refcount));
+	kref_put(&be->be_refcnt, destroy_extent);
+}
+
+/*
+ * Allocate an extent with its list node and refcount initialised and
+ * be_inval cleared; the remaining fields are left uninitialised.
+ * Returns NULL if the allocation fails.
+ */
+struct pnfs_block_extent *bl_alloc_extent(void)
+{
+	struct pnfs_block_extent *be;
+
+	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
+	if (be) {
+		INIT_LIST_HEAD(&be->be_node);
+		kref_init(&be->be_refcnt);
+		be->be_inval = NULL;
+	}
+	return be;
+}
+
+/* Debug dump of every extent on @list, framed by separator lines. */
+static void print_elist(struct list_head *list)
+{
+	struct pnfs_block_extent *cur;
+
+	dprintk("****************\n");
+	dprintk("Extent list looks like:\n");
+	list_for_each_entry(cur, list, be_node)
+		print_bl_extent(cur);
+	dprintk("****************\n");
+}
+
+/*
+ * Returns 1 if old and new may describe the same mapping: equal state and,
+ * unless that state is PNFS_BLOCK_NONE_DATA, the same mdev with volume
+ * offsets that differ by exactly as much as the file offsets do.
+ */
+static inline int
+extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
+{
+	/* Note this assumes new->be_f_offset >= old->be_f_offset */
+	if (new->be_state != old->be_state)
+		return 0;
+	if (new->be_state == PNFS_BLOCK_NONE_DATA)
+		return 1;
+	if (new->be_v_offset - old->be_v_offset !=
+	    new->be_f_offset - old->be_f_offset)
+		return 0;
+	return new->be_mdev == old->be_mdev;
+}
+
+/* Adds new to appropriate list in bl, modifying new and removing existing
+ * extents as appropriate to deal with overlaps.
+ *
+ * See bl_find_get_extent for list constraints.
+ *
+ * Refcount on new is already set. If end up not using it, or error out,
+ * need to put the reference.
+ *
+ * bl->bl_ext_lock is held by caller.
+ *
+ * Returns 0 on success (new was inserted, or dropped because an existing
+ * extent already covers it), -EIO if new overlaps an existing extent it is
+ * not consistent with.  In the -EIO case new has been put.
+ */
+int
+bl_add_merge_extent(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new)
+{
+ struct pnfs_block_extent *be, *tmp;
+ /* end is computed once; new's end sector is not moved by the left extension */
+ sector_t end = new->be_f_offset + new->be_length;
+ struct list_head *list;
+
+ dprintk("%s enter with be=%p\n", __func__, new);
+ print_bl_extent(new);
+ list = &bl->bl_extents[bl_choose_list(new->be_state)];
+ print_elist(list);
+
+ /* Scan for proper place to insert, extending new to the left
+ * as much as possible.
+ */
+ list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
+ /* be ends at or before new starts: new goes right after be */
+ if (new->be_f_offset >= be->be_f_offset + be->be_length)
+ break;
+ if (new->be_f_offset >= be->be_f_offset) {
+ if (end <= be->be_f_offset + be->be_length) {
+ /* new is a subset of existing be*/
+ if (extents_consistent(be, new)) {
+ dprintk("%s: new is subset, ignoring\n",
+ __func__);
+ bl_put_extent(new);
+ return 0;
+ } else {
+ goto out_err;
+ }
+ } else {
+ /* |<-- be -->|
+ * |<-- new -->| */
+ if (extents_consistent(be, new)) {
+ /* extend new to fully replace be */
+ new->be_length += new->be_f_offset -
+ be->be_f_offset;
+ new->be_f_offset = be->be_f_offset;
+ new->be_v_offset = be->be_v_offset;
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else {
+ goto out_err;
+ }
+ }
+ } else if (end >= be->be_f_offset + be->be_length) {
+ /* new extent overlap existing be */
+ /* be lies wholly inside new, so new needs no resizing */
+ if (extents_consistent(be, new)) {
+ /* extend new to fully replace be */
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else {
+ goto out_err;
+ }
+ } else if (end > be->be_f_offset) {
+ /* |<-- be -->|
+ *|<-- new -->| */
+ /* Arguments swapped: here new is the one that starts first */
+ if (extents_consistent(new, be)) {
+ /* extend new to fully replace be */
+ new->be_length += be->be_f_offset + be->be_length -
+ new->be_f_offset - new->be_length;
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else {
+ goto out_err;
+ }
+ }
+ }
+ /* Note that if we never hit the above break, be will not point to a
+ * valid extent. However, in that case &be->be_node==list.
+ */
+ list_add(&new->be_node, &be->be_node);
+ dprintk("%s: inserting new\n", __func__);
+ print_elist(list);
+ /* FIXME - The per-list consistency checks have all been done,
+ * should now check cross-list consistency.
+ */
+ return 0;
+
+ out_err:
+ bl_put_extent(new);
+ return -EIO;
+}
+
+/* Returns extent, or NULL. If a second READ extent exists, it is returned
+ * in cow_read, if given.
+ *
+ * The extents are kept in two separate ordered lists, one for READ and NONE,
+ * one for READWRITE and INVALID. Within each list, we assume:
+ * 1. Extents are ordered by file offset.
+ * 2. For any given isect, there is at most one extent that matches.
+ *
+ * A reference is taken on every extent handed back (the return value and
+ * *cow_read).  Takes and releases bl->bl_ext_lock internally.
+ */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent **cow_read)
+{
+ struct pnfs_block_extent *be, *cow, *ret;
+ int i;
+
+ dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+ cow = ret = NULL;
+ spin_lock(&bl->bl_ext_lock);
+ for (i = 0; i < EXTENT_LISTS; i++) {
+ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+ /* be ends at or before isect: stop scanning this list */
+ if (isect >= be->be_f_offset + be->be_length)
+ break;
+ if (isect >= be->be_f_offset) {
+ /* We have found an extent */
+ dprintk("%s Get %p (%i)\n", __func__, be,
+ atomic_read(&be->be_refcnt.refcount));
+ kref_get(&be->be_refcnt);
+ /* First hit is the result; a later hit is kept as the
+ * cow extent only if it is READ_DATA, else its
+ * reference is dropped again.
+ */
+ if (!ret)
+ ret = be;
+ else if (be->be_state != PNFS_BLOCK_READ_DATA)
+ bl_put_extent(be);
+ else
+ cow = be;
+ break;
+ }
+ }
+ /* Only keep looking in the next list when the caller asked for a
+ * cow extent and the one found is INVALID_DATA.
+ */
+ if (ret &&
+ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
+ break;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+ if (cow_read)
+ *cow_read = cow;
+ print_bl_extent(ret);
+ return ret;
+}
+
+/* Similar to bl_find_get_extent, but called with lock held, and ignores cow.
+ * Returns the first extent containing isect with a reference taken, or NULL.
+ */
+static struct pnfs_block_extent *
+bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
+{
+	struct pnfs_block_extent *cur, *found = NULL;
+	int idx;
+
+	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+	for (idx = 0; idx < EXTENT_LISTS && !found; idx++) {
+		list_for_each_entry_reverse(cur, &bl->bl_extents[idx], be_node) {
+			/* cur ends at or before isect: give up on this list */
+			if (isect >= cur->be_f_offset + cur->be_length)
+				break;
+			/* cur starts after isect: keep walking toward the head */
+			if (isect < cur->be_f_offset)
+				continue;
+			/* We have found an extent */
+			dprintk("%s Get %p (%i)\n", __func__, cur,
+				atomic_read(&cur->be_refcnt.refcount));
+			kref_get(&cur->be_refcnt);
+			found = cur;
+			break;
+		}
+	}
+	print_bl_extent(found);
+	return found;
+}
+
+/*
+ * XDR-encodes the entries of bl->bl_commit into xdr and moves each encoded
+ * entry to bl->bl_committing.
+ *
+ * Layout written: a length word, a count word, then per entry the device
+ * id, three 64-bit values (file offset in bytes, length in bytes, and 0)
+ * and the state PNFS_BLOCK_READWRITE_DATA.
+ *
+ * Always returns 0, also when no space could be reserved in xdr.
+ * arg is not used by this function.
+ */
+int
+encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg)
+{
+ struct pnfs_block_short_extent *lce, *save;
+ unsigned int count = 0;
+ __be32 *p, *xdr_start;
+
+ dprintk("%s enter\n", __func__);
+ /* BUG - creation of bl_commit is buggy - need to wait for
+ * entire block to be marked WRITTEN before it can be added.
+ */
+ spin_lock(&bl->bl_ext_lock);
+ /* Want to adjust for possible truncate */
+ /* We now want to adjust argument range */
+
+ /* XDR encode the ranges found */
+ /* Two words up front, filled in after the loop: byte length and count */
+ xdr_start = xdr_reserve_space(xdr, 8);
+ if (!xdr_start)
+ goto out;
+ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
+ /* 7 words = three hypers plus the state word, then the device id */
+ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
+ /* Out of space: entries not yet encoded stay on bl_commit */
+ if (!p)
+ break;
+ p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+ list_del(&lce->bse_node);
+ list_add_tail(&lce->bse_node, &bl->bl_committing);
+ bl->bl_count--;
+ count++;
+ }
+ /* Byte length of everything that follows the length word itself */
+ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
+ xdr_start[1] = cpu_to_be32(count);
+out:
+ spin_unlock(&bl->bl_ext_lock);
+ dprintk("%s found %i ranges\n", __func__, count);
+ return 0;
+}
+
+/* Helper function to set_to_rw that initialize a new extent.
+ * new takes its device, mdev and be_inval from orig, covers
+ * [offset, offset+length) in the given state, and keeps the same
+ * file-to-volume offset mapping as orig.  Its refcount is initialised.
+ */
+static void
+_prep_new_extent(struct pnfs_block_extent *new,
+		 struct pnfs_block_extent *orig,
+		 sector_t offset, sector_t length, int state)
+{
+	kref_init(&new->be_refcnt);
+	/* don't need to INIT_LIST_HEAD(&new->be_node) */
+
+	/* Inherited from orig */
+	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
+	new->be_mdev = orig->be_mdev;
+
+	/* Range and state of the new piece */
+	new->be_f_offset = offset;
+	new->be_length = length;
+	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
+	new->be_state = state;
+
+	new->be_inval = orig->be_inval;
+}
+
+/* Tries to merge be with extent in front of it in list.
+ * Frees storage if not used.
+ *
+ * On a merge, storage is set up as one extent spanning both, takes the
+ * predecessor's place in the list, and both old extents are put.
+ * Returns storage if a merge happened, otherwise be.
+ */
+static struct pnfs_block_extent *
+_front_merge(struct pnfs_block_extent *be, struct list_head *head,
+ struct pnfs_block_extent *storage)
+{
+ struct pnfs_block_extent *prev;
+
+ if (!storage)
+ goto no_merge;
+ /* be is the list head itself, or the first entry: no predecessor */
+ if (&be->be_node == head || be->be_node.prev == head)
+ goto no_merge;
+ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
+ /* Only merge extents that are adjacent and describe the same mapping */
+ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
+ !extents_consistent(prev, be))
+ goto no_merge;
+ _prep_new_extent(storage, prev, prev->be_f_offset,
+ prev->be_length + be->be_length, prev->be_state);
+ list_replace(&prev->be_node, &storage->be_node);
+ bl_put_extent(prev);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ return storage;
+
+ no_merge:
+ kfree(storage);
+ return be;
+}
+
+/*
+ * Turn the part of an INVALID_DATA extent that lies in
+ * [offset, offset + length) into READWRITE_DATA.
+ *
+ * The extent containing offset is replaced by up to three pieces (an
+ * INVALID head, the READWRITE middle, an INVALID tail); when the head or
+ * tail is not needed, a merge with the neighbouring extent is attempted
+ * instead.
+ *
+ * Returns the end sector of the extent found at offset, so the caller can
+ * continue from there.  If the preallocation fails or no extent contains
+ * offset, nothing is changed and offset + length is returned.
+ */
+static u64
+set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
+{
+	u64 rv = offset + length;
+	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
+	struct pnfs_block_extent *children[3];
+	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
+	int i = 0, j;
+
+	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
+	/* Create storage for up to three new extents e1, e2, e3 */
+	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
+	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
+	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
+	/* BUG - we are ignoring any failure */
+	if (!e1 || !e2 || !e3)
+		goto out_nosplit;
+
+	spin_lock(&bl->bl_ext_lock);
+	be = bl_find_get_extent_locked(bl, offset);
+	if (!be) {
+		/* Nothing covers offset.  rv still equals offset + length,
+		 * so the caller's loop ends instead of dereferencing NULL.
+		 */
+		spin_unlock(&bl->bl_ext_lock);
+		goto out_nosplit;
+	}
+	rv = be->be_f_offset + be->be_length;
+	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
+		spin_unlock(&bl->bl_ext_lock);
+		/* Drop the reference taken by the lookup above */
+		bl_put_extent(be);
+		goto out_nosplit;
+	}
+	/* Add e* to children, bumping e*'s krefs */
+	if (be->be_f_offset != offset) {
+		_prep_new_extent(e1, be, be->be_f_offset,
+				 offset - be->be_f_offset,
+				 PNFS_BLOCK_INVALID_DATA);
+		children[i++] = e1;
+		print_bl_extent(e1);
+	} else
+		merge1 = e1;
+	_prep_new_extent(e2, be, offset,
+			 min(length, be->be_f_offset + be->be_length - offset),
+			 PNFS_BLOCK_READWRITE_DATA);
+	children[i++] = e2;
+	print_bl_extent(e2);
+	if (offset + length < be->be_f_offset + be->be_length) {
+		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
+				 be->be_f_offset + be->be_length -
+				 offset - length,
+				 PNFS_BLOCK_INVALID_DATA);
+		children[i++] = e3;
+		print_bl_extent(e3);
+	} else
+		merge2 = e3;
+
+	/* Remove be from list, and insert the e* */
+	/* We don't get refs on e*, since this list is the base reference
+	 * set when init'ed.
+	 */
+	if (i < 3)
+		children[i] = NULL;
+	new = children[0];
+	list_replace(&be->be_node, &new->be_node);
+	bl_put_extent(be);
+	/* merge1 is either NULL or the unused e1; _front_merge consumes it */
+	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
+	for (j = 1; j < i; j++) {
+		old = new;
+		new = children[j];
+		list_add(&new->be_node, &old->be_node);
+	}
+	if (merge2) {
+		/* This is a HACK, should just create a _back_merge function */
+		new = list_entry(new->be_node.next,
+				 struct pnfs_block_extent, be_node);
+		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	/* Since we removed the base reference above, be is now scheduled for
+	 * destruction.
+	 */
+	bl_put_extent(be);
+	dprintk("%s returns %llu after split\n", __func__, rv);
+	return rv;
+
+ out_nosplit:
+	kfree(e1);
+	kfree(e2);
+	kfree(e3);
+	dprintk("%s returns %llu without splitting\n", __func__, rv);
+	return rv;
+}
+
+/*
+ * Dispose of every entry on bl->bl_committing.
+ * With status == 0 the range of each entry is converted via set_to_rw()
+ * and the entry is freed; otherwise the entry is handed back to
+ * add_to_commitlist().  arg is not used.
+ */
+void
+clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+			      const struct nfs4_layoutcommit_args *arg,
+			      int status)
+{
+	struct pnfs_block_short_extent *lce, *save;
+
+	dprintk("%s status %d\n", __func__, status);
+	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
+		u64 cur, stop;
+
+		if (unlikely(status)) {
+			/* Failure: put the entry back on the commit list */
+			list_del(&lce->bse_node);
+			spin_lock(&bl->bl_ext_lock);
+			add_to_commitlist(bl, lce);
+			spin_unlock(&bl->bl_ext_lock);
+			continue;
+		}
+
+		cur = lce->bse_f_offset;
+		stop = cur + lce->bse_length;
+		/* set_to_rw() reports how far it got; repeat until done */
+		do {
+			cur = set_to_rw(bl, cur, stop - cur);
+		} while (cur < stop);
+		list_del(&lce->bse_node);
+
+		kfree(lce);
+	}
+}
+
+/*
+ * Allocate one short extent (contents uninitialised) and add it at the
+ * front of marks->im_extents.
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_block_short_extent *spare = kmalloc(sizeof(*spare), GFP_NOFS);
+
+	if (unlikely(!spare))
+		return -ENOMEM;
+
+	spin_lock_bh(&marks->im_lock);
+	list_add(&spare->bse_node, &marks->im_extents);
+	spin_unlock_bh(&marks->im_lock);
+
+	return 0;
+}
+
+/*
+ * Unlink and return the first short extent on marks->im_extents,
+ * or NULL if the list is empty.
+ */
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
+{
+	struct list_head *head = &marks->im_extents;
+	struct pnfs_block_short_extent *first = NULL;
+
+	spin_lock_bh(&marks->im_lock);
+	if (!list_empty(head)) {
+		first = list_entry(head->next,
+				   struct pnfs_block_short_extent, bse_node);
+		list_del_init(&first->bse_node);
+	}
+	spin_unlock_bh(&marks->im_lock);
+
+	return first;
+}
+
+/*
+ * Free up to num_to_free short extents from the front of
+ * marks->im_extents.  Does nothing for num_to_free <= 0.
+ * BUGs if the list holds fewer entries than requested.
+ */
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
+{
+	struct pnfs_block_short_extent *se = NULL, *tmp;
+
+	if (num_to_free <= 0)
+		return;
+
+	/*
+	 * Use the _bh variant like every other im_lock user in this file,
+	 * so the lock is never held with bottom halves enabled.
+	 */
+	spin_lock_bh(&marks->im_lock);
+	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
+		list_del(&se->bse_node);
+		kfree(se);
+		if (--num_to_free == 0)
+			break;
+	}
+	spin_unlock_bh(&marks->im_lock);
+
+	BUG_ON(num_to_free > 0);
+}
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 00000000000..c98b439332f
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+ "/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+ sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+ "the cache upcall is assumed to have failed");
+
+/*
+ * Run the user-space helper as "<cache_getent> <cd->name> <entry_name>".
+ * Returns -EACCES if the helper path is empty (upcalls disabled), 0 if
+ * call_usermodehelper() returned a positive value, otherwise its result.
+ */
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+	static char *envp[] = { "HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[] = {
+		nfs_cache_getent_prog,
+		cd->name,
+		entry_name,
+		NULL
+	};
+	int ret;
+
+	if (nfs_cache_getent_prog[0] == '\0')
+		return -EACCES;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or
+	 * EACCES error. The admin can re-enable it on the fly by using
+	 * sysfs to set the 'cache_getent' parameter once the problem
+	 * has been fixed.
+	 */
+	switch (ret) {
+	case -ENOENT:
+	case -EACCES:
+		nfs_cache_getent_prog[0] = '\0';
+		break;
+	default:
+		break;
+	}
+
+	if (ret > 0)
+		return 0;
+	return ret;
+}
+
+/*
+ * Deferred request handling
+ */
+/* Drop one reference on dreq; the last put frees it. */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+	if (!atomic_dec_and_test(&dreq->count))
+		return;
+
+	kfree(dreq);
+}
+
+/*
+ * ->revisit callback: signal the completion of the request embedding d and
+ * drop one reference on it.  toomany is not used.
+ */
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+	struct nfs_cache_defer_req *dreq =
+		container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+	complete_all(&dreq->completion);
+	nfs_cache_defer_req_put(dreq);
+}
+
+/*
+ * ->defer callback: take an extra reference on the request embedding req,
+ * install the revisit handler and return its embedded deferred_req.
+ */
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+	struct nfs_cache_defer_req *dreq =
+		container_of(req, struct nfs_cache_defer_req, req);
+
+	dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+	atomic_inc(&dreq->count);
+
+	return &dreq->deferred_req;
+}
+
+/*
+ * Allocate a zeroed deferred-request tracker holding one reference, with
+ * its completion initialised and its ->defer hook set.
+ * Returns NULL if the allocation fails.
+ */
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+	struct nfs_cache_defer_req *dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+
+	if (!dreq)
+		return NULL;
+
+	init_completion(&dreq->completion);
+	atomic_set(&dreq->count, 1);
+	dreq->req.defer = nfs_dns_cache_defer;
+	return dreq;
+}
+
+/*
+ * Wait up to cache_getent_timeout seconds for dreq's completion.
+ * Returns 0 if it completed in time, -ETIMEDOUT otherwise.
+ */
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+	unsigned long left;
+
+	left = wait_for_completion_timeout(&dreq->completion,
+					   nfs_cache_getent_timeout * HZ);
+	return left ? 0 : -ETIMEDOUT;
+}
+
+/*
+ * Register cd under "/cache" of the mount returned by rpc_get_mount().
+ * On success the mount obtained here stays held; on any failure it is
+ * released again.  Returns 0 or a negative error code.
+ */
+int nfs_cache_register(struct cache_detail *cd)
+{
+	struct vfsmount *mnt;
+	struct path path;
+	int ret;
+
+	mnt = rpc_get_mount();
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path);
+	if (ret == 0) {
+		ret = sunrpc_cache_register_pipefs(path.dentry, cd->name,
+						   0600, cd);
+		path_put(&path);
+	}
+
+	/* Lookup or registration failed: give the mount back */
+	if (ret != 0)
+		rpc_put_mount();
+	return ret;
+}
+
+/*
+ * Undo nfs_cache_register(): unregister cd and release the mount that
+ * registration left held.
+ */
+void nfs_cache_unregister(struct cache_detail *cd)
+{
+ sunrpc_cache_unregister_pipefs(cd);
+ rpc_put_mount();
+}
+
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 00000000000..7cf6cafcc00
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+/* Tracks one deferred cache request; declared in cache_lib.h, used by cache_lib.c */
+struct nfs_cache_defer_req {
+ /* cache_lib.c sets req.defer when allocating the request */
+ struct cache_req req;
+ /* cache_lib.c sets deferred_req.revisit when the request is deferred */
+ struct cache_deferred_req deferred_req;
+ /* waited on by nfs_cache_wait_for_upcall() */
+ struct completion completion;
+ /* reference count - presumably starts at 1 and the request is freed at 0; confirm in cache_lib.c */
+ atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register(struct cache_detail *cd);
+extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
new file mode 100644
index 00000000000..516f3375e06
--- /dev/null
+++ b/fs/nfs/callback.c
@@ -0,0 +1,403 @@
+/*
+ * linux/fs/nfs/callback.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback handling
+ */
+
+#include <linux/completion.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/nfs_fs.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <net/inet_sock.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+struct nfs_callback_data {
+ unsigned int users; /* nfs_callback_up() calls not yet balanced by nfs_callback_down() */
+ struct svc_serv *serv; /* service created by svc_create() in nfs_callback_up() */
+ struct svc_rqst *rqst; /* request context passed to the callback thread function */
+ struct task_struct *task; /* callback kthread; NULL while no thread is running */
+};
+
+static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
+static DEFINE_MUTEX(nfs_callback_mutex);
+static struct svc_program nfs4_callback_program;
+
+unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
+#define NFS_CALLBACK_MAXPORTNR (65535U)
+
+/*
+ * Setter for the "portnr" module parameter type.
+ *
+ * Parses @val as an unsigned number (base 0, i.e. chosen from the prefix)
+ * and stores it in the unsigned int that @kp->arg points to.
+ *
+ * Returns 0 on success, or -EINVAL if @val is NULL, cannot be parsed, or
+ * is larger than NFS_CALLBACK_MAXPORTNR.
+ */
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
+{
+ unsigned long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = strict_strtoul(val, 0, &num);
+ /*
+ * Reject every parse failure, not just -EINVAL: on any other error
+ * (e.g. -ERANGE) num is not guaranteed to have been written.
+ */
+ if (ret != 0 || num > NFS_CALLBACK_MAXPORTNR)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+static struct kernel_param_ops param_ops_portnr = {
+ .set = param_set_portnr,
+ .get = param_get_uint,
+};
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+
+/*
+ * This is the NFSv4 callback kernel thread.
+ *
+ * @vrqstp is the struct svc_rqst this thread services. Requests are
+ * received and processed until kthread_should_stop() is true; the function
+ * then returns 0.
+ */
+static int
+nfs4_callback_svc(void *vrqstp)
+{
+ struct svc_rqst *rqstp = vrqstp;
+ int last_err = 0;
+ int err;
+
+ set_freezable();
+
+ for (;;) {
+ if (kthread_should_stop())
+ break;
+
+ /* Listen for a request on the socket */
+ err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
+ if (err >= 0) {
+ last_err = err;
+ svc_process(rqstp);
+ continue;
+ }
+
+ /* -EAGAIN and -EINTR are not reported: simply go round again. */
+ if (err != -EAGAIN && err != -EINTR) {
+ /* Warn only when the error differs from the previous result. */
+ if (err != last_err)
+ printk(KERN_WARNING "%s: unexpected error "
+ "from svc_recv (%d)\n", __func__, err);
+ schedule_timeout_uninterruptible(HZ);
+ }
+ last_err = err;
+ }
+ return 0;
+}
+
+/*
+ * Prepare to bring up the NFSv4 callback service
+ *
+ * Creates a "tcp" transport for PF_INET and then for PF_INET6, storing the
+ * returned ports in nfs_callback_tcpport and nfs_callback_tcpport6. An
+ * -EAFNOSUPPORT result for PF_INET6 is tolerated. Returns the result of
+ * svc_prepare_thread() on success, or an ERR_PTR() if a transport could
+ * not be created.
+ */
+struct svc_rqst *
+nfs4_callback_up(struct svc_serv *serv)
+{
+ int port;
+
+ port = svc_create_xprt(serv, "tcp", &init_net, PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+ if (port <= 0)
+ return ERR_PTR(port ? port : -ENOMEM);
+ nfs_callback_tcpport = port;
+ dprintk("NFS: Callback listener port = %u (af %u)\n",
+ nfs_callback_tcpport, PF_INET);
+
+ port = svc_create_xprt(serv, "tcp", &init_net, PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+ if (port > 0) {
+ nfs_callback_tcpport6 = port;
+ dprintk("NFS: Callback listener port = %u (af %u)\n",
+ nfs_callback_tcpport6, PF_INET6);
+ } else if (port != -EAFNOSUPPORT) {
+ /* A zero result carries no errno, so it is reported as -ENOMEM. */
+ return ERR_PTR(port ? port : -ENOMEM);
+ }
+
+ return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * The callback service for NFSv4.1 callbacks
+ *
+ * Thread function; @vrqstp is this thread's struct svc_rqst. Until
+ * kthread_should_stop() is true it takes one request at a time off
+ * serv->sv_cb_list and hands it to bc_svc_process(), calling schedule()
+ * while the list is empty. Always returns 0.
+ */
+static int
+nfs41_callback_svc(void *vrqstp)
+{
+ struct svc_rqst *rqstp = vrqstp;
+ struct svc_serv *serv = rqstp->rq_server;
+ struct rpc_rqst *req;
+ int error;
+ DEFINE_WAIT(wq);
+
+ set_freezable();
+
+ while (!kthread_should_stop()) {
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); /* get on the wait queue before looking at the list */
+ spin_lock_bh(&serv->sv_cb_lock);
+ if (!list_empty(&serv->sv_cb_list)) {
+ req = list_first_entry(&serv->sv_cb_list,
+ struct rpc_rqst, rq_bc_list);
+ list_del(&req->rq_bc_list);
+ spin_unlock_bh(&serv->sv_cb_lock); /* the lock is dropped before the request is processed */
+ dprintk("Invoking bc_svc_process()\n");
+ error = bc_svc_process(serv, req, rqstp);
+ dprintk("bc_svc_process() returned w/ error code= %d\n",
+ error);
+ } else {
+ spin_unlock_bh(&serv->sv_cb_lock);
+ schedule(); /* nothing queued: give up the CPU */
+ }
+ finish_wait(&serv->sv_cb_waitq, &wq);
+ }
+ return 0;
+}
+
+/*
+ * Bring up the NFSv4.1 callback service
+ *
+ * Creates the "tcp-bc" transport on @serv, records @serv in @xprt->bc_serv,
+ * initialises the callback list, lock and wait queue of @serv, and
+ * prepares the service thread context. Returns the svc_rqst from
+ * svc_prepare_thread(), or an ERR_PTR() on failure.
+ */
+struct svc_rqst *
+nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
+{
+ struct svc_rqst *rqstp;
+ int ret;
+
+ /*
+ * Create an svc_sock for the back channel service that shares the
+ * fore channel connection.
+ * Returns the input port (0) and sets the svc_serv bc_xprt on success
+ */
+ ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0,
+ SVC_SOCK_ANONYMOUS);
+ if (ret < 0) {
+ rqstp = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * Save the svc_serv in the transport so that it can
+ * be referenced when the session backchannel is initialized
+ */
+ xprt->bc_serv = serv;
+
+ INIT_LIST_HEAD(&serv->sv_cb_list);
+ spin_lock_init(&serv->sv_cb_lock);
+ init_waitqueue_head(&serv->sv_cb_waitq);
+ rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+ if (IS_ERR(rqstp)) {
+ svc_xprt_put(serv->sv_bc_xprt); /* thread setup failed: release the back channel transport again */
+ serv->sv_bc_xprt = NULL;
+ }
+out:
+ dprintk("--> %s return %ld\n", __func__,
+ IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
+ return rqstp;
+}
+
+/*
+ * Minor-version specific callback setup (CONFIG_NFS_V4_1 build).
+ *
+ * For a non-zero @minorversion, stores the result of nfs41_callback_up()
+ * in *@rqstpp and nfs41_callback_svc in *@callback_svc. Returns
+ * @minorversion, so a zero return means nothing was set up here.
+ */
+static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
+ struct svc_serv *serv, struct rpc_xprt *xprt,
+ struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+ if (!minorversion)
+ return minorversion;
+
+ *rqstpp = nfs41_callback_up(serv, xprt);
+ *callback_svc = nfs41_callback_svc;
+ return minorversion;
+}
+
+/*
+ * For a non-zero @minorversion, point @xprt->bc_serv at the svc_serv
+ * recorded in @cb_info. Does nothing for minor version 0.
+ */
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+ struct nfs_callback_data *cb_info)
+{
+ if (minorversion == 0)
+ return;
+ xprt->bc_serv = cb_info->serv;
+}
+#else
+static inline int nfs_minorversion_callback_svc_setup(u32 minorversion,
+ struct svc_serv *serv, struct rpc_xprt *xprt,
+ struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
+{
+ return 0; /* built without CONFIG_NFS_V4_1: nothing set up, so nfs_callback_up() uses the v4.0 path */
+}
+
+static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
+ struct nfs_callback_data *cb_info)
+{ /* built without CONFIG_NFS_V4_1: intentionally empty */
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Bring up the callback thread if it is not already up.
+ *
+ * @minorversion selects the entry of nfs_callback_info to use and @xprt is
+ * the transport handed to the minor-version specific setup. Every
+ * successful call leaves cb_info->users incremented and is balanced by
+ * nfs_callback_down(). Returns 0 on success or a negative errno.
+ */
+int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+{
+ struct svc_serv *serv = NULL;
+ struct svc_rqst *rqstp;
+ int (*callback_svc)(void *vrqstp);
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+ int ret = 0;
+ int minorversion_setup;
+
+ mutex_lock(&nfs_callback_mutex);
+ /* Already in use: just count the new user and hook up the transport. */
+ if (cb_info->users++ || cb_info->task != NULL) {
+ nfs_callback_bc_serv(minorversion, xprt, cb_info);
+ goto out;
+ }
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+ if (!serv) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion,
+ serv, xprt, &rqstp, &callback_svc);
+ if (!minorversion_setup) {
+ /* v4.0 callback setup */
+ rqstp = nfs4_callback_up(serv);
+ callback_svc = nfs4_callback_svc;
+ }
+
+ if (IS_ERR(rqstp)) {
+ ret = PTR_ERR(rqstp);
+ goto out_err;
+ }
+
+ svc_sock_update_bufs(serv);
+
+ cb_info->serv = serv;
+ cb_info->rqst = rqstp;
+ /*
+ * Let kthread_run() format the thread name itself instead of
+ * sprintf()ing into a fixed-size stack buffer and then passing that
+ * buffer as the format string.
+ */
+ cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+ "nfsv4.%u-svc", minorversion);
+ if (IS_ERR(cb_info->task)) {
+ ret = PTR_ERR(cb_info->task);
+ svc_exit_thread(cb_info->rqst);
+ /* Leave no stale pointers behind, as nfs_callback_down() does. */
+ cb_info->serv = NULL;
+ cb_info->rqst = NULL;
+ cb_info->task = NULL;
+ goto out_err;
+ }
+out:
+ /*
+ * svc_create creates the svc_serv with sv_nrthreads == 1, and then
+ * svc_prepare_thread increments that. So we need to call svc_destroy
+ * on both success and failure so that the refcount is 1 when the
+ * thread exits.
+ */
+ if (serv)
+ svc_destroy(serv);
+ mutex_unlock(&nfs_callback_mutex);
+ return ret;
+out_err:
+ dprintk("NFS: Couldn't create callback socket or server thread; "
+ "err = %d\n", ret);
+ /* Undo the users++ taken at the top; the caller holds no reference. */
+ cb_info->users--;
+ goto out;
+}
+
+/*
+ * Kill the callback thread if it's no longer being used.
+ *
+ * Drops one "users" reference for @minorversion. When the count reaches
+ * zero and a thread exists, the thread is stopped and the per-version
+ * state in nfs_callback_info is cleared.
+ */
+void nfs_callback_down(int minorversion)
+{
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+
+ mutex_lock(&nfs_callback_mutex);
+ cb_info->users--;
+ if (cb_info->users == 0 && cb_info->task != NULL) {
+ kthread_stop(cb_info->task); /* stop the thread before releasing its svc_rqst */
+ svc_exit_thread(cb_info->rqst);
+ cb_info->serv = NULL;
+ cb_info->rqst = NULL;
+ cb_info->task = NULL;
+ }
+ mutex_unlock(&nfs_callback_mutex);
+}
+
+/*
+ * Boolean check of RPC_AUTH_GSS principal
+ *
+ * Returns 1 if @rqstp did not use RPC_AUTH_GSS, or if it did and the GSS
+ * principal is "nfs@" followed by the server name stored in @clp's RPC
+ * client. Returns 0 otherwise, including whenever clp->cl_minorversion is
+ * non-zero or no principal is available.
+ */
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
+{
+ struct rpc_clnt *r = clp->cl_rpcclient;
+ char *p = svc_gss_principal(rqstp);
+
+ if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+ return 1;
+
+ /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
+ if (clp->cl_minorversion != 0)
+ return 0;
+ /*
+ * It might just be a normal user principal, in which case
+ * userspace won't bother to tell us the name at all.
+ */
+ if (p == NULL)
+ return 0;
+
+ /*
+ * Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname".
+ * strncmp() rather than memcmp(): it stops at the terminating NUL, so
+ * a principal shorter than the prefix is never read past its end.
+ */
+ if (strncmp(p, "nfs@", 4) != 0)
+ return 0;
+ p += 4;
+ if (strcmp(p, r->cl_server) != 0)
+ return 0;
+ return 1;
+}
+
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Drop packets with incorrect authflavor.
+ *
+ * All other checking done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ *
+ * Returns SVC_DROP for an RPC_AUTH_NULL request to any procedure other
+ * than CB_NULL and for an RPC_AUTH_GSS request on the back channel;
+ * SVC_OK in every other case.
+ */
+static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+{
+ /* Without authentication only the NULL procedure is accepted. */
+ if (rqstp->rq_authop->flavour == RPC_AUTH_NULL &&
+ rqstp->rq_proc != CB_NULL)
+ return SVC_DROP;
+
+ /* No RPC_AUTH_GSS support yet in NFSv4.1 */
+ if (rqstp->rq_authop->flavour == RPC_AUTH_GSS &&
+ svc_is_backchannel(rqstp))
+ return SVC_DROP;
+
+ return SVC_OK;
+}
+
+/*
+ * Define NFS4 callback program
+ */
+static struct svc_version *nfs4_callback_version[] = {
+ [1] = &nfs4_callback_version1, /* presumably indexed by RPC version number - confirm; unlisted slots are NULL */
+ [4] = &nfs4_callback_version4,
+};
+
+static struct svc_stat nfs4_callback_stats;
+
+static struct svc_program nfs4_callback_program = {
+ .pg_prog = NFS4_CALLBACK, /* RPC service number */
+ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */
+ .pg_vers = nfs4_callback_version, /* version table */
+ .pg_name = "NFSv4 callback", /* service name */
+ .pg_class = "nfs", /* authentication class */
+ .pg_stats = &nfs4_callback_stats, /* statistics block for this program */
+ .pg_authenticate = nfs_callback_authenticate, /* per-request auth flavour filter */
+};
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
new file mode 100644
index 00000000000..c89d3b9e483
--- /dev/null
+++ b/fs/nfs/callback.h
@@ -0,0 +1,213 @@
+/*
+ * linux/fs/nfs/callback.h
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback definitions
+ */
+#ifndef __LINUX_FS_NFS_CALLBACK_H
+#define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
+
+#define NFS4_CALLBACK 0x40000000
+#define NFS4_CALLBACK_XDRSIZE 2048
+#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
+
+enum nfs4_callback_procnum {
+ CB_NULL = 0, /* the only procedure nfs_callback_authenticate() accepts under RPC_AUTH_NULL */
+ CB_COMPOUND = 1,
+};
+
+enum nfs4_callback_opnum {
+ OP_CB_GETATTR = 3, /* handled by nfs4_callback_getattr() */
+ OP_CB_RECALL = 4, /* handled by nfs4_callback_recall() */
+/* Callback operations new to NFSv4.1 */
+ OP_CB_LAYOUTRECALL = 5,
+ OP_CB_NOTIFY = 6,
+ OP_CB_PUSH_DELEG = 7,
+ OP_CB_RECALL_ANY = 8,
+ OP_CB_RECALLABLE_OBJ_AVAIL = 9,
+ OP_CB_RECALL_SLOT = 10,
+ OP_CB_SEQUENCE = 11,
+ OP_CB_WANTS_CANCELLED = 12,
+ OP_CB_NOTIFY_LOCK = 13,
+ OP_CB_NOTIFY_DEVICEID = 14,
+ OP_CB_ILLEGAL = 10044,
+};
+
+struct cb_process_state {
+ __be32 drc_status; /* NFS4ERR_RETRY_UNCACHED_REP status stashed by nfs4_callback_sequence() */
+ struct nfs_client *clp; /* client for this compound; op handlers fail with NFS4ERR_OP_NOT_IN_SESSION when NULL */
+ int slotid; /* slot id copied from the CB_SEQUENCE arguments */
+};
+
+struct cb_compound_hdr_arg {
+ unsigned int taglen; /* presumably the length of tag in bytes - confirm */
+ const char *tag;
+ unsigned int minorversion;
+ unsigned int cb_ident; /* v4.0 callback identifier */
+ unsigned nops; /* presumably the number of operations in the compound - confirm */
+};
+
+struct cb_compound_hdr_res {
+ __be32 *status; /* pointer, not value: presumably into the reply buffer - confirm */
+ unsigned int taglen;
+ const char *tag;
+ __be32 *nops; /* pointer, not value: presumably into the reply buffer - confirm */
+};
+
+struct cb_getattrargs {
+ struct sockaddr *addr;
+ struct nfs_fh fh; /* file handle used to look up the delegated inode */
+ uint32_t bitmap[2]; /* requested attributes; masked against the supported set in the reply */
+};
+
+struct cb_getattrres {
+ __be32 status; /* result code in network byte order; 0 on success */
+ uint32_t bitmap[2]; /* attributes actually returned */
+ uint64_t size; /* from i_size_read() on the inode */
+ uint64_t change_attr; /* delegation's change attribute, incremented when nfsi->npages != 0 */
+ struct timespec ctime; /* copied from inode->i_ctime */
+ struct timespec mtime; /* copied from inode->i_mtime */
+};
+
+struct cb_recallargs {
+ struct sockaddr *addr;
+ struct nfs_fh fh; /* file handle used to look up the delegated inode */
+ nfs4_stateid stateid; /* passed to nfs_async_inode_return_delegation() */
+ uint32_t truncate;
+};
+
+#if defined(CONFIG_NFS_V4_1)
+
+struct referring_call {
+ uint32_t rc_sequenceid; /* compared with the seq_nr of fore channel slot rc_slotid */
+ uint32_t rc_slotid; /* index into the fore channel slot table */
+};
+
+struct referring_call_list {
+ struct nfs4_sessionid rcl_sessionid; /* lists for other sessions are skipped by referring_call_exists() */
+ uint32_t rcl_nrefcalls; /* number of entries in rcl_refcalls */
+ struct referring_call *rcl_refcalls; /* kfree()d by nfs4_callback_sequence() */
+};
+
+struct cb_sequenceargs {
+ struct sockaddr *csa_addr; /* used together with csa_sessionid to find the nfs_client */
+ struct nfs4_sessionid csa_sessionid;
+ uint32_t csa_sequenceid; /* checked against the slot's seq_nr by validate_seqid() */
+ uint32_t csa_slotid; /* must be below NFS41_BC_MAX_CALLBACKS */
+ uint32_t csa_highestslotid;
+ uint32_t csa_cachethis; /* selects the error returned for a replayed request */
+ uint32_t csa_nrclists; /* number of entries in csa_rclists */
+ struct referring_call_list *csa_rclists; /* kfree()d by nfs4_callback_sequence() */
+};
+
+struct cb_sequenceres {
+ __be32 csr_status; /* network byte order */
+ struct nfs4_sessionid csr_sessionid; /* echoed from the arguments */
+ uint32_t csr_sequenceid; /* echoed from the arguments */
+ uint32_t csr_slotid; /* echoed from the arguments */
+ uint32_t csr_highestslotid; /* set to NFS41_BC_MAX_CALLBACKS - 1 */
+ uint32_t csr_target_highestslotid; /* set to NFS41_BC_MAX_CALLBACKS - 1 */
+};
+
+extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
+ struct cb_sequenceres *res,
+ struct cb_process_state *cps);
+
+extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
+ const nfs4_stateid *stateid);
+
+#define RCA4_TYPE_MASK_RDATA_DLG 0
+#define RCA4_TYPE_MASK_WDATA_DLG 1
+#define RCA4_TYPE_MASK_DIR_DLG 2
+#define RCA4_TYPE_MASK_FILE_LAYOUT 3
+#define RCA4_TYPE_MASK_BLK_LAYOUT 4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define RCA4_TYPE_MASK_ALL 0xf31f
+
+struct cb_recallanyargs {
+ struct sockaddr *craa_addr;
+ uint32_t craa_objs_to_keep;
+ uint32_t craa_type_mask; /* bits numbered by RCA4_TYPE_MASK_*; must lie within RCA4_TYPE_MASK_ALL */
+};
+
+extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
+ void *dummy,
+ struct cb_process_state *cps);
+
+struct cb_recallslotargs {
+ struct sockaddr *crsa_addr;
+ uint32_t crsa_target_max_slots; /* accepted only in the range 1..fore channel max_slots */
+};
+extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
+ void *dummy,
+ struct cb_process_state *cps);
+
+struct cb_layoutrecallargs {
+ struct sockaddr *cbl_addr;
+ uint32_t cbl_recall_type; /* RETURN_FILE selects per-file draining, anything else bulk draining */
+ uint32_t cbl_layout_type;
+ uint32_t cbl_layoutchanged;
+ union {
+ struct { /* read by initiate_file_draining() (RETURN_FILE) */
+ struct nfs_fh cbl_fh;
+ struct pnfs_layout_range cbl_range;
+ nfs4_stateid cbl_stateid;
+ };
+ struct nfs_fsid cbl_fsid; /* compared with server->fsid for RETURN_FSID */
+ };
+};
+
+extern __be32 nfs4_callback_layoutrecall(
+ struct cb_layoutrecallargs *args,
+ void *dummy, struct cb_process_state *cps);
+
+extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
+
+struct cb_devicenotifyitem {
+ uint32_t cbd_notify_type; /* NOTIFY_DEVICEID4_CHANGE is handled as a delete */
+ uint32_t cbd_layout_type; /* matched against server->pnfs_curr_ld->id */
+ struct nfs4_deviceid cbd_dev_id; /* device id passed to nfs4_delete_deviceid() */
+ uint32_t cbd_immediate;
+};
+
+struct cb_devicenotifyargs {
+ int ndevs; /* number of entries in devs */
+ struct cb_devicenotifyitem *devs; /* kfree()d by nfs4_callback_devicenotify() */
+};
+
+extern __be32 nfs4_callback_devicenotify(
+ struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps);
+
+#endif /* CONFIG_NFS_V4_1 */
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+ struct cb_getattrres *res,
+ struct cb_process_state *cps);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+ struct cb_process_state *cps);
+#ifdef CONFIG_NFS_V4
+extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
+extern void nfs_callback_down(int minorversion);
+extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
+ const nfs4_stateid *stateid);
+extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
+#endif /* CONFIG_NFS_V4 */
+/*
+ * nfs41: Callbacks are expected to not cause substantial latency,
+ * so we limit their concurrency to 1 by setting up the maximum number
+ * of slots for the backchannel.
+ */
+#define NFS41_BC_MIN_CALLBACKS 1
+#define NFS41_BC_MAX_CALLBACKS 1
+
+extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
+
+#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
new file mode 100644
index 00000000000..54cea8ad5a7
--- /dev/null
+++ b/fs/nfs/callback_proc.c
@@ -0,0 +1,576 @@
+/*
+ * linux/fs/nfs/callback_proc.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback procedures
+ */
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#ifdef NFS_DEBUG
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+#endif
+
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+ struct cb_getattrres *res,
+ struct cb_process_state *cps)
+{ /* fills *res from the write-delegated inode matching args->fh; returns res->status */
+ struct nfs_delegation *delegation;
+ struct nfs_inode *nfsi;
+ struct inode *inode;
+
+ res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+ goto out;
+
+ res->bitmap[0] = res->bitmap[1] = 0;
+ res->status = htonl(NFS4ERR_BADHANDLE); /* reported when no inode or no write delegation is found */
+
+ dprintk("NFS: GETATTR callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+ if (inode == NULL)
+ goto out;
+ nfsi = NFS_I(inode);
+ rcu_read_lock(); /* the delegation pointer is read under RCU */
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) /* only answer for a write delegation */
+ goto out_iput;
+ res->size = i_size_read(inode);
+ res->change_attr = delegation->change_attr;
+ if (nfsi->npages != 0) /* bump the reported change attribute while npages is non-zero */
+ res->change_attr++;
+ res->ctime = inode->i_ctime;
+ res->mtime = inode->i_mtime;
+ res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
+ args->bitmap[0]; /* return only the supported attributes that were requested */
+ res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
+ args->bitmap[1];
+ res->status = 0;
+out_iput:
+ rcu_read_unlock();
+ iput(inode); /* drop the inode reference released on every path past the lookup */
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
+ return res->status;
+}
+
+/*
+ * Handle a delegation recall for the file identified by args->fh.
+ *
+ * Returns, in network byte order: 0 if the asynchronous delegation return
+ * was started, NFS4ERR_OP_NOT_IN_SESSION if @cps has no client,
+ * NFS4ERR_BADHANDLE if no delegated inode matches the file handle,
+ * NFS4ERR_BAD_STATEID if the return attempt reports -ENOENT, and
+ * NFS4ERR_RESOURCE for any other failure.
+ */
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
+ struct cb_process_state *cps)
+{
+ struct inode *inode;
+ __be32 res;
+ int err;
+
+ res = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
+ goto out;
+
+ dprintk("NFS: RECALL callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ res = htonl(NFS4ERR_BADHANDLE);
+ inode = nfs_delegation_find_inode(cps->clp, &args->fh);
+ if (inode == NULL)
+ goto out;
+
+ /* Set up a helper thread to actually return the delegation */
+ err = nfs_async_inode_return_delegation(inode, &args->stateid);
+ if (err == 0) {
+ res = 0;
+ } else if (err == -ENOENT) {
+ if (res != 0)
+ res = htonl(NFS4ERR_BAD_STATEID);
+ } else {
+ res = htonl(NFS4ERR_RESOURCE);
+ }
+ iput(inode);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
+ return res;
+}
+
+/*
+ * Returns 1 if @delegation is non-NULL and its stateid data is
+ * byte-for-byte equal to that of @stateid, 0 otherwise.
+ */
+int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
+{
+ if (delegation == NULL)
+ return 0;
+ return memcmp(delegation->stateid.data, stateid->data,
+ sizeof(delegation->stateid.data)) == 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+static u32 initiate_file_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{ /* recall the layout of the file matching args->cbl_fh; returns an NFS4ERR_* value in host byte order */
+ struct nfs_server *server;
+ struct pnfs_layout_hdr *lo;
+ struct inode *ino;
+ bool found = false;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ LIST_HEAD(free_me_list);
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (nfs_compare_fh(&args->cbl_fh,
+ &NFS_I(lo->plh_inode)->fh)) /* non-zero means the handles differ */
+ continue;
+ ino = igrab(lo->plh_inode); /* take an inode reference; skip the layout if that fails */
+ if (!ino)
+ continue;
+ found = true;
+ /* Without this, layout can be freed as soon
+ * as we release cl_lock.
+ */
+ get_layout_hdr(lo);
+ break;
+ }
+ if (found)
+ break;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ if (!found)
+ return NFS4ERR_NOMATCHING_LAYOUT;
+
+ spin_lock(&ino->i_lock);
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ mark_matching_lsegs_invalid(lo, &free_me_list,
+ &args->cbl_range))
+ rv = NFS4ERR_DELAY; /* bulk recall flagged, or mark_matching_lsegs_invalid() returned non-zero */
+ else
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list); /* called after i_lock has been dropped */
+ put_layout_hdr(lo); /* balances get_layout_hdr() above */
+ iput(ino); /* balances igrab() above */
+ return rv;
+}
+
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{ /* recall every layout of the client, or of one fsid for RETURN_FSID; returns an NFS4ERR_* value in host byte order */
+ struct nfs_server *server;
+ struct pnfs_layout_hdr *lo;
+ struct inode *ino;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ struct pnfs_layout_hdr *tmp;
+ LIST_HEAD(recall_list);
+ LIST_HEAD(free_me_list);
+ struct pnfs_layout_range range = { /* covers the whole file, any iomode */
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if ((args->cbl_recall_type == RETURN_FSID) &&
+ memcmp(&server->fsid, &args->cbl_fsid,
+ sizeof(struct nfs_fsid))) /* RETURN_FSID: skip servers with a different fsid */
+ continue;
+
+ list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (!igrab(lo->plh_inode)) /* inode reference released by iput() in the second loop */
+ continue;
+ get_layout_hdr(lo);
+ BUG_ON(!list_empty(&lo->plh_bulk_recall));
+ list_add(&lo->plh_bulk_recall, &recall_list); /* collect now, process after the locks are dropped */
+ }
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+
+ list_for_each_entry_safe(lo, tmp,
+ &recall_list, plh_bulk_recall) {
+ ino = lo->plh_inode;
+ spin_lock(&ino->i_lock);
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
+ rv = NFS4ERR_DELAY; /* a single non-zero result makes the whole recall return DELAY */
+ list_del_init(&lo->plh_bulk_recall);
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
+ put_layout_hdr(lo);
+ iput(ino);
+ }
+ return rv;
+}
+
+/*
+ * Run one layout recall for @clp, using the NFS4CLNT_LAYOUTRECALL bit in
+ * clp->cl_state to allow only one at a time.
+ *
+ * Returns NFS4ERR_DELAY if a recall is already in progress, otherwise the
+ * result of the file or bulk draining helper (host byte order).
+ */
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ u32 res = NFS4ERR_DELAY;
+
+ dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+ if (!test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) {
+ res = (args->cbl_recall_type == RETURN_FILE) ?
+ initiate_file_draining(clp, args) :
+ initiate_bulk_draining(clp, args);
+ clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
+ }
+ dprintk("%s returning %i\n", __func__, res);
+ return res;
+}
+
+/*
+ * CB_LAYOUTRECALL entry point.
+ *
+ * Returns the result of do_callback_layoutrecall() converted to network
+ * byte order, or NFS4ERR_OP_NOT_IN_SESSION if @cps has no client.
+ */
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ u32 res;
+
+ dprintk("%s: -->\n", __func__);
+
+ res = cps->clp ? do_callback_layoutrecall(cps->clp, args)
+ : NFS4ERR_OP_NOT_IN_SESSION;
+
+ dprintk("%s: exit with status = %d\n", __func__, res);
+ return cpu_to_be32(res);
+}
+
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{ /* used by nfs4_callback_recallany() when the file-layout bit is set in the type mask */
+ struct cb_layoutrecallargs args;
+
+ /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+ memset(&args, 0, sizeof(args));
+ args.cbl_recall_type = RETURN_ALL; /* not RETURN_FILE, so the bulk draining path is taken */
+ /* FIXME we ignore errors, what should we do? */
+ do_callback_layoutrecall(clp, &args);
+}
+
+/*
+ * Handle a device id notification: delete the cached device id named by
+ * each entry of args->devs.
+ *
+ * For every entry an nfs_server of @cps->clp whose layout driver id equals
+ * the entry's layout type is looked up; entries with no such server are
+ * skipped. Returns NFS4ERR_OP_NOT_IN_SESSION (network byte order) if @cps
+ * has no client and 0 otherwise. args->devs is freed on every path.
+ */
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ int i;
+ __be32 res = 0;
+ struct nfs_client *clp = cps->clp;
+ struct nfs_server *server = NULL;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (!clp) {
+ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+ goto out;
+ }
+
+ for (i = 0; i < args->ndevs; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ /* Reuse the server found for the previous entry if it still matches. */
+ if (!server ||
+ server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ if (server->pnfs_curr_ld &&
+ server->pnfs_curr_ld->id == dev->cbd_layout_type) {
+ rcu_read_unlock();
+ goto found;
+ }
+ rcu_read_unlock();
+ dprintk("%s: layout type %u not found\n",
+ __func__, dev->cbd_layout_type);
+ /*
+ * After a complete traversal the cursor does not point at
+ * a real nfs_server. Reset it so the next entry searches
+ * again instead of dereferencing it.
+ */
+ server = NULL;
+ continue;
+ }
+
+ found:
+ if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+ dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+ "deleting instead\n", __func__);
+ nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+ }
+
+out:
+ kfree(args->devs);
+ dprintk("%s: exit with status = %u\n",
+ __func__, be32_to_cpu(res));
+ return res;
+}
+
+/*
+ * NFSv4.1 variant of the delegation stateid check.
+ *
+ * Returns 1 only if @delegation is non-NULL, the seqid of @stateid is zero
+ * and the "other" part of both stateids is identical; 0 otherwise.
+ */
+int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
+{
+ if (delegation == NULL || stateid->stateid.seqid != 0)
+ return 0;
+
+ return memcmp(&delegation->stateid.stateid.other,
+ &stateid->stateid.other,
+ NFS4_STATEID_OTHER_SIZE) == 0;
+}
+
+/*
+ * Validate the sequenceID sent by the server.
+ * Return success if the sequenceID is one more than what we last saw on
+ * this slot, accounting for wraparound. Increments the slot's sequence.
+ *
+ * We don't yet implement a duplicate request cache, instead we set the
+ * back channel ca_maxresponsesize_cached to zero. This is OK for now
+ * since we only currently implement idempotent callbacks anyway.
+ *
+ * We have a single slot backchannel at this time, so we don't bother
+ * checking the used_slots bit array on the table. The lower layer guarantees
+ * a single outstanding callback request at a time.
+ *
+ * Called by nfs4_callback_sequence() with tbl->slot_tbl_lock held. The
+ * result is in network byte order: NFS4_OK, NFS4ERR_BADSLOT,
+ * NFS4ERR_RETRY_UNCACHED_REP, NFS4ERR_REP_TOO_BIG_TO_CACHE or
+ * NFS4ERR_SEQ_MISORDERED.
+ */
+static __be32
+validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
+{
+ struct nfs4_slot *slot;
+
+ dprintk("%s enter. slotid %d seqid %d\n",
+ __func__, args->csa_slotid, args->csa_sequenceid);
+
+ if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) /* bounds check before indexing tbl->slots */
+ return htonl(NFS4ERR_BADSLOT);
+
+ slot = tbl->slots + args->csa_slotid;
+ dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
+
+ /* Normal */
+ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
+ slot->seq_nr++;
+ goto out_ok;
+ }
+
+ /* Replay */
+ if (args->csa_sequenceid == slot->seq_nr) {
+ dprintk("%s seqid %d is a replay\n",
+ __func__, args->csa_sequenceid);
+ /* Signal process_op to set this error on next op */
+ if (args->csa_cachethis == 0)
+ return htonl(NFS4ERR_RETRY_UNCACHED_REP);
+
+ /* The ca_maxresponsesize_cached is 0 with no DRC */
+ else if (args->csa_cachethis == 1)
+ return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ } /* any other csa_cachethis value falls through to the checks below */
+
+ /* Wraparound */
+ if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
+ slot->seq_nr = 1;
+ goto out_ok;
+ }
+
+ /* Misordered request */
+ return htonl(NFS4ERR_SEQ_MISORDERED);
+out_ok:
+ tbl->highest_used_slotid = args->csa_slotid;
+ return htonl(NFS4_OK);
+}
+
+/*
+ * For each referring call triple, check the session's slot table for
+ * a match. If the slot is in use and the sequence numbers match, the
+ * client is still waiting for a response to the original request.
+ *
+ * @rclists holds @nrclists lists; lists whose session id differs from
+ * @clp's session are skipped. Returns true as soon as one referring call
+ * matches an in-use fore channel slot, false if none does. A referring
+ * call whose slot id lies outside the slot table never matches.
+ */
+static bool referring_call_exists(struct nfs_client *clp,
+ uint32_t nrclists,
+ struct referring_call_list *rclists)
+{
+ bool status = 0;
+ int i, j;
+ struct nfs4_session *session;
+ struct nfs4_slot_table *tbl;
+ struct referring_call_list *rclist;
+ struct referring_call *ref;
+
+ /*
+ * XXX When client trunking is implemented, this becomes
+ * a session lookup from within the loop
+ */
+ session = clp->cl_session;
+ tbl = &session->fc_slot_table;
+
+ for (i = 0; i < nrclists; i++) {
+ rclist = &rclists[i];
+ if (memcmp(session->sess_id.data,
+ rclist->rcl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN) != 0)
+ continue;
+
+ for (j = 0; j < rclist->rcl_nrefcalls; j++) {
+ ref = &rclist->rcl_refcalls[j];
+
+ dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
+ "slotid %u\n", __func__,
+ ((u32 *)&rclist->rcl_sessionid.data)[0],
+ ((u32 *)&rclist->rcl_sessionid.data)[1],
+ ((u32 *)&rclist->rcl_sessionid.data)[2],
+ ((u32 *)&rclist->rcl_sessionid.data)[3],
+ ref->rc_sequenceid, ref->rc_slotid);
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /*
+ * rc_slotid is supplied by the server: only index
+ * used_slots and slots[] once it is known to lie inside
+ * the table.
+ */
+ status = (ref->rc_slotid < tbl->max_slots &&
+ test_bit(ref->rc_slotid, tbl->used_slots) &&
+ tbl->slots[ref->rc_slotid].seq_nr ==
+ ref->rc_sequenceid);
+ spin_unlock(&tbl->slot_tbl_lock);
+ if (status)
+ goto out;
+ }
+ }
+
+out:
+ return status;
+}
+
+__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
+ struct cb_sequenceres *res,
+ struct cb_process_state *cps)
+{ /* CB_SEQUENCE handler: finds the client for the session, validates the slot sequence and fills *res */
+ struct nfs4_slot_table *tbl;
+ struct nfs_client *clp;
+ int i;
+ __be32 status = htonl(NFS4ERR_BADSESSION);
+
+ clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
+ if (clp == NULL)
+ goto out;
+
+ tbl = &clp->cl_session->bc_slot_table;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /* state manager is resetting the session */
+ if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+ spin_unlock(&tbl->slot_tbl_lock);
+ status = htonl(NFS4ERR_DELAY);
+ /* Return NFS4ERR_BADSESSION if we're draining the session
+ * in order to reset it.
+ */
+ if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+ status = htonl(NFS4ERR_BADSESSION);
+ goto out;
+ }
+
+ status = validate_seqid(&clp->cl_session->bc_slot_table, args); /* same table as tbl, whose lock is held */
+ spin_unlock(&tbl->slot_tbl_lock);
+ if (status)
+ goto out;
+
+ cps->slotid = args->csa_slotid;
+
+ /*
+ * Check for pending referring calls. If a match is found, a
+ * related callback was received before the response to the original
+ * call.
+ */
+ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out;
+ }
+
+ memcpy(&res->csr_sessionid, &args->csa_sessionid,
+ sizeof(res->csr_sessionid));
+ res->csr_sequenceid = args->csa_sequenceid;
+ res->csr_slotid = args->csa_slotid;
+ res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+out:
+ cps->clp = clp; /* put in nfs4_callback_compound */
+ for (i = 0; i < args->csa_nrclists; i++)
+ kfree(args->csa_rclists[i].rcl_refcalls); /* the referring call lists are released on every path */
+ kfree(args->csa_rclists);
+
+ if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+ cps->drc_status = status; /* stash the error in cps and report success for this op */
+ status = 0;
+ } else
+ res->csr_status = status;
+
+ dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
+ ntohl(status), ntohl(res->csr_status));
+ return status;
+}
+
+/* Returns true if @mask has no bits set outside RCA4_TYPE_MASK_ALL. */
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+ unsigned long unknown_bits = mask & ~RCA4_TYPE_MASK_ALL;
+
+ return !unknown_bits;
+}
+
+/*
+ * Handle CB_RECALL_ANY.
+ *
+ * Depending on the bits set in args->craa_type_mask, expires read and/or
+ * write delegations of the client and recalls all of its layouts.
+ *
+ * Returns, in network byte order, NFS4ERR_OP_NOT_IN_SESSION if @cps has no
+ * client, NFS4ERR_INVAL if the mask has bits outside RCA4_TYPE_MASK_ALL,
+ * and NFS4_OK otherwise.
+ */
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
+ struct cb_process_state *cps)
+{
+ __be32 status;
+ fmode_t flags = 0;
+
+ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* set in cb_sequence */
+ goto out;
+
+ dprintk("NFS: RECALL_ANY callback request from %s\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+ status = cpu_to_be32(NFS4ERR_INVAL);
+ if (!validate_bitmap_values(args->craa_type_mask))
+ goto out;
+
+ status = cpu_to_be32(NFS4_OK);
+ /*
+ * The RCA4_TYPE_MASK_* values are bit numbers. Test them directly on
+ * the 32-bit mask: test_bit() through an unsigned long cast reads
+ * beyond the field on 64-bit and looks at the wrong bits on
+ * big-endian.
+ */
+ if (args->craa_type_mask & (1U << RCA4_TYPE_MASK_RDATA_DLG))
+ flags = FMODE_READ;
+ if (args->craa_type_mask & (1U << RCA4_TYPE_MASK_WDATA_DLG))
+ flags |= FMODE_WRITE;
+ if (args->craa_type_mask & (1U << RCA4_TYPE_MASK_FILE_LAYOUT))
+ pnfs_recall_all_layouts(cps->clp);
+ if (flags)
+ nfs_expire_all_delegation_types(cps->clp, flags);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
+/*
+ * Reduce the fore channel's max_slots to the target value
+ *
+ * Returns, in network byte order, NFS4ERR_OP_NOT_IN_SESSION if @cps has no
+ * client, NFS4ERR_BAD_HIGH_SLOT if the target is below 1 or above the
+ * current max_slots, and NFS4_OK otherwise. Nothing is changed when the
+ * target already equals max_slots.
+ */
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
+ struct cb_process_state *cps)
+{
+ struct nfs4_slot_table *fc_tbl;
+ __be32 status;
+
+ status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ if (!cps->clp) /* set in cb_sequence */
+ goto out;
+
+ dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
+ rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+ args->crsa_target_max_slots);
+
+ fc_tbl = &cps->clp->cl_session->fc_slot_table;
+
+ if (args->crsa_target_max_slots > fc_tbl->max_slots ||
+ args->crsa_target_max_slots < 1) {
+ status = htonl(NFS4ERR_BAD_HIGH_SLOT);
+ } else {
+ status = htonl(NFS4_OK);
+ if (args->crsa_target_max_slots != fc_tbl->max_slots) {
+ fc_tbl->target_max_slots = args->crsa_target_max_slots;
+ nfs41_handle_recall_slot(cps->clp);
+ }
+ }
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
new file mode 100644
index 00000000000..2b2d33b33d0
--- /dev/null
+++ b/fs/nfs/callback_xdr.c
@@ -0,0 +1,997 @@
+/*
+ * linux/fs/nfs/callback_xdr.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFSv4 callback encode/decode procedures
+ */
+#include <linux/kernel.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "internal.h"
+
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4)
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ 2 + 2 + 3 + 3)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+
+#if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ 4 + 1 + 3)
+#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#endif /* CONFIG_NFS_V4_1 */
+
+#define NFSDBG_FACILITY NFSDBG_CALLBACK
+
+/* Internal error code */
+#define NFS4ERR_RESOURCE_HDR 11050
+
+/*
+ * Per-operation handler signatures. The two void pointers are the decoded
+ * argument and result buffers; concrete handlers are cast to these types
+ * when callback_ops[] is populated.
+ */
+typedef __be32 (*callback_process_op_t)(void *, void *,
+ struct cb_process_state *);
+typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
+typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
+
+
+/* One entry of the callback operation dispatch table, indexed by op number. */
+struct callback_op {
+ callback_process_op_t process_op; /* executes the operation */
+ callback_decode_arg_t decode_args; /* decodes the operation arguments */
+ callback_encode_res_t encode_res; /* optional: process_op() skips it when NULL */
+ long res_maxsize; /* CB_OP_*_RES_MAXSZ value for the op */
+};
+
+static struct callback_op callback_ops[];
+
+/* CB_NULL procedure: does nothing and always reports NFS4_OK. */
+static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+ return htonl(NFS4_OK);
+}
+
+/* Argument decoder for procedures without arguments: only runs the size check. */
+static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+ return xdr_argsize_check(rqstp, p);
+}
+
+/* Result encoder that adds nothing itself: only runs the size check. */
+static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+ return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * Pull the next @nbytes of the argument stream via xdr_inline_decode().
+ * Logs a warning and returns NULL when the stream cannot supply them.
+ */
+static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
+{
+	__be32 *p = xdr_inline_decode(xdr, nbytes);
+
+	if (unlikely(!p))
+		printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n");
+	return p;
+}
+
+/*
+ * Decode a counted string. On success *len holds the length and *str points
+ * directly into the receive buffer (no copy is made), or is NULL when the
+ * length is zero. Returns NFS4ERR_RESOURCE if the stream is too short.
+ */
+static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
+{
+	__be32 *p = read_buf(xdr, 4);
+
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+	*len = ntohl(*p);
+
+	if (*len == 0) {
+		*str = NULL;
+		return 0;
+	}
+
+	p = read_buf(xdr, *len);
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+	*str = (const char *)p;
+	return 0;
+}
+
+/*
+ * Decode a file handle into @fh, zero-padding the unused tail of fh->data.
+ *
+ * Returns NFS4ERR_RESOURCE if the stream is too short and NFS4ERR_BADHANDLE
+ * if the advertised length exceeds NFS4_FHSIZE.
+ */
+static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *p;
+	uint32_t size;
+
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	/*
+	 * Range-check the full 32-bit wire value before storing it: fh->size
+	 * is declared outside this file and may be narrower than 32 bits
+	 * (NOTE(review): confirm), in which case an oversized length could be
+	 * truncated to something that slips past the NFS4_FHSIZE check.
+	 */
+	size = ntohl(*p);
+	if (size > NFS4_FHSIZE)
+		return htonl(NFS4ERR_BADHANDLE);
+	fh->size = size;
+	p = read_buf(xdr, size);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	memcpy(&fh->data[0], p, size);
+	memset(&fh->data[size], 0, sizeof(fh->data) - size);
+	return 0;
+}
+
+/*
+ * Decode an attribute bitmap into @bitmap, which must have room for two
+ * words. Words the peer did not send are reported as zero; words beyond
+ * the second are consumed from the stream but ignored.
+ *
+ * Returns NFS4ERR_RESOURCE if the stream is too short or the advertised
+ * word count is absurdly large.
+ */
+static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+{
+	__be32 *p;
+	unsigned int attrlen;
+
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	attrlen = ntohl(*p);
+	/*
+	 * attrlen comes straight off the wire. Without this bound,
+	 * "attrlen << 2" can wrap to a small byte count, read_buf() would
+	 * then validate only that small region, and the attrlen > 1 branch
+	 * below would read past it.
+	 */
+	if (unlikely(attrlen > (INT_MAX >> 2)))
+		return htonl(NFS4ERR_RESOURCE);
+	p = read_buf(xdr, attrlen << 2);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	/* Never hand back uninitialised words when the bitmap is short */
+	bitmap[0] = 0;
+	bitmap[1] = 0;
+	if (likely(attrlen > 0))
+		bitmap[0] = ntohl(*p++);
+	if (attrlen > 1)
+		bitmap[1] = ntohl(*p);
+	return 0;
+}
+
+/*
+ * Copy a 16-byte stateid from the stream into stateid->data.
+ * Returns NFS4ERR_RESOURCE if the stream is too short.
+ */
+static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = read_buf(xdr, 16);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ memcpy(stateid->data, p, 16);
+ return 0;
+}
+
+/*
+ * Decode the CB_COMPOUND argument header: tag, minor version, callback
+ * ident and operation count.
+ *
+ * Returns NFS4ERR_RESOURCE for a short stream or an over-long tag, and
+ * NFS4ERR_MINOR_VERS_MISMATCH for any minor version other than 0 or 1
+ * (in which case cb_ident and nops are left untouched).
+ */
+static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
+{
+	__be32 *p;
+	__be32 status = decode_string(xdr, &hdr->taglen, &hdr->tag);
+
+	if (unlikely(status != 0))
+		return status;
+	/* We do not like overly long tags! */
+	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+		printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
+				__func__, hdr->taglen);
+		return htonl(NFS4ERR_RESOURCE);
+	}
+	/* Three words follow: minorversion, callback ident, nops */
+	p = read_buf(xdr, 12);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	hdr->minorversion = ntohl(p[0]);
+	/* Check minor version is zero or one. */
+	if (hdr->minorversion > 1) {
+		pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
+			"illegal minor version %u!\n",
+			__func__, hdr->minorversion);
+		return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+	}
+	hdr->cb_ident = ntohl(p[1]); /* ignored by v4.1 */
+	hdr->nops = ntohl(p[2]);
+	dprintk("%s: minorversion %d nops %d\n", __func__,
+		hdr->minorversion, hdr->nops);
+	return 0;
+}
+
+/*
+ * Decode the operation number that starts each op in the compound.
+ * A short stream is reported with the internal NFS4ERR_RESOURCE_HDR code,
+ * which nfs4_callback_compound() translates to NFS4ERR_RESOURCE.
+ */
+static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
+{
+ __be32 *p;
+ p = read_buf(xdr, 4);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE_HDR);
+ *op = ntohl(*p);
+ return 0;
+}
+
+/*
+ * Decode CB_GETATTR arguments: file handle, then the requested attribute
+ * bitmap. The caller's address is recorded once the file handle decoded.
+ */
+static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
+{
+	__be32 status = decode_fh(xdr, &args->fh);
+
+	if (likely(status == 0)) {
+		args->addr = svc_addr(rqstp);
+		status = decode_bitmap(xdr, args->bitmap);
+	}
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/*
+ * Decode CB_RECALL arguments: stateid, truncate flag, file handle.
+ * Returns NFS4ERR_RESOURCE if the truncate word is missing, otherwise the
+ * status of the stateid / file handle decoders.
+ */
+static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
+{
+	__be32 *p;
+	__be32 status;
+
+	args->addr = svc_addr(rqstp);
+	status = decode_stateid(xdr, &args->stateid);
+	if (likely(status == 0)) {
+		p = read_buf(xdr, 4);
+		if (likely(p != NULL)) {
+			args->truncate = ntohl(*p);
+			status = decode_fh(xdr, &args->fh);
+		} else {
+			status = htonl(NFS4ERR_RESOURCE);
+		}
+	}
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Decode CB_LAYOUTRECALL arguments.
+ *
+ * A fixed four-word prefix (layout type, iomode, changed flag, recall type)
+ * is followed by a body that depends on the recall type:
+ *   RETURN_FILE - file handle, offset, length, stateid
+ *   RETURN_FSID - fsid major and minor
+ *   RETURN_ALL  - nothing
+ * Any other recall type, or a short stream, yields NFS4ERR_BADXDR.
+ */
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct cb_layoutrecallargs *args)
+{
+ __be32 *p;
+ __be32 status = 0;
+ uint32_t iomode;
+
+ args->cbl_addr = svc_addr(rqstp);
+ p = read_buf(xdr, 4 * sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+
+ args->cbl_layout_type = ntohl(*p++);
+ /* Despite the spec's xdr, iomode really belongs in the FILE switch,
+ * as it is unusable and ignored with the other types.
+ */
+ iomode = ntohl(*p++);
+ args->cbl_layoutchanged = ntohl(*p++);
+ args->cbl_recall_type = ntohl(*p++);
+
+ if (args->cbl_recall_type == RETURN_FILE) {
+ /* iomode is only stored for per-file recalls */
+ args->cbl_range.iomode = iomode;
+ status = decode_fh(xdr, &args->cbl_fh);
+ if (unlikely(status != 0))
+ goto out;
+
+ p = read_buf(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ p = xdr_decode_hyper(p, &args->cbl_range.offset);
+ p = xdr_decode_hyper(p, &args->cbl_range.length);
+ status = decode_stateid(xdr, &args->cbl_stateid);
+ if (unlikely(status != 0))
+ goto out;
+ } else if (args->cbl_recall_type == RETURN_FSID) {
+ p = read_buf(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+ p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+ } else if (args->cbl_recall_type != RETURN_ALL) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ /* Only reached on success; error paths jump straight to out */
+ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+ __func__,
+ args->cbl_layout_type, iomode,
+ args->cbl_layoutchanged, args->cbl_recall_type);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
+/*
+ * Decode CB_NOTIFY_DEVICEID arguments into a freshly allocated array.
+ *
+ * On success args->devs holds args->ndevs decoded notifications (devs is
+ * NULL and ndevs is 0 when the server sent none). On failure the array is
+ * released and args is reset to { ndevs = 0, devs = NULL }, so the caller
+ * never sees a dangling pointer or a stale count.
+ *
+ * Returns NFS4ERR_BADXDR for a short stream or an impossible count,
+ * NFS4ERR_INVAL for a malformed notification and NFS4ERR_DELAY when the
+ * allocation fails.
+ */
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+				struct xdr_stream *xdr,
+				struct cb_devicenotifyargs *args)
+{
+	__be32 *p;
+	__be32 status = 0;
+	u32 tmp;
+	int n, i;
+
+	args->ndevs = 0;
+	args->devs = NULL;
+
+	/* Num of device notifications */
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+	n = ntohl(*p++);
+	if (n <= 0)
+		goto out;
+	if (n > ULONG_MAX / sizeof(*args->devs)) {
+		status = htonl(NFS4ERR_BADXDR);
+		goto out;
+	}
+
+	args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+	if (!args->devs) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out;
+	}
+
+	/* Decode each dev notification */
+	for (i = 0; i < n; i++) {
+		struct cb_devicenotifyitem *dev = &args->devs[i];
+
+		/* bitmap size, notify type, opaque size, layout type, device id */
+		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+		if (unlikely(p == NULL)) {
+			status = htonl(NFS4ERR_BADXDR);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* bitmap size */
+		if (tmp != 1) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_notify_type = ntohl(*p++);
+		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+
+		tmp = ntohl(*p++);	/* opaque size */
+		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+			status = htonl(NFS4ERR_INVAL);
+			goto err;
+		}
+		dev->cbd_layout_type = ntohl(*p++);
+		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		/*
+		 * The extra "immediate" word is part of a CHANGE notification
+		 * (that is why its opaque body is four bytes longer above), so
+		 * the test must be on the notify type, not the layout type.
+		 */
+		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+			p = read_buf(xdr, sizeof(uint32_t));
+			if (unlikely(p == NULL)) {
+				status = htonl(NFS4ERR_BADXDR);
+				goto err;
+			}
+			dev->cbd_immediate = ntohl(*p++);
+		} else {
+			dev->cbd_immediate = 0;
+		}
+
+		args->ndevs++;
+
+		dprintk("%s: type %d layout 0x%x immediate %d\n",
+			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
+			dev->cbd_immediate);
+	}
+out:
+	dprintk("%s: status %d ndevs %d\n",
+		__func__, ntohl(status), args->ndevs);
+	return status;
+err:
+	/* Leave no dangling pointer or stale count behind */
+	kfree(args->devs);
+	args->devs = NULL;
+	args->ndevs = 0;
+	goto out;
+}
+
+/*
+ * Copy a fixed-size (NFS4_MAX_SESSIONID_LEN bytes) session id from the
+ * stream into sid->data. Returns NFS4ERR_RESOURCE on a short stream.
+ */
+static __be32 decode_sessionid(struct xdr_stream *xdr,
+				 struct nfs4_sessionid *sid)
+{
+	__be32 *p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
+
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+
+	memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
+	return 0;
+}
+
+/*
+ * Decode one referring call list: a session id followed by a counted
+ * array of (sequenceid, slotid) pairs.
+ *
+ * On success rcl_nrefcalls/rcl_refcalls describe the decoded array
+ * (0/NULL when the list is empty). On failure they are left as 0/NULL,
+ * so the kfree() of rcl_refcalls in decode_cb_sequence_args() is always
+ * safe. Returns NFS4ERR_RESOURCE for a short stream, an implausible
+ * count or an allocation failure.
+ */
+static __be32 decode_rc_list(struct xdr_stream *xdr,
+			       struct referring_call_list *rc_list)
+{
+	__be32 *p;
+	uint32_t i, nrefcalls;
+	__be32 status;
+
+	rc_list->rcl_nrefcalls = 0;
+	rc_list->rcl_refcalls = NULL;
+
+	status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
+	if (status)
+		goto out;
+
+	status = htonl(NFS4ERR_RESOURCE);
+	p = read_buf(xdr, sizeof(uint32_t));
+	if (unlikely(p == NULL))
+		goto out;
+
+	nrefcalls = ntohl(*p);
+	if (nrefcalls) {
+		/*
+		 * The count is untrusted. read_buf() takes an int, so the
+		 * byte count must fit in one; otherwise it could truncate to
+		 * a small value and the loop below would run past both the
+		 * validated input and the allocation.
+		 */
+		if (unlikely(nrefcalls > INT_MAX / (2 * sizeof(uint32_t))))
+			goto out;
+		p = read_buf(xdr, nrefcalls * 2 * sizeof(uint32_t));
+		if (unlikely(p == NULL))
+			goto out;
+		/* kcalloc() also guards the size multiplication */
+		rc_list->rcl_refcalls = kcalloc(nrefcalls,
+						sizeof(*rc_list->rcl_refcalls),
+						GFP_KERNEL);
+		if (unlikely(rc_list->rcl_refcalls == NULL))
+			goto out;
+		for (i = 0; i < nrefcalls; i++) {
+			rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++);
+			rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++);
+		}
+	}
+	rc_list->rcl_nrefcalls = nrefcalls;
+	status = 0;
+
+out:
+	return status;
+}
+
+/*
+ * Decode CB_SEQUENCE arguments: session id, sequence/slot bookkeeping and
+ * the array of referring call lists.
+ *
+ * On success args->csa_rclists holds args->csa_nrclists decoded lists
+ * (NULL when there are none). On failure everything allocated here is
+ * released and csa_rclists/csa_nrclists are reset to NULL/0.
+ */
+static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct cb_sequenceargs *args)
+{
+	__be32 *p;
+	uint32_t i;
+	__be32 status;
+
+	status = decode_sessionid(xdr, &args->csa_sessionid);
+	if (status)
+		goto out;
+
+	status = htonl(NFS4ERR_RESOURCE);
+	p = read_buf(xdr, 5 * sizeof(uint32_t));
+	if (unlikely(p == NULL))
+		goto out;
+
+	args->csa_addr = svc_addr(rqstp);
+	args->csa_sequenceid = ntohl(*p++);
+	args->csa_slotid = ntohl(*p++);
+	args->csa_highestslotid = ntohl(*p++);
+	args->csa_cachethis = ntohl(*p++);
+	args->csa_nrclists = ntohl(*p++);
+	args->csa_rclists = NULL;
+	if (args->csa_nrclists) {
+		/*
+		 * Zeroed allocation: entries that are never reached because
+		 * decoding fails part-way keep rcl_refcalls == NULL, so the
+		 * cleanup loop below cannot kfree() garbage. kcalloc() also
+		 * rejects a count whose byte size would overflow.
+		 */
+		args->csa_rclists = kcalloc(args->csa_nrclists,
+					    sizeof(*args->csa_rclists),
+					    GFP_KERNEL);
+		if (unlikely(args->csa_rclists == NULL)) {
+			args->csa_nrclists = 0;
+			goto out;
+		}
+
+		for (i = 0; i < args->csa_nrclists; i++) {
+			status = decode_rc_list(xdr, &args->csa_rclists[i]);
+			if (status)
+				goto out_free;
+		}
+	}
+	status = 0;
+
+	dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
+		"highestslotid %u cachethis %d nrclists %u\n",
+		__func__,
+		((u32 *)&args->csa_sessionid)[0],
+		((u32 *)&args->csa_sessionid)[1],
+		((u32 *)&args->csa_sessionid)[2],
+		((u32 *)&args->csa_sessionid)[3],
+		args->csa_sequenceid, args->csa_slotid,
+		args->csa_highestslotid, args->csa_cachethis,
+		args->csa_nrclists);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+
+out_free:
+	for (i = 0; i < args->csa_nrclists; i++)
+		kfree(args->csa_rclists[i].rcl_refcalls);
+	kfree(args->csa_rclists);
+	/* Leave no dangling pointer or stale count behind */
+	args->csa_rclists = NULL;
+	args->csa_nrclists = 0;
+	goto out;
+}
+
+/*
+ * Decode CB_RECALL_ANY arguments: the number of objects the client may
+ * keep, followed by a type bitmap of which only the first word is used.
+ *
+ * Returns NFS4ERR_BADXDR if the first word is missing, otherwise the
+ * status of decode_bitmap().
+ */
+static __be32 decode_recallany_args(struct svc_rqst *rqstp,
+				      struct xdr_stream *xdr,
+				      struct cb_recallanyargs *args)
+{
+	/*
+	 * Start from an empty mask: decode_bitmap() only stores the words the
+	 * server actually sent, so a zero-length bitmap would otherwise copy
+	 * uninitialised stack data into craa_type_mask.
+	 */
+	uint32_t bitmap[2] = { 0, 0 };
+	__be32 *p, status;
+
+	args->craa_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->craa_objs_to_keep = ntohl(*p++);
+	status = decode_bitmap(xdr, bitmap);
+	if (unlikely(status))
+		return status;
+	args->craa_type_mask = bitmap[0];
+
+	return 0;
+}
+
+/*
+ * Decode CB_RECALL_SLOT arguments: a single word holding the target
+ * maximum slot count. Returns NFS4ERR_BADXDR on a short stream.
+ */
+static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct cb_recallslotargs *args)
+{
+	__be32 *p;
+
+	args->crsa_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4);
+	if (likely(p != NULL)) {
+		args->crsa_target_max_slots = ntohl(*p);
+		return 0;
+	}
+	return htonl(NFS4ERR_BADXDR);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Append @len bytes of @str to the reply as an opaque.
+ * Reserves 4 + len bytes (presumably the length word plus the data; the
+ * layout itself is produced by xdr_encode_opaque()).
+ * Returns NFS4ERR_RESOURCE if the reply buffer has no room.
+ */
+static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4 + len);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
+ xdr_encode_opaque(p, str, len);
+ return 0;
+}
+
+#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
+#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
+/*
+ * Encode the bitmap of attributes being returned, restricted to the
+ * CB_SUPPORTED_ATTR* masks, using the shortest form (0, 1 or 2 words).
+ *
+ * One extra word is reserved after the bitmap and its address is handed
+ * back through @savep; encode_getattr_res() later stores the attribute
+ * length there. Returns NFS4ERR_RESOURCE if the reply buffer has no room.
+ */
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
+{
+	__be32 bm[2];
+	__be32 *p;
+	unsigned int nwords, i;
+
+	bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
+	bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
+
+	/* Trailing all-zero words are not sent */
+	if (bm[1] != 0)
+		nwords = 2;
+	else if (bm[0] != 0)
+		nwords = 1;
+	else
+		nwords = 0;
+
+	/* word count + nwords mask words + the length slot saved in *savep */
+	p = xdr_reserve_space(xdr, (nwords + 2) << 2);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_RESOURCE);
+	*p++ = htonl(nwords);
+	for (i = 0; i < nwords; i++)
+		*p++ = bm[i];
+	*savep = p;
+	return 0;
+}
+
+/*
+ * Encode the 64-bit change attribute, but only if FATTR4_WORD0_CHANGE is
+ * set in @bitmap. Returns NFS4ERR_RESOURCE if the reply buffer has no room.
+ */
+static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
+{
+	__be32 *p;
+
+	if (bitmap[0] & FATTR4_WORD0_CHANGE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (unlikely(p == NULL))
+			return htonl(NFS4ERR_RESOURCE);
+		xdr_encode_hyper(p, change);
+	}
+	return 0;
+}
+
+/*
+ * Encode the 64-bit file size, but only if FATTR4_WORD0_SIZE is set in
+ * @bitmap. Returns NFS4ERR_RESOURCE if the reply buffer has no room.
+ */
+static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
+{
+	__be32 *p;
+
+	if (bitmap[0] & FATTR4_WORD0_SIZE) {
+		p = xdr_reserve_space(xdr, 8);
+		if (unlikely(p == NULL))
+			return htonl(NFS4ERR_RESOURCE);
+		xdr_encode_hyper(p, size);
+	}
+	return 0;
+}
+
+/*
+ * Encode a timestamp as 12 bytes: 64-bit seconds followed by 32-bit
+ * nanoseconds. Returns NFS4ERR_RESOURCE if the reply buffer has no room.
+ */
+static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
+{
+	__be32 *p = xdr_reserve_space(xdr, 12);
+
+	if (likely(p != NULL)) {
+		p = xdr_encode_hyper(p, time->tv_sec);
+		*p = htonl(time->tv_nsec);
+		return 0;
+	}
+	return htonl(NFS4ERR_RESOURCE);
+}
+
+/* Encode @time only when FATTR4_WORD1_TIME_METADATA is set in @bitmap. */
+static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+{
+	if (bitmap[1] & FATTR4_WORD1_TIME_METADATA)
+		return encode_attr_time(xdr, time);
+	return 0;
+}
+
+/* Encode @time only when FATTR4_WORD1_TIME_MODIFY is set in @bitmap. */
+static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+{
+	if (bitmap[1] & FATTR4_WORD1_TIME_MODIFY)
+		return encode_attr_time(xdr, time);
+	return 0;
+}
+
+/*
+ * Lay out the CB_COMPOUND reply header: a status word, the echoed tag and
+ * an operation-count word. The status and nops slots are only reserved
+ * here; their addresses are kept in @hdr so nfs4_callback_compound() can
+ * fill them in once all operations have been processed.
+ */
+static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
+{
+	__be32 status;
+
+	hdr->status = xdr_reserve_space(xdr, 4);
+	if (unlikely(!hdr->status))
+		return htonl(NFS4ERR_RESOURCE);
+
+	status = encode_string(xdr, hdr->taglen, hdr->tag);
+	if (unlikely(status))
+		return status;
+
+	hdr->nops = xdr_reserve_space(xdr, 4);
+	return likely(hdr->nops != NULL) ? 0 : htonl(NFS4ERR_RESOURCE);
+}
+
+/*
+ * Encode the per-operation reply header: the op number followed by its
+ * status (@res is already in network byte order and is stored as-is).
+ * A full reply buffer is reported with the internal NFS4ERR_RESOURCE_HDR
+ * code, which nfs4_callback_compound() translates to NFS4ERR_RESOURCE.
+ */
+static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8);
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE_HDR);
+ *p++ = htonl(op);
+ *p = res;
+ return 0;
+}
+
+/*
+ * Encode the CB_GETATTR result: attribute bitmap, attribute length, then
+ * change, size, ctime and mtime (each only if requested in res->bitmap).
+ * Nothing is encoded when the operation itself failed (res->status != 0).
+ */
+static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
+{
+ __be32 *savep = NULL;
+ __be32 status = res->status;
+
+ if (unlikely(status != 0))
+ goto out;
+ /* savep is set to the slot reserved for the attribute length */
+ status = encode_attr_bitmap(xdr, res->bitmap, &savep);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_change(xdr, res->bitmap, res->change_attr);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_size(xdr, res->bitmap, res->size);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
+ /*
+ * Back-patch the attribute length: the number of bytes written after
+ * the length slot. This runs even if the mtime encode just failed.
+ */
+ *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Append the NFS4_MAX_SESSIONID_LEN bytes of @sid to the reply.
+ * Returns NFS4ERR_RESOURCE if the reply buffer has no room.
+ */
+static __be32 encode_sessionid(struct xdr_stream *xdr,
+				 const struct nfs4_sessionid *sid)
+{
+	__be32 *p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+
+	if (unlikely(!p))
+		return htonl(NFS4ERR_RESOURCE);
+
+	memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
+	return 0;
+}
+
+/*
+ * Encode the CB_SEQUENCE result: session id, sequence id, slot id,
+ * highest slot id and target highest slot id. Nothing is encoded when the
+ * operation itself failed (res->csr_status != 0).
+ *
+ * Returns the operation status, or NFS4ERR_RESOURCE if the reply buffer
+ * has no room for the result.
+ */
+static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       const struct cb_sequenceres *res)
+{
+	__be32 *p;
+	__be32 status = res->csr_status;
+
+	if (unlikely(status != 0))
+		goto out;
+
+	/* Do not keep encoding behind a session id that did not fit */
+	status = encode_sessionid(xdr, &res->csr_sessionid);
+	if (unlikely(status != 0))
+		goto out;
+
+	p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
+	if (unlikely(p == NULL)) {
+		status = htonl(NFS4ERR_RESOURCE);
+		goto out;
+	}
+
+	*p++ = htonl(res->csr_sequenceid);
+	*p++ = htonl(res->csr_slotid);
+	*p++ = htonl(res->csr_highestslotid);
+	*p++ = htonl(res->csr_target_highestslotid);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/*
+ * Validate an NFSv4.1 callback operation and look up its handler.
+ *
+ * CB_SEQUENCE must be the first operation of the compound (@nop == 0) and
+ * no other operation may take that position. On success *op points at the
+ * callback_ops[] entry; on failure *op is left untouched.
+ */
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	const int is_sequence = (op_nr == OP_CB_SEQUENCE);
+
+	if (is_sequence && nop != 0)
+		return htonl(NFS4ERR_SEQUENCE_POS);
+	if (!is_sequence && nop == 0)
+		return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	switch (op_nr) {
+	/* Operations implemented by this client */
+	case OP_CB_GETATTR:
+	case OP_CB_RECALL:
+	case OP_CB_SEQUENCE:
+	case OP_CB_RECALL_ANY:
+	case OP_CB_RECALL_SLOT:
+	case OP_CB_LAYOUTRECALL:
+	case OP_CB_NOTIFY_DEVICEID:
+		*op = &callback_ops[op_nr];
+		return htonl(NFS_OK);
+
+	/* Known operations that are not implemented */
+	case OP_CB_NOTIFY:
+	case OP_CB_PUSH_DELEG:
+	case OP_CB_RECALLABLE_OBJ_AVAIL:
+	case OP_CB_WANTS_CANCELLED:
+	case OP_CB_NOTIFY_LOCK:
+		return htonl(NFS4ERR_NOTSUPP);
+
+	default:
+		return htonl(NFS4ERR_OP_ILLEGAL);
+	}
+}
+
+/*
+ * Mark the session's back channel slot as free again and let
+ * nfs4_check_drain_bc_complete() re-evaluate the session, all under the
+ * back channel slot table lock.
+ */
+static void nfs4_callback_free_slot(struct nfs4_session *session)
+{
+ struct nfs4_slot_table *tbl = &session->bc_slot_table;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ /*
+ * Let the state manager know callback processing done.
+ * A single slot, so highest used slotid is either 0 or -1
+ */
+ tbl->highest_used_slotid = -1;
+ nfs4_check_drain_bc_complete(session);
+ spin_unlock(&tbl->slot_tbl_lock);
+}
+
+/*
+ * Release the back channel slot at the end of a compound. cps->slotid
+ * starts out as -1 in nfs4_callback_compound(); the slot is only freed
+ * when it has been changed from that value.
+ */
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+ if (cps->slotid != -1)
+ nfs4_callback_free_slot(cps->clp->cl_session);
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+/* Without CONFIG_NFS_V4_1 every minor version 1 operation is refused. */
+static __be32
+preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+
+/* Without CONFIG_NFS_V4_1 there is no back channel slot to release. */
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Validate an NFSv4.0 callback operation and look up its handler.
+ * Only CB_GETATTR and CB_RECALL are accepted; anything else is reported
+ * as NFS4ERR_OP_ILLEGAL and *op is left untouched.
+ */
+static __be32
+preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
+{
+	if (op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)
+		return htonl(NFS4ERR_OP_ILLEGAL);
+
+	*op = &callback_ops[op_nr];
+	return htonl(NFS_OK);
+}
+
+/*
+ * Decode, execute and encode a single operation of a CB_COMPOUND.
+ *
+ * @nop is the zero-based position of the operation in the compound.
+ * Returns the operation status, which is also what stops the caller's
+ * loop, or NFS4ERR_RESOURCE_HDR if the op header could not be decoded or
+ * encoded at all.
+ */
+static __be32 process_op(uint32_t minorversion, int nop,
+ struct svc_rqst *rqstp,
+ struct xdr_stream *xdr_in, void *argp,
+ struct xdr_stream *xdr_out, void *resp,
+ struct cb_process_state *cps)
+{
+ /* Entry 0 has no handlers; it stays selected if preprocessing fails */
+ struct callback_op *op = &callback_ops[0];
+ unsigned int op_nr;
+ __be32 status;
+ long maxlen;
+ __be32 res;
+
+ dprintk("%s: start\n", __func__);
+ status = decode_op_hdr(xdr_in, &op_nr);
+ if (unlikely(status))
+ return status;
+
+ dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
+ __func__, minorversion, nop, op_nr);
+
+ status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
+ preprocess_nfs4_op(op_nr, &op);
+ /* An unknown op is echoed back as OP_CB_ILLEGAL, not as its raw number */
+ if (status == htonl(NFS4ERR_OP_ILLEGAL))
+ op_nr = OP_CB_ILLEGAL;
+ if (status)
+ goto encode_hdr;
+
+ /* A non-zero drc_status short-circuits the op: reply with it directly */
+ if (cps->drc_status) {
+ status = cps->drc_status;
+ goto encode_hdr;
+ }
+
+ /*
+ * Space left in the reply buffer.
+ * NOTE(review): this is a pointer difference, so presumably counted in
+ * 32-bit words, while PAGE_SIZE is in bytes - confirm the intended unit.
+ */
+ maxlen = xdr_out->end - xdr_out->p;
+ if (maxlen > 0 && maxlen < PAGE_SIZE) {
+ status = op->decode_args(rqstp, xdr_in, argp);
+ if (likely(status == 0))
+ status = op->process_op(argp, resp, cps);
+ } else
+ status = htonl(NFS4ERR_RESOURCE);
+
+encode_hdr:
+ res = encode_op_hdr(xdr_out, op_nr, status);
+ if (unlikely(res))
+ return res;
+ /* Results are only encoded for successful ops that have an encoder */
+ if (op->encode_res != NULL && status == 0)
+ status = op->encode_res(rqstp, xdr_out, resp);
+ dprintk("%s: done, status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
+/*
+ * Decode, process and encode a COMPOUND
+ *
+ * Returns an RPC-level status:
+ *   rpc_garbage_args - the compound header could not be decoded
+ *   rpc_drop_reply   - minor version 0 request with an unknown callback
+ *                      ident or a failed GSS principal check
+ *   rpc_system_err   - the reply header did not fit in the reply buffer
+ *   rpc_success      - otherwise; the NFS status of the compound is
+ *                      stored in the reply itself
+ *
+ * Any client reference obtained here is dropped on every return path.
+ */
+static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	struct cb_compound_hdr_arg hdr_arg = { 0 };
+	struct cb_compound_hdr_res hdr_res = { NULL };
+	struct xdr_stream xdr_in, xdr_out;
+	__be32 *p, status;
+	struct cb_process_state cps = {
+		.drc_status = 0,
+		.clp = NULL,
+		.slotid = -1,
+	};
+	unsigned int nops = 0;
+
+	dprintk("%s: start\n", __func__);
+
+	xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
+
+	/* Start encoding right after what is already in the reply head */
+	p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
+	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+
+	status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
+	if (status == __constant_htonl(NFS4ERR_RESOURCE))
+		return rpc_garbage_args;
+
+	/* For minor version 0 the client is found via the callback ident */
+	if (hdr_arg.minorversion == 0) {
+		cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
+		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) {
+			/* Drop the reference taken by the lookup, if any */
+			if (cps.clp)
+				nfs_put_client(cps.clp);
+			return rpc_drop_reply;
+		}
+	}
+
+	hdr_res.taglen = hdr_arg.taglen;
+	hdr_res.tag = hdr_arg.tag;
+	if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) {
+		if (cps.clp)
+			nfs_put_client(cps.clp);
+		return rpc_system_err;
+	}
+
+	/* Stop at the first operation that fails */
+	while (status == 0 && nops != hdr_arg.nops) {
+		status = process_op(hdr_arg.minorversion, nops, rqstp,
+				    &xdr_in, argp, &xdr_out, resp, &cps);
+		nops++;
+	}
+
+	/* Buffer overflow in decode_op_hdr or encode_op_hdr. Return
+	 * resource error in cb_compound status without returning op */
+	if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
+		status = htonl(NFS4ERR_RESOURCE);
+		nops--;
+	}
+
+	/* Fill in the slots reserved by encode_compound_hdr_res() */
+	*hdr_res.status = status;
+	*hdr_res.nops = htonl(nops);
+	nfs4_cb_free_slot(&cps);
+	nfs_put_client(cps.clp);
+	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
+	return rpc_success;
+}
+
+/*
+ * Define NFS4 callback COMPOUND ops.
+ *
+ * The table is indexed by operation number. Entry 0 has no handlers and is
+ * what process_op() falls back to when an operation is rejected. Entries
+ * without .encode_res return nothing beyond the op header.
+ */
+static struct callback_op callback_ops[] = {
+ [0] = {
+ .res_maxsize = CB_OP_HDR_RES_MAXSZ,
+ },
+ [OP_CB_GETATTR] = {
+ .process_op = (callback_process_op_t)nfs4_callback_getattr,
+ .decode_args = (callback_decode_arg_t)decode_getattr_args,
+ .encode_res = (callback_encode_res_t)encode_getattr_res,
+ .res_maxsize = CB_OP_GETATTR_RES_MAXSZ,
+ },
+ [OP_CB_RECALL] = {
+ .process_op = (callback_process_op_t)nfs4_callback_recall,
+ .decode_args = (callback_decode_arg_t)decode_recall_args,
+ .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
+ },
+#if defined(CONFIG_NFS_V4_1)
+ [OP_CB_LAYOUTRECALL] = {
+ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
+ .decode_args =
+ (callback_decode_arg_t)decode_layoutrecall_args,
+ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+ },
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+ .decode_args =
+ (callback_decode_arg_t)decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
+ [OP_CB_SEQUENCE] = {
+ .process_op = (callback_process_op_t)nfs4_callback_sequence,
+ .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
+ .encode_res = (callback_encode_res_t)encode_cb_sequence_res,
+ .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ,
+ },
+ [OP_CB_RECALL_ANY] = {
+ .process_op = (callback_process_op_t)nfs4_callback_recallany,
+ .decode_args = (callback_decode_arg_t)decode_recallany_args,
+ .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
+ },
+ [OP_CB_RECALL_SLOT] = {
+ .process_op = (callback_process_op_t)nfs4_callback_recallslot,
+ .decode_args = (callback_decode_arg_t)decode_recallslot_args,
+ .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
+ },
+#endif /* CONFIG_NFS_V4_1 */
+};
+
+/*
+ * Define NFS4 callback procedures
+ *
+ * CB_COMPOUND sets no .pc_decode: nfs4_callback_compound() decodes its
+ * arguments itself from rqstp->rq_arg and encodes the reply into
+ * rqstp->rq_res.
+ */
+static struct svc_procedure nfs4_callback_procedures1[] = {
+ [CB_NULL] = {
+ .pc_func = nfs4_callback_null,
+ .pc_decode = (kxdrproc_t)nfs4_decode_void,
+ .pc_encode = (kxdrproc_t)nfs4_encode_void,
+ .pc_xdrressize = 1,
+ },
+ [CB_COMPOUND] = {
+ .pc_func = nfs4_callback_compound,
+ .pc_encode = (kxdrproc_t)nfs4_encode_void,
+ .pc_argsize = 256,
+ .pc_ressize = 256,
+ .pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
+ }
+};
+
+/* Callback service registered as version 1, served by nfs4_callback_procedures1. */
+struct svc_version nfs4_callback_version1 = {
+ .vs_vers = 1,
+ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+ .vs_dispatch = NULL,
+ .vs_hidden = 1,
+};
+
+/* Same procedure table as nfs4_callback_version1, registered as version 4. */
+struct svc_version nfs4_callback_version4 = {
+ .vs_vers = 4,
+ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+ .vs_dispatch = NULL,
+ .vs_hidden = 1,
+};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
new file mode 100644
index 00000000000..31778f74357
--- /dev/null
+++ b/fs/nfs/client.c
@@ -0,0 +1,2019 @@
+/* client.c: NFS client sharing and management code
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/sunrpc/xprtrdma.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <linux/nfs_xdr.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <asm/system.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/* Taken around every access to the two lists below in this file */
+static DEFINE_SPINLOCK(nfs_client_lock);
+/* All nfs_client records, linked through cl_share_link */
+static LIST_HEAD(nfs_client_list);
+/* All nfs_server records, linked through master_link */
+static LIST_HEAD(nfs_volume_list);
+/* Woken by nfs_mark_client_ready() whenever a client's cl_cons_state changes */
+static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+#ifdef CONFIG_NFS_V4
+static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
+
+/*
+ * Get a unique NFSv4.0 callback identifier which will be used
+ * by the V4.0 callback service to lookup the nfs_client struct
+ *
+ * Only NFSv4.0 clients are registered; for any other version or
+ * minorversion the function returns 0 and leaves clp->cl_cb_ident alone.
+ *
+ * Identifiers are handed out starting at 1.  nfs_cb_idr_remove_locked()
+ * treats cl_cb_ident == 0 as "never registered", so allocating id 0
+ * would leave that entry in the idr forever, pointing at freed memory
+ * once the client is released.
+ *
+ * Returns 0 on success, -ENOMEM if idr_pre_get() cannot preallocate, or
+ * the error reported by the idr allocator.
+ */
+static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
+{
+	int ret = 0;
+
+	if (clp->rpc_ops->version != 4 || minorversion != 0)
+		return ret;
+retry:
+	/* Preallocate outside the spinlock; GFP_KERNEL may sleep */
+	if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL))
+		return -ENOMEM;
+	spin_lock(&nfs_client_lock);
+	ret = idr_get_new_above(&cb_ident_idr, clp, 1, &clp->cl_cb_ident);
+	spin_unlock(&nfs_client_lock);
+	/* -EAGAIN: the preallocated layer was consumed, refill and retry */
+	if (ret == -EAGAIN)
+		goto retry;
+	return ret;
+}
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
+ *
+ * Defaults to true.
+ */
+static bool nfs4_disable_idmapping = true;
+
+/*
+ * RPC cruft for NFS
+ *
+ * Version table indexed by NFS protocol version.  The v3 and v4 slots
+ * are only populated when the matching CONFIG_NFS_V* option is set;
+ * unpopulated slots stay NULL.
+ */
+static struct rpc_version *nfs_version[5] = {
+ [2] = &nfs_version2,
+#ifdef CONFIG_NFS_V3
+ [3] = &nfs_version3,
+#endif
+#ifdef CONFIG_NFS_V4
+ [4] = &nfs_version4,
+#endif
+};
+
+/*
+ * RPC program description for NFS.  It is passed to rpc_create() by
+ * nfs_create_rpc_client() and points at nfs_rpcstat for statistics.
+ */
+struct rpc_program nfs_program = {
+ .name = "nfs",
+ .number = NFS_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfs_version),
+ .version = nfs_version,
+ .stats = &nfs_rpcstat,
+ .pipe_dir_name = NFS_PIPE_DIRNAME,
+};
+
+/* RPC statistics record for nfs_program (the two reference each other) */
+struct rpc_stat nfs_rpcstat = {
+ .program = &nfs_program
+};
+
+
+#ifdef CONFIG_NFS_V3_ACL
+/* Statistics record for the NFSACL program; its first member is the program */
+static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
+/* Only version 3 of the NFSACL protocol is provided */
+static struct rpc_version * nfsacl_version[] = {
+ [3] = &nfsacl_version3,
+};
+
+/*
+ * RPC program description for NFSACL; bound to a server's RPC client by
+ * nfs_init_server_aclclient().
+ */
+struct rpc_program nfsacl_program = {
+ .name = "nfsacl",
+ .number = NFS_ACL_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfsacl_version),
+ .version = nfsacl_version,
+ .stats = &nfsacl_rpcstat,
+};
+#endif /* CONFIG_NFS_V3_ACL */
+
+/*
+ * Parameters describing the client record wanted by nfs_get_client().
+ * nfs_match_client() compares them against existing records and
+ * nfs_alloc_client() copies them into a new one.
+ */
+struct nfs_client_initdata {
+ const char *hostname; /* optional; duplicated into cl_hostname */
+ const struct sockaddr *addr; /* server address, copied into cl_addr */
+ size_t addrlen; /* number of bytes of *addr to copy */
+ const struct nfs_rpc_ops *rpc_ops; /* per-NFS-version operations */
+ int proto; /* transport, stored in cl_proto */
+ u32 minorversion; /* NFSv4 minor version */
+};
+
+/*
+ * Allocate a shared client record
+ *
+ * Since these are allocated/deallocated very rarely, we don't
+ * bother putting them in a slab cache...
+ *
+ * The new record starts with a reference count of 1, in the
+ * NFS_CS_INITING state, and with cl_rpcclient set to ERR_PTR(-EINVAL).
+ * The address and the optional hostname are copied from @cl_init.
+ *
+ * Returns the new client, or an ERR_PTR() value on failure.
+ */
+static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
+{
+	struct nfs_client *clp;
+	struct rpc_cred *cred;
+	int err = -ENOMEM;
+
+	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
+		goto error_0;
+
+	clp->rpc_ops = cl_init->rpc_ops;
+
+	atomic_set(&clp->cl_count, 1);
+	clp->cl_cons_state = NFS_CS_INITING;
+
+	memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
+	clp->cl_addrlen = cl_init->addrlen;
+
+	if (cl_init->hostname) {
+		err = -ENOMEM;
+		clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
+		if (!clp->cl_hostname)
+			goto error_cleanup;
+	}
+
+	INIT_LIST_HEAD(&clp->cl_superblocks);
+	clp->cl_rpcclient = ERR_PTR(-EINVAL);
+
+	clp->cl_proto = cl_init->proto;
+
+#ifdef CONFIG_NFS_V4
+	err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
+	if (err)
+		goto error_cleanup;
+
+	spin_lock_init(&clp->cl_lock);
+	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
+	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
+	clp->cl_boot_time = CURRENT_TIME;
+	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
+	clp->cl_minorversion = cl_init->minorversion;
+	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+#endif
+	/* A missing machine credential is not fatal; the field stays NULL */
+	cred = rpc_lookup_machine_cred("*");
+	if (!IS_ERR(cred))
+		clp->cl_machine_cred = cred;
+	nfs_fscache_get_client_cookie(clp);
+
+	return clp;
+
+error_cleanup:
+	/*
+	 * The hostname may already have been duplicated when the callback
+	 * identifier allocation fails; release it too.  kfree(NULL) is a
+	 * no-op, so this is also safe when kstrdup() itself failed.
+	 */
+	kfree(clp->cl_hostname);
+	kfree(clp);
+error_0:
+	return ERR_PTR(err);
+}
+
+#ifdef CONFIG_NFS_V4
+#ifdef CONFIG_NFS_V4_1
+/* Destroy the client's session, if it has one */
+static void nfs4_shutdown_session(struct nfs_client *clp)
+{
+	if (!nfs4_has_session(clp))
+		return;
+
+	nfs4_destroy_session(clp->cl_session);
+}
+#else /* CONFIG_NFS_V4_1 */
+/* Sessions are not compiled in: nothing to tear down */
+static void nfs4_shutdown_session(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Destroy the NFS4 callback service
+ *
+ * Does nothing unless NFS_CS_CALLBACK is set in cl_res_state; the bit
+ * is cleared before the callback service is taken down.
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+	if (!__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+		return;
+
+	nfs_callback_down(clp->cl_mvops->minor_version);
+}
+
+/*
+ * Tear down the NFSv4-specific parts of a client record.
+ *
+ * Each resource guarded by a bit in cl_res_state (renew daemon, idmapper)
+ * is released only if its bit was set, and the bit is cleared in the
+ * process.  Called from nfs_free_client().
+ */
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+ if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+ nfs4_kill_renewd(clp);
+ nfs4_shutdown_session(clp);
+ nfs4_destroy_callback(clp);
+ if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+ nfs_idmap_delete(clp);
+
+ rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
+}
+
+/*
+ * Release the callback identifier idr.
+ *
+ * idr_remove_all is not needed as all id's are removed by nfs_put_client
+ */
+void nfs_cleanup_cb_ident_idr(void)
+{
+ idr_destroy(&cb_ident_idr);
+}
+
+/*
+ * Drop a client's callback identifier from the idr.
+ *
+ * The caller must hold nfs_client_lock.  A cl_cb_ident of zero is taken
+ * to mean that no identifier was registered, and nothing is removed.
+ */
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+	if (!clp->cl_cb_ident)
+		return;
+
+	idr_remove(&cb_ident_idr, clp->cl_cb_ident);
+}
+
+/* Initialise the server's "pNFS ROC" RPC wait queue */
+static void pnfs_init_server(struct nfs_server *server)
+{
+ rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
+
+/* NFSv4 server teardown hook: purge the server's state owners */
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+ nfs4_purge_state_owners(server);
+}
+
+#else
+/*
+ * Empty stand-ins used when CONFIG_NFS_V4 is not set, so that the
+ * version-independent code can call these helpers unconditionally.
+ */
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+}
+
+void nfs_cleanup_cb_ident_idr(void)
+{
+}
+
+static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
+{
+}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Destroy a shared client record
+ *
+ * Releases everything hanging off @clp and then @clp itself.  This does
+ * not unlink the record from nfs_client_list, nor does it remove its
+ * callback identifier; nfs_put_client() does both before calling here.
+ */
+static void nfs_free_client(struct nfs_client *clp)
+{
+ dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
+
+ nfs4_shutdown_client(clp);
+
+ nfs_fscache_release_client_cookie(clp);
+
+ /* -EIO all pending I/O */
+ /* cl_rpcclient is an ERR_PTR until an RPC client has been created */
+ if (!IS_ERR(clp->cl_rpcclient))
+ rpc_shutdown_client(clp->cl_rpcclient);
+
+ if (clp->cl_machine_cred != NULL)
+ put_rpccred(clp->cl_machine_cred);
+
+ nfs4_deviceid_purge_client(clp);
+
+ kfree(clp->cl_hostname);
+ kfree(clp->server_scope);
+ kfree(clp);
+
+ dprintk("<-- nfs_free_client()\n");
+}
+
+/*
+ * Release a reference to a shared client record
+ *
+ * A NULL @clp is accepted and ignored.  When the last reference goes
+ * away the record is unlinked from the client list, its callback
+ * identifier is dropped (both under nfs_client_lock) and it is freed.
+ */
+void nfs_put_client(struct nfs_client *clp)
+{
+	if (clp == NULL)
+		return;
+
+	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
+
+	/* Takes nfs_client_lock only if the count actually reaches zero */
+	if (!atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock))
+		return;
+
+	list_del(&clp->cl_share_link);
+	nfs_cb_idr_remove_locked(clp);
+	spin_unlock(&nfs_client_lock);
+
+	BUG_ON(!list_empty(&clp->cl_superblocks));
+
+	nfs_free_client(clp);
+}
+EXPORT_SYMBOL_GPL(nfs_put_client);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Compare the IPv6 addresses of two socket addresses.
+ *
+ * Only the relevant fields are looked at: padding is ignored,
+ * sin6_flowinfo is ignored because it only affects QoS, and
+ * sin6_scope_id is compared solely for "link local" addresses, which
+ * need only be unique on a specific link.  Ordinary unicast addresses
+ * may legitimately carry different scope ids.
+ *
+ * The caller should ensure both socket addresses are AF_INET6.
+ * Returns non-zero on a match, 0 otherwise.
+ */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *a = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *b = (const struct sockaddr_in6 *)sa2;
+
+	if (!ipv6_addr_equal(&a->sin6_addr, &b->sin6_addr))
+		return 0;
+
+	/* Same address and not link-local: the scope id is irrelevant */
+	if (!(ipv6_addr_type(&a->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		return 1;
+
+	return a->sin6_scope_id == b->sin6_scope_id;
+}
+#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+/* Without IPv6 support no two IPv6 addresses are ever reported equal */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Compare the IPv4 addresses of two socket addresses.  Only s_addr is
+ * examined; padding bytes and the port are not.
+ *
+ * The caller should ensure both socket addresses are AF_INET.
+ * Returns non-zero on a match, 0 otherwise.
+ */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+				      const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *a = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *b = (const struct sockaddr_in *)sa2;
+
+	return a->sin_addr.s_addr == b->sin_addr.s_addr;
+}
+
+/* IPv6 comparison of both address and port; non-zero when they match */
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *a = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *b = (const struct sockaddr_in6 *)sa2;
+
+	if (!nfs_sockaddr_match_ipaddr6(sa1, sa2))
+		return 0;
+
+	return a->sin6_port == b->sin6_port;
+}
+
+/* IPv4 comparison of both address and port; non-zero when they match */
+static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *a = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *b = (const struct sockaddr_in *)sa2;
+
+	if (!nfs_sockaddr_match_ipaddr4(sa1, sa2))
+		return 0;
+
+	return a->sin_port == b->sin_port;
+}
+
+/*
+ * Test if two socket addresses carry the same IP address, by comparing
+ * (only) relevant fields, excluding the port number.
+ *
+ * Addresses of different families, or of a family other than AF_INET
+ * and AF_INET6, never match.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				     const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	if (sa1->sa_family == AF_INET)
+		return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+	if (sa1->sa_family == AF_INET6)
+		return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+
+	return 0;
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, including the port number.
+ *
+ * Addresses of different families, or of a family other than AF_INET
+ * and AF_INET6, never match.
+ */
+static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+			    const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	if (sa1->sa_family == AF_INET)
+		return nfs_sockaddr_cmp_ip4(sa1, sa2);
+	if (sa1->sa_family == AF_INET6)
+		return nfs_sockaddr_cmp_ip6(sa1, sa2);
+
+	return 0;
+}
+
+/*
+ * Common match routine for v4.0 and v4.1 callback services
+ *
+ * Returns true when @clp is an NFSv4 client of the given @minorversion
+ * that is either ready or initialising its session, and whose server IP
+ * address (the port is ignored) equals @addr.
+ */
+bool
+nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
+		     u32 minorversion)
+{
+	struct sockaddr *server_addr = (struct sockaddr *)&clp->cl_addr;
+
+	/* Don't match clients that failed to initialise */
+	if (clp->cl_cons_state != NFS_CS_READY &&
+	    clp->cl_cons_state != NFS_CS_SESSION_INITING)
+		return false;
+
+	/* Match the version and minorversion */
+	if (clp->rpc_ops->version != 4)
+		return false;
+	if (clp->cl_minorversion != minorversion)
+		return false;
+
+	/* Match only the IP address, not the port number */
+	return nfs_sockaddr_match_ipaddr(addr, server_addr) ? true : false;
+}
+
+/*
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.
+ *
+ * A match requires the same rpc_ops, transport protocol, minorversion
+ * and full socket address (port included); records whose construction
+ * failed (negative cl_cons_state) are skipped.  The matching record is
+ * returned with its reference count raised, or NULL if there is none.
+ */
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
+{
+	const struct sockaddr *wanted = data->addr;
+	struct nfs_client *pos;
+
+	list_for_each_entry(pos, &nfs_client_list, cl_share_link) {
+		const struct sockaddr *have = (struct sockaddr *)&pos->cl_addr;
+
+		/* Don't match clients that failed to initialise properly */
+		if (pos->cl_cons_state < 0)
+			continue;
+
+		/*
+		 * Different NFS versions cannot share the same nfs_client,
+		 * and neither can different transports or minorversions.
+		 */
+		if (pos->rpc_ops != data->rpc_ops ||
+		    pos->cl_proto != data->proto ||
+		    pos->cl_minorversion != data->minorversion)
+			continue;
+
+		/* Match the full socket address */
+		if (nfs_sockaddr_cmp(wanted, have)) {
+			atomic_inc(&pos->cl_count);
+			return pos;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Look up a client by IP address and protocol version
+ * - creates a new record if one doesn't yet exist
+ *
+ * The list is searched under nfs_client_lock.  If nothing matches, the
+ * lock is dropped, a new record is allocated and the search is repeated,
+ * because another task may have installed an equivalent record in the
+ * meantime.
+ *
+ * Returns a referenced client, or an ERR_PTR():
+ * - the error from nfs_alloc_client() or from ->init_client(),
+ * - -ERESTARTSYS if the wait for an existing client was interrupted,
+ * - the negative cl_cons_state of an existing client that failed to
+ *   initialise.
+ */
+static struct nfs_client *
+nfs_get_client(const struct nfs_client_initdata *cl_init,
+	       const struct rpc_timeout *timeparms,
+	       const char *ip_addr,
+	       rpc_authflavor_t authflavour,
+	       int noresvport)
+{
+	struct nfs_client *clp, *new = NULL;
+	int error;
+
+	dprintk("--> nfs_get_client(%s,v%u)\n",
+		cl_init->hostname ?: "", cl_init->rpc_ops->version);
+
+	/* see if the client already exists */
+	do {
+		spin_lock(&nfs_client_lock);
+
+		clp = nfs_match_client(cl_init);
+		if (clp)
+			goto found_client;
+		if (new)
+			goto install_client;
+
+		spin_unlock(&nfs_client_lock);
+
+		new = nfs_alloc_client(cl_init);
+	} while (!IS_ERR(new));
+
+	dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new));
+	return new;
+
+	/* install a new client and return with it unready */
+install_client:
+	clp = new;
+	list_add(&clp->cl_share_link, &nfs_client_list);
+	spin_unlock(&nfs_client_lock);
+
+	error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
+					      authflavour, noresvport);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
+	dprintk("--> nfs_get_client() = %p [new]\n", clp);
+	return clp;
+
+	/* found an existing client
+	 * - make sure it's ready before returning
+	 */
+found_client:
+	/*
+	 * We lost the race: "new" is about to be discarded.  It was given
+	 * a callback identifier by nfs_alloc_client() and nfs_free_client()
+	 * does not remove it, so drop it here while the lock is still held.
+	 * Otherwise the idr would keep pointing at freed memory.
+	 */
+	if (new)
+		nfs_cb_idr_remove_locked(new);
+	spin_unlock(&nfs_client_lock);
+
+	if (new)
+		nfs_free_client(new);
+
+	error = wait_event_killable(nfs_client_active_wq,
+				    clp->cl_cons_state < NFS_CS_INITING);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(-ERESTARTSYS);
+	}
+
+	if (clp->cl_cons_state < NFS_CS_READY) {
+		error = clp->cl_cons_state;
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
+
+	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
+
+	dprintk("--> nfs_get_client() = %p [share]\n", clp);
+	return clp;
+}
+
+/*
+ * Mark a server as ready or failed
+ *
+ * Stores @state (NFS_CS_READY, or a negative error) in cl_cons_state and
+ * wakes every task sleeping on nfs_client_active_wq, which is what
+ * nfs_get_client() waits on for a client still being initialised.
+ */
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
+{
+ clp->cl_cons_state = state;
+ wake_up_all(&nfs_client_active_wq);
+}
+
+/*
+ * With sessions, the client is not marked ready until after a
+ * successful EXCHANGE_ID and CREATE_SESSION.
+ *
+ * Map cl_cons_state errors to EPROTONOSUPPORT to indicate that
+ * other versions of NFS can be tried.
+ *
+ * Clients without a session always report 0.
+ */
+int nfs4_check_client_ready(struct nfs_client *clp)
+{
+	if (nfs4_has_session(clp) && clp->cl_cons_state < NFS_CS_READY)
+		return -EPROTONOSUPPORT;
+
+	return 0;
+}
+
+/*
+ * Initialise the timeout values for a connection
+ *
+ * @timeo is converted to jiffies as timeo * HZ / 10 and @retrans is the
+ * retry count; a value of zero selects the per-transport default.
+ * TCP and RDMA use a linear backoff capped at NFS_MAX_TCP_TIMEOUT, UDP
+ * uses an exponential backoff capped at NFS_MAX_UDP_TIMEOUT.  Any other
+ * @proto is a BUG().
+ */
+static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+				    unsigned int timeo, unsigned int retrans)
+{
+	to->to_initval = timeo * HZ / 10;
+	to->to_retries = retrans;
+
+	if (proto == XPRT_TRANSPORT_TCP || proto == XPRT_TRANSPORT_RDMA) {
+		if (!to->to_retries)
+			to->to_retries = NFS_DEF_TCP_RETRANS;
+		if (!to->to_initval)
+			to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
+		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+			to->to_initval = NFS_MAX_TCP_TIMEOUT;
+
+		/* Linear backoff: grow by the initial value on each retry */
+		to->to_increment = to->to_initval;
+		to->to_maxval = to->to_initval +
+				(to->to_increment * to->to_retries);
+		if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+			to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+		if (to->to_maxval < to->to_initval)
+			to->to_maxval = to->to_initval;
+		to->to_exponential = 0;
+	} else if (proto == XPRT_TRANSPORT_UDP) {
+		if (!to->to_retries)
+			to->to_retries = NFS_DEF_UDP_RETRANS;
+		if (!to->to_initval)
+			to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
+		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+			to->to_initval = NFS_MAX_UDP_TIMEOUT;
+		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+		to->to_exponential = 1;
+	} else {
+		BUG();
+	}
+}
+
+/*
+ * Create an RPC client handle
+ *
+ * Builds the RPC client for @clp from its address, hostname, transport
+ * and NFS version, and stores it in clp->cl_rpcclient.  If the client
+ * already has a valid handle nothing is done.
+ *
+ * @discrtry and @noresvport, when non-zero, request the
+ * RPC_CLNT_CREATE_DISCRTRY and RPC_CLNT_CREATE_NONPRIVPORT flags.
+ *
+ * Returns 0 on success or the negative error from rpc_create().
+ */
+static int nfs_create_rpc_client(struct nfs_client *clp,
+				 const struct rpc_timeout *timeparms,
+				 rpc_authflavor_t flavor,
+				 int discrtry, int noresvport)
+{
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= clp->cl_proto,
+		.address	= (struct sockaddr *)&clp->cl_addr,
+		.addrsize	= clp->cl_addrlen,
+		.timeout	= timeparms,
+		.servername	= clp->cl_hostname,
+		.program	= &nfs_program,
+		.version	= clp->rpc_ops->version,
+		.authflavor	= flavor,
+	};
+	struct rpc_clnt *rpc;
+
+	/* Already set up by an earlier call */
+	if (!IS_ERR(clp->cl_rpcclient))
+		return 0;
+
+	if (discrtry)
+		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+	if (noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	rpc = rpc_create(&args);
+	if (IS_ERR(rpc)) {
+		dprintk("%s: cannot create RPC client. Error = %ld\n",
+			__func__, PTR_ERR(rpc));
+		return PTR_ERR(rpc);
+	}
+
+	clp->cl_rpcclient = rpc;
+	return 0;
+}
+
+/*
+ * Version 2 or 3 client destruction
+ *
+ * Releases the lockd host unless both flock and fcntl locking are
+ * local, mirroring the condition under which nfs_start_lockd() skips
+ * creating one.
+ */
+static void nfs_destroy_server(struct nfs_server *server)
+{
+	if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+	    (server->flags & NFS_MOUNT_LOCAL_FCNTL))
+		return;
+
+	nlmclnt_done(server->nlm_host);
+}
+
+/*
+ * Version 2 or 3 lockd setup
+ *
+ * Nothing is started for NFS versions above 3, nor when both flock and
+ * fcntl locks are handled locally.  Otherwise an NLM host is created
+ * (over UDP for a UDP mount, TCP for anything else), stored in
+ * server->nlm_host, and nfs_destroy_server() is installed as the
+ * server's destroy hook.
+ *
+ * Returns 0 on success or the negative error from nlmclnt_init().
+ */
+static int nfs_start_lockd(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nlmclnt_initdata nlm_init = {
+		.hostname	= clp->cl_hostname,
+		.address	= (struct sockaddr *)&clp->cl_addr,
+		.addrlen	= clp->cl_addrlen,
+		.nfs_version	= clp->rpc_ops->version,
+		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
+					1 : 0,
+	};
+	struct nlm_host *nlm;
+
+	if (nlm_init.nfs_version > 3)
+		return 0;
+	if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) &&
+	    (server->flags & NFS_MOUNT_LOCAL_FCNTL))
+		return 0;
+
+	if (clp->cl_proto == XPRT_TRANSPORT_UDP)
+		nlm_init.protocol = IPPROTO_UDP;
+	else
+		nlm_init.protocol = IPPROTO_TCP;
+
+	nlm = nlmclnt_init(&nlm_init);
+	if (IS_ERR(nlm))
+		return PTR_ERR(nlm);
+
+	server->nlm_host = nlm;
+	server->destroy = nfs_destroy_server;
+	return 0;
+}
+
+/*
+ * Initialise an NFSv3 ACL client connection
+ *
+ * For an NFSv3 mount without the "noacl" flag, bind the NFSACL program
+ * to the server's RPC client and advertise NFS_CAP_ACLS.  In every other
+ * case, including a failed bind, NFS_CAP_ACLS is cleared.
+ */
+#ifdef CONFIG_NFS_V3_ACL
+static void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	if (server->nfs_client->rpc_ops->version == 3 &&
+	    !(server->flags & NFS_MOUNT_NOACL)) {
+		server->client_acl = rpc_bind_new_program(server->client,
+							  &nfsacl_program, 3);
+		if (!IS_ERR(server->client_acl)) {
+			/* No errors! Assume that Sun nfsacls are supported */
+			server->caps |= NFS_CAP_ACLS;
+			return;
+		}
+	}
+
+	server->caps &= ~NFS_CAP_ACLS;
+}
+#else
+/* ACL support not compiled in: clear both the mount flag and capability */
+static inline void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	server->caps &= ~NFS_CAP_ACLS;
+	server->flags &= ~NFS_MOUNT_NOACL;
+}
+#endif
+
+/*
+ * Create a general RPC client
+ *
+ * Clones the shared client's RPC handle into server->client, gives the
+ * clone its own copy of the timeout parameters, switches it to
+ * @pseudoflavour if that differs from the shared client's flavour, and
+ * sets soft-retry behaviour from the NFS_MOUNT_SOFT flag.
+ *
+ * Returns 0 on success or a negative error.  On failure server->client
+ * is left as set here (an ERR_PTR if the clone failed, the clone itself
+ * if only the auth setup failed).
+ */
+static int nfs_init_server_rpcclient(struct nfs_server *server,
+				     const struct rpc_timeout *timeo,
+				     rpc_authflavor_t pseudoflavour)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct rpc_clnt *clone;
+
+	clone = rpc_clone_client(clp->cl_rpcclient);
+	server->client = clone;
+	if (IS_ERR(clone)) {
+		dprintk("%s: couldn't create rpc_client!\n", __func__);
+		return PTR_ERR(clone);
+	}
+
+	memcpy(&clone->cl_timeout_default, timeo,
+	       sizeof(clone->cl_timeout_default));
+	clone->cl_timeout = &clone->cl_timeout_default;
+
+	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
+		struct rpc_auth *auth = rpcauth_create(pseudoflavour, clone);
+
+		if (IS_ERR(auth)) {
+			dprintk("%s: couldn't create credcache!\n", __func__);
+			return PTR_ERR(auth);
+		}
+	}
+
+	clone->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
+
+	return 0;
+}
+
+/*
+ * Initialise an NFS2 or NFS3 client
+ *
+ * Creates the client's RPC handle and marks the client ready, or marks
+ * it failed with the error code.  A client that is already
+ * NFS_CS_READY is left untouched.  @ip_addr and @authflavour are not
+ * used here.
+ *
+ * Returns 0 on success or a negative error.
+ */
+int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
+		    const char *ip_addr, rpc_authflavor_t authflavour,
+		    int noresvport)
+{
+	int status;
+
+	if (clp->cl_cons_state == NFS_CS_READY) {
+		/* the client is already initialised */
+		dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
+		return 0;
+	}
+
+	/*
+	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
+	 * - RFC 2623, sec 2.3.2
+	 */
+	status = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
+				       0, noresvport);
+	if (status < 0) {
+		/* Publish the failure so that waiters stop waiting */
+		nfs_mark_client_ready(clp, status);
+		dprintk("<-- nfs_init_client() = xerror %d\n", status);
+		return status;
+	}
+
+	nfs_mark_client_ready(clp, NFS_CS_READY);
+	return 0;
+}
+
+/*
+ * Create a version 2 or 3 client
+ *
+ * Finds or creates the shared nfs_client for the server named in @data,
+ * attaches it to @server, and fills in @server from the parsed mount
+ * options: flags, capabilities, I/O sizes, attribute cache timeouts,
+ * lockd, the per-mount RPC client, mountd details and the ACL client.
+ *
+ * Returns 0 on success or a negative error.  On failure after the client
+ * was obtained, the client reference is dropped and server->nfs_client
+ * is reset to NULL.
+ */
+static int nfs_init_server(struct nfs_server *server,
+ const struct nfs_parsed_mount_data *data)
+{
+ struct nfs_client_initdata cl_init = {
+ .hostname = data->nfs_server.hostname,
+ .addr = (const struct sockaddr *)&data->nfs_server.address,
+ .addrlen = data->nfs_server.addrlen,
+ .rpc_ops = &nfs_v2_clientops,
+ .proto = data->nfs_server.protocol,
+ };
+ struct rpc_timeout timeparms;
+ struct nfs_client *clp;
+ int error;
+
+ dprintk("--> nfs_init_server()\n");
+
+ /* v2 operations are the default; switch to v3 when asked and built in */
+#ifdef CONFIG_NFS_V3
+ if (data->version == 3)
+ cl_init.rpc_ops = &nfs_v3_clientops;
+#endif
+
+ nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+ data->timeo, data->retrans);
+
+ /* Allocate or find a client reference we can use */
+ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
+ data->flags & NFS_MOUNT_NORESVPORT);
+ if (IS_ERR(clp)) {
+ dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
+ return PTR_ERR(clp);
+ }
+
+ server->nfs_client = clp;
+
+ /* Initialise the client representation from the mount data */
+ server->flags = data->flags;
+ server->options = data->options;
+ server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+ NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+
+ /* A zero rsize/wsize is left for nfs_server_set_fsinfo() to fill in */
+ if (data->rsize)
+ server->rsize = nfs_block_size(data->rsize, NULL);
+ if (data->wsize)
+ server->wsize = nfs_block_size(data->wsize, NULL);
+
+ /* Attribute cache timeouts are multiplied by HZ before being stored */
+ server->acregmin = data->acregmin * HZ;
+ server->acregmax = data->acregmax * HZ;
+ server->acdirmin = data->acdirmin * HZ;
+ server->acdirmax = data->acdirmax * HZ;
+
+ /* Start lockd here, before we might error out */
+ error = nfs_start_lockd(server);
+ if (error < 0)
+ goto error;
+
+ server->port = data->nfs_server.port;
+
+ error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+ if (error < 0)
+ goto error;
+
+ /* Preserve the values of mount_server-related mount options */
+ if (data->mount_server.addrlen) {
+ memcpy(&server->mountd_address, &data->mount_server.address,
+ data->mount_server.addrlen);
+ server->mountd_addrlen = data->mount_server.addrlen;
+ }
+ server->mountd_version = data->mount_server.version;
+ server->mountd_port = data->mount_server.port;
+ server->mountd_protocol = data->mount_server.protocol;
+
+ server->namelen = data->namlen;
+ /* Create a client RPC handle for the NFSv3 ACL management interface */
+ nfs_init_server_aclclient(server);
+ dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
+ return 0;
+
+error:
+ /* Detach the client first so the server no longer points at it */
+ server->nfs_client = NULL;
+ nfs_put_client(clp);
+ dprintk("<-- nfs_init_server() = xerror %d\n", error);
+ return error;
+}
+
+/*
+ * Load up the server record from information gained in an fsinfo record
+ *
+ * Derives the effective read, write and readdir transfer sizes from the
+ * mount options, the server's preferred/maximum sizes in @fsinfo, the
+ * RPC transport's maximum payload and the NFS_MAX_* compile-time limits,
+ * then sets up readahead, the pNFS layout driver, attribute caching and
+ * the socket buffer sizes accordingly.
+ */
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+ struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
+{
+ unsigned long max_rpc_payload;
+
+ /* Work out a lot of parameters */
+ /* rsize/wsize still zero here: fall back to the server's preference */
+ if (server->rsize == 0)
+ server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
+ if (server->wsize == 0)
+ server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
+
+ /* Honour the server's maximum, but only if it is at least 512 */
+ if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
+ server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
+ if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
+ server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
+
+ max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+ if (server->rsize > max_rpc_payload)
+ server->rsize = max_rpc_payload;
+ if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+ server->rsize = NFS_MAX_FILE_IO_SIZE;
+ /* Number of pages needed to hold rsize bytes, rounded up */
+ server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ server->backing_dev_info.name = "nfs";
+ server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+
+ if (server->wsize > max_rpc_payload)
+ server->wsize = max_rpc_payload;
+ if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+ server->wsize = NFS_MAX_FILE_IO_SIZE;
+ /* Number of pages needed to hold wsize bytes, rounded up */
+ server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->pnfs_blksize = fsinfo->blksize;
+ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
+
+ server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
+
+ /* Readdir size: bounded by the readdir page limit and by rsize */
+ server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
+ if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
+ server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
+ if (server->dtsize > server->rsize)
+ server->dtsize = server->rsize;
+
+ /* "noac": zero all attribute cache timeouts */
+ if (server->flags & NFS_MOUNT_NOAC) {
+ server->acregmin = server->acregmax = 0;
+ server->acdirmin = server->acdirmax = 0;
+ }
+
+ server->maxfilesize = fsinfo->maxfilesize;
+
+ server->time_delta = fsinfo->time_delta;
+
+ /* We're airborne; set the socket buffer sizes */
+ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+}
+
+/*
+ * Probe filesystem information, including the FSID on v2/v3
+ *
+ * Runs the optional ->set_capabilities() hook, fetches the fsinfo
+ * record for @mntfh and applies it to @server, and, when no name length
+ * limit has been set yet, asks the server for one via ->pathconf().
+ * @fattr receives the attributes returned alongside those calls.
+ *
+ * Returns 0 on success or the negative error from ->set_capabilities()
+ * or ->fsinfo().  A failing ->pathconf() is not treated as an error.
+ */
+static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+{
+ struct nfs_fsinfo fsinfo;
+ struct nfs_client *clp = server->nfs_client;
+ int error;
+
+ dprintk("--> nfs_probe_fsinfo()\n");
+
+ if (clp->rpc_ops->set_capabilities != NULL) {
+ error = clp->rpc_ops->set_capabilities(server, mntfh);
+ if (error < 0)
+ goto out_error;
+ }
+
+ fsinfo.fattr = fattr;
+ /* Default to "no layout type" before the fsinfo call fills it in */
+ fsinfo.layouttype = 0;
+ error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
+ if (error < 0)
+ goto out_error;
+
+ nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+
+ /* Get some general file system info */
+ if (server->namelen == 0) {
+ struct nfs_pathconf pathinfo;
+
+ pathinfo.fattr = fattr;
+ nfs_fattr_init(fattr);
+
+ if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
+ server->namelen = pathinfo.max_namelen;
+ }
+
+ dprintk("<-- nfs_probe_fsinfo() = 0\n");
+ return 0;
+
+out_error:
+ dprintk("nfs_probe_fsinfo: error = %d\n", -error);
+ return error;
+}
+
+/*
+ * Copy useful information when duplicating a server record
+ *
+ * Carries the mount flags and options, capabilities, transfer sizes and
+ * attribute cache timeouts over from @source to @target.
+ */
+static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
+{
+	/* Mount flags, options and capabilities */
+	target->flags = source->flags;
+	target->options = source->options;
+	target->caps = source->caps;
+
+	/* Transfer sizes */
+	target->rsize = source->rsize;
+	target->wsize = source->wsize;
+
+	/* Attribute cache timeouts */
+	target->acregmin = source->acregmin;
+	target->acregmax = source->acregmax;
+	target->acdirmin = source->acdirmin;
+	target->acdirmax = source->acdirmax;
+}
+
+/*
+ * Publish a server record: add it to its client's cl_superblocks list
+ * (an RCU list) and to the global volume list, and clear the client's
+ * NFS_CS_STOP_RENEW bit.  All three happen under nfs_client_lock.
+ */
+static void nfs_server_insert_lists(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ spin_lock(&nfs_client_lock);
+ list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
+ list_add_tail(&server->master_link, &nfs_volume_list);
+ clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+ spin_unlock(&nfs_client_lock);
+
+}
+
+/*
+ * Unpublish a server record: unlink it from its client's cl_superblocks
+ * list and from the global volume list.  If that leaves the client with
+ * no superblocks, NFS_CS_STOP_RENEW is set on it.  The client pointer
+ * may be NULL, in which case only the unlinking is done.
+ */
+static void nfs_server_remove_lists(struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ spin_lock(&nfs_client_lock);
+ list_del_rcu(&server->client_link);
+ if (clp && list_empty(&clp->cl_superblocks))
+ set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
+ list_del(&server->master_link);
+ spin_unlock(&nfs_client_lock);
+
+ /* Wait for an RCU grace period after the list_del_rcu() above */
+ synchronize_rcu();
+}
+
+/*
+ * Allocate and initialise a server record
+ *
+ * The record is zeroed, its RPC client pointers are set to
+ * ERR_PTR(-EINVAL), its list heads are initialised, and its I/O
+ * statistics and backing device info are set up.
+ *
+ * Returns the new record, or NULL if any allocation or bdi_init() fails.
+ */
+static struct nfs_server *nfs_alloc_server(void)
+{
+	struct nfs_server *srv;
+
+	srv = kzalloc(sizeof(*srv), GFP_KERNEL);
+	if (srv == NULL)
+		return NULL;
+
+	srv->client = srv->client_acl = ERR_PTR(-EINVAL);
+
+	/* Zero out the NFS state stuff */
+	INIT_LIST_HEAD(&srv->client_link);
+	INIT_LIST_HEAD(&srv->master_link);
+	INIT_LIST_HEAD(&srv->delegations);
+	INIT_LIST_HEAD(&srv->layouts);
+	INIT_LIST_HEAD(&srv->state_owners_lru);
+
+	atomic_set(&srv->active, 0);
+
+	srv->io_stats = nfs_alloc_iostats();
+	if (srv->io_stats == NULL)
+		goto out_free_server;
+
+	if (bdi_init(&srv->backing_dev_info))
+		goto out_free_iostats;
+
+	pnfs_init_server(srv);
+
+	return srv;
+
+out_free_iostats:
+	nfs_free_iostats(srv->io_stats);
+out_free_server:
+	kfree(srv);
+	return NULL;
+}
+
+/*
+ * Free up a server record
+ *
+ * Unlinks @server from the client and volume lists, runs its destroy
+ * hook if one is set, shuts down its RPC clients, drops its reference
+ * on the shared nfs_client and releases the record itself.
+ */
+void nfs_free_server(struct nfs_server *server)
+{
+ dprintk("--> nfs_free_server()\n");
+
+ nfs_server_remove_lists(server);
+ unset_pnfs_layoutdriver(server);
+
+ if (server->destroy != NULL)
+ server->destroy(server);
+
+ /* Both pointers are ERR_PTR values until the clients are created */
+ if (!IS_ERR(server->client_acl))
+ rpc_shutdown_client(server->client_acl);
+ if (!IS_ERR(server->client))
+ rpc_shutdown_client(server->client);
+
+ /* nfs_put_client() accepts NULL, so a never-attached client is fine */
+ nfs_put_client(server->nfs_client);
+
+ nfs_free_iostats(server->io_stats);
+ bdi_destroy(&server->backing_dev_info);
+ kfree(server);
+ nfs_release_automount_timer();
+ dprintk("<-- nfs_free_server()\n");
+}
+
+/*
+ * Create a version 2 or 3 volume record
+ * - keyed on server and FSID
+ *
+ * Allocates a server record, attaches it to a shared client built from
+ * @data, probes the filesystem behind @mntfh, clamps the name length to
+ * the protocol maximum, records the FSID and publishes the record on
+ * the client and volume lists.
+ *
+ * Returns the new server record or an ERR_PTR() on failure.
+ */
+struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
+ struct nfs_fh *mntfh)
+{
+ struct nfs_server *server;
+ struct nfs_fattr *fattr;
+ int error;
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ error = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto error;
+
+ /* Get a client representation */
+ error = nfs_init_server(server, data);
+ if (error < 0)
+ goto error;
+
+ BUG_ON(!server->nfs_client);
+ BUG_ON(!server->nfs_client->rpc_ops);
+ BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+ /* Probe the root fh to retrieve its FSID */
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+ if (error < 0)
+ goto error;
+ /* Unknown or oversized name lengths fall back to the protocol maximum */
+ if (server->nfs_client->rpc_ops->version == 3) {
+ if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+ server->namelen = NFS3_MAXNAMLEN;
+ if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+ server->caps |= NFS_CAP_READDIRPLUS;
+ } else {
+ if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+ server->namelen = NFS2_MAXNAMLEN;
+ }
+
+ /* Fetch the attributes explicitly if the probe did not return them */
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
+ if (error < 0) {
+ dprintk("nfs_create_server: getattr error = %d\n", -error);
+ goto error;
+ }
+ }
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+ nfs_free_fattr(fattr);
+ return server;
+
+error:
+ /* fattr may still be NULL here if its allocation was what failed */
+ nfs_free_fattr(fattr);
+ nfs_free_server(server);
+ return ERR_PTR(error);
+}
+
+#ifdef CONFIG_NFS_V4
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by IP address, protocol version, and minorversion
+ *
+ * Called from the pg_authenticate method. The callback identifier
+ * is not used as it has not been decoded.
+ *
+ * Returns the first matching client with its reference count raised,
+ * or NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_no_ident(const struct sockaddr *addr)
+{
+	struct nfs_client *pos, *found = NULL;
+
+	spin_lock(&nfs_client_lock);
+	list_for_each_entry(pos, &nfs_client_list, cl_share_link) {
+		if (!nfs4_cb_match_client(addr, pos, 0))
+			continue;
+
+		atomic_inc(&pos->cl_count);
+		found = pos;
+		break;
+	}
+	spin_unlock(&nfs_client_lock);
+
+	return found;
+}
+
+/*
+ * NFSv4.0 callback thread helper
+ *
+ * Find a client by callback identifier
+ *
+ * Looks @cb_ident up in cb_ident_idr under nfs_client_lock.  Returns
+ * the client with its reference count raised, or NULL if the identifier
+ * is not registered.
+ */
+struct nfs_client *
+nfs4_find_client_ident(int cb_ident)
+{
+ struct nfs_client *clp;
+
+ spin_lock(&nfs_client_lock);
+ clp = idr_find(&cb_ident_idr, cb_ident);
+ if (clp)
+ atomic_inc(&clp->cl_count);
+ spin_unlock(&nfs_client_lock);
+ return clp;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * NFSv4.1 callback thread helper
+ * For CB_COMPOUND calls, find a client by IP address, protocol version,
+ * minorversion, and sessionID
+ *
+ * Returns the first match with cl_count raised, or NULL if no such client
+ */
+struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *addr,
+ struct nfs4_sessionid *sid)
+{
+ struct nfs_client *clp;
+
+ spin_lock(&nfs_client_lock); /* held across the whole list walk */
+ list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+ if (nfs4_cb_match_client(addr, clp, 1) == false) /* 1: presumably the minor version to match - confirm */
+ continue;
+
+ if (!nfs4_has_session(clp)) /* cl_session is dereferenced below */
+ continue;
+
+ /* Match the full session ID, byte for byte */
+ if (memcmp(clp->cl_session->sess_id.data,
+ sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
+ continue;
+
+ atomic_inc(&clp->cl_count); /* reference taken before the lock is dropped */
+ spin_unlock(&nfs_client_lock);
+ return clp;
+ }
+ spin_unlock(&nfs_client_lock);
+ return NULL;
+}
+
+#else /* CONFIG_NFS_V4_1 */
+
+struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *addr,
+ struct nfs4_sessionid *sid)
+{
+ return NULL;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Initialize the NFS4 callback service
+ */
+static int nfs4_init_callback(struct nfs_client *clp)
+{
+ int error;
+
+ if (clp->rpc_ops->version == 4) {
+ if (nfs4_has_session(clp)) {
+ error = xprt_setup_backchannel(
+ clp->cl_rpcclient->cl_xprt,
+ NFS41_BC_MIN_CALLBACKS);
+ if (error < 0)
+ return error;
+ }
+
+ error = nfs_callback_up(clp->cl_mvops->minor_version,
+ clp->cl_rpcclient->cl_xprt);
+ if (error < 0) {
+ dprintk("%s: failed to start callback. Error = %d\n",
+ __func__, error);
+ return error;
+ }
+ __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+ }
+ return 0;
+}
+
+/*
+ * Initialize the minor version specific parts of an NFS4 client record
+ */
+static int nfs4_init_client_minor_version(struct nfs_client *clp)
+{
+#if defined(CONFIG_NFS_V4_1)
+ if (clp->cl_mvops->minor_version) {
+ struct nfs4_session *session = NULL;
+ /*
+ * Create the session and mark it expired.
+ * When a SEQUENCE operation encounters the expired session
+ * it will do session recovery to initialize it.
+ */
+ session = nfs4_alloc_session(clp);
+ if (!session)
+ return -ENOMEM;
+
+ clp->cl_session = session;
+ /*
+ * The create session reply races with the server back
+ * channel probe. Mark the client NFS_CS_SESSION_INITING
+ * so that the client back channel can find the
+ * nfs_client struct
+ */
+ clp->cl_cons_state = NFS_CS_SESSION_INITING;
+ }
+#endif /* CONFIG_NFS_V4_1 */
+
+ return nfs4_init_callback(clp);
+}
+
+/*
+ * Initialise an NFS4 client record
+ */
+int nfs4_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport)
+{
+ int error;
+
+ if (clp->cl_cons_state == NFS_CS_READY) {
+ /* the client is initialised already */
+ dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
+ return 0;
+ }
+
+ /* Check NFS protocol revision and initialize RPC op vector */
+ clp->rpc_ops = &nfs_v4_clientops;
+
+ error = nfs_create_rpc_client(clp, timeparms, authflavour,
+ 1, noresvport);
+ if (error < 0)
+ goto error;
+ strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+
+ error = nfs_idmap_new(clp);
+ if (error < 0) {
+ dprintk("%s: failed to create idmapper. Error = %d\n",
+ __func__, error);
+ goto error;
+ }
+ __set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
+
+ error = nfs4_init_client_minor_version(clp);
+ if (error < 0)
+ goto error;
+
+ if (!nfs4_has_session(clp))
+ nfs_mark_client_ready(clp, NFS_CS_READY);
+ return 0;
+
+error:
+ nfs_mark_client_ready(clp, error);
+ dprintk("<-- nfs4_init_client() = xerror %d\n", error);
+ return error;
+}
+
+/*
+ * Set up an NFS4 client
+ */
+static int nfs4_set_client(struct nfs_server *server,
+ const char *hostname,
+ const struct sockaddr *addr,
+ const size_t addrlen,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int proto, const struct rpc_timeout *timeparms,
+ u32 minorversion)
+{
+ struct nfs_client_initdata cl_init = {
+ .hostname = hostname,
+ .addr = addr,
+ .addrlen = addrlen,
+ .rpc_ops = &nfs_v4_clientops,
+ .proto = proto,
+ .minorversion = minorversion,
+ };
+ struct nfs_client *clp;
+ int error;
+
+ dprintk("--> nfs4_set_client()\n");
+
+ /* Allocate or find a client reference we can use */
+ clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
+ server->flags & NFS_MOUNT_NORESVPORT);
+ if (IS_ERR(clp)) {
+ error = PTR_ERR(clp);
+ goto error;
+ }
+
+ /*
+ * Query for the lease time on clientid setup or renewal
+ *
+ * Note that this will be set on nfs_clients that were created
+ * only for the DS role and did not set this bit, but now will
+ * serve a dual role.
+ */
+ set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
+
+ server->nfs_client = clp;
+ dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
+ return 0;
+error:
+ dprintk("<-- nfs4_set_client() = xerror %d\n", error);
+ return error;
+}
+
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto)
+{
+ struct nfs_client_initdata cl_init = {
+ .addr = ds_addr,
+ .addrlen = ds_addrlen,
+ .rpc_ops = &nfs_v4_clientops,
+ .proto = ds_proto,
+ .minorversion = mds_clp->cl_minorversion, /* DS uses the MDS minor version */
+ };
+ struct rpc_timeout ds_timeout = {
+ .to_initval = 15 * HZ,
+ .to_maxval = 15 * HZ,
+ .to_retries = 1,
+ .to_exponential = 1,
+ };
+ struct nfs_client *clp;
+
+ /*
+ * Set an authflavor equal to the MDS value. Use the MDS nfs_client
+ * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+ * (section 13.1 RFC 5661).
+ */
+ clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+ mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); /* 0 fills the slot where nfs4_set_client passes its NORESVPORT flag */
+
+ dprintk("<-- %s %p\n", __func__, clp);
+ return clp; /* returned unchecked: whatever nfs_get_client produced, ERR_PTR included */
+}
+EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
+
+/*
+ * Session has been established, and the client marked ready.
+ * Clamp the mount rsize and wsize to the negotiated fore channel
+ * attributes; a no-op without CONFIG_NFS_V4_1 or without a session.
+ */
+static void nfs4_session_set_rwsize(struct nfs_server *server)
+{
+#ifdef CONFIG_NFS_V4_1
+ struct nfs4_session *sess;
+ u32 server_resp_sz;
+ u32 server_rqst_sz;
+
+ if (!nfs4_has_session(server->nfs_client))
+ return;
+ sess = server->nfs_client->cl_session;
+ server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; /* NOTE(review): stored in a u32, so this wraps if max_resp_sz is below the overhead - confirm a lower bound is enforced elsewhere */
+ server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; /* NOTE(review): same wrap concern as above */
+
+ if (server->rsize > server_resp_sz) /* only ever shrinks rsize */
+ server->rsize = server_resp_sz;
+ if (server->wsize > server_rqst_sz) /* only ever shrinks wsize */
+ server->wsize = server_rqst_sz;
+#endif /* CONFIG_NFS_V4_1 */
+}
+
+static int nfs4_server_common_setup(struct nfs_server *server,
+ struct nfs_fh *mntfh)
+{
+ struct nfs_fattr *fattr;
+ int error;
+
+ BUG_ON(!server->nfs_client);
+ BUG_ON(!server->nfs_client->rpc_ops);
+ BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+ /* data servers support only a subset of NFSv4.1 */
+ if (is_ds_only_client(server->nfs_client))
+ return -EPROTONOSUPPORT;
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* We must ensure the session is initialised first */
+ error = nfs4_init_session(server);
+ if (error < 0)
+ goto out;
+
+ /* Probe the root fh to retrieve its FSID and filehandle */
+ error = nfs4_get_rootfh(server, mntfh);
+ if (error < 0)
+ goto out;
+
+ dprintk("Server FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+ dprintk("Mount FH: %d\n", mntfh->size);
+
+ nfs4_session_set_rwsize(server);
+
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
+ if (error < 0)
+ goto out;
+
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+ server->destroy = nfs4_destroy_server;
+out:
+ nfs_free_fattr(fattr);
+ return error;
+}
+
+/*
+ * Initialise a version 4 volume record from the parsed mount data
+ */
+static int nfs4_init_server(struct nfs_server *server,
+ const struct nfs_parsed_mount_data *data)
+{
+ struct rpc_timeout timeparms;
+ int error;
+
+ dprintk("--> nfs4_init_server()\n");
+
+ nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+ data->timeo, data->retrans);
+
+ /* Initialise the client representation from the mount data */
+ server->flags = data->flags;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
+ if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+ server->caps |= NFS_CAP_READDIRPLUS;
+ server->options = data->options;
+
+ /* Get a client record */
+ error = nfs4_set_client(server,
+ data->nfs_server.hostname,
+ (const struct sockaddr *)&data->nfs_server.address,
+ data->nfs_server.addrlen,
+ data->client_address,
+ data->auth_flavors[0],
+ data->nfs_server.protocol,
+ &timeparms,
+ data->minorversion);
+ if (error < 0)
+ goto error;
+
+ /*
+ * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+ * authentication.
+ */
+ if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
+ server->caps |= NFS_CAP_UIDGID_NOMAP;
+
+ if (data->rsize)
+ server->rsize = nfs_block_size(data->rsize, NULL);
+ if (data->wsize)
+ server->wsize = nfs_block_size(data->wsize, NULL);
+
+ server->acregmin = data->acregmin * HZ;
+ server->acregmax = data->acregmax * HZ;
+ server->acdirmin = data->acdirmin * HZ;
+ server->acdirmax = data->acdirmax * HZ;
+
+ server->port = data->nfs_server.port;
+
+ error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+
+error:
+ /* Done */
+ dprintk("<-- nfs4_init_server() = %d\n", error);
+ return error;
+}
+
+/*
+ * Create a version 4 volume record
+ * - keyed on server and FSID
+ */
+struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
+ struct nfs_fh *mntfh)
+{
+ struct nfs_server *server;
+ int error;
+
+ dprintk("--> nfs4_create_server()\n");
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ /* set up the general RPC client */
+ error = nfs4_init_server(server, data);
+ if (error < 0)
+ goto error;
+
+ error = nfs4_server_common_setup(server, mntfh);
+ if (error < 0)
+ goto error;
+
+ dprintk("<-- nfs4_create_server() = %p\n", server);
+ return server;
+
+error:
+ nfs_free_server(server);
+ dprintk("<-- nfs4_create_server() = error %d\n", error);
+ return ERR_PTR(error);
+}
+
+/*
+ * Create an NFS4 referral server record; returns it, or an ERR_PTR on failure
+ */
+struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
+ struct nfs_fh *mntfh)
+{
+ struct nfs_client *parent_client;
+ struct nfs_server *server, *parent_server;
+ int error;
+
+ dprintk("--> nfs4_create_referral_server()\n");
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ parent_server = NFS_SB(data->sb);
+ parent_client = parent_server->nfs_client;
+
+ /* Initialise the client representation from the parent server */
+ nfs_server_copy_userdata(server, parent_server);
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
+
+ /* Get a client representation, reusing the parent's client address,
+ * transport protocol, timeout and minor version. */
+ error = nfs4_set_client(server, data->hostname,
+ data->addr,
+ data->addrlen,
+ parent_client->cl_ipaddr,
+ data->authflavor,
+ parent_server->client->cl_xprt->prot,
+ parent_server->client->cl_timeout,
+ parent_client->cl_mvops->minor_version);
+ if (error < 0)
+ goto error;
+
+ error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
+ if (error < 0)
+ goto error;
+
+ error = nfs4_server_common_setup(server, mntfh);
+ if (error < 0)
+ goto error;
+
+ dprintk("<-- nfs4_create_referral_server() = %p\n", server); /* trace now names this function, matching the entry and error traces */
+ return server;
+
+error:
+ nfs_free_server(server); /* single cleanup point for every failure after allocation */
+ dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
+ return ERR_PTR(error);
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Clone an NFS2, NFS3 or NFS4 server record
+ */
+struct nfs_server *nfs_clone_server(struct nfs_server *source,
+ struct nfs_fh *fh,
+ struct nfs_fattr *fattr)
+{
+ struct nfs_server *server;
+ struct nfs_fattr *fattr_fsinfo;
+ int error;
+
+ dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
+ (unsigned long long) fattr->fsid.major,
+ (unsigned long long) fattr->fsid.minor);
+
+ server = nfs_alloc_server();
+ if (!server)
+ return ERR_PTR(-ENOMEM);
+
+ error = -ENOMEM;
+ fattr_fsinfo = nfs_alloc_fattr();
+ if (fattr_fsinfo == NULL)
+ goto out_free_server;
+
+ /* Copy data from the source */
+ server->nfs_client = source->nfs_client;
+ server->destroy = source->destroy;
+ atomic_inc(&server->nfs_client->cl_count);
+ nfs_server_copy_userdata(server, source);
+
+ server->fsid = fattr->fsid;
+
+ error = nfs_init_server_rpcclient(server,
+ source->client->cl_timeout,
+ source->client->cl_auth->au_flavor);
+ if (error < 0)
+ goto out_free_server;
+ if (!IS_ERR(source->client_acl))
+ nfs_init_server_aclclient(server);
+
+ /* probe the filesystem info for this server filesystem */
+ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
+ if (error < 0)
+ goto out_free_server;
+
+ if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+ server->namelen = NFS4_MAXNAMLEN;
+
+ dprintk("Cloned FSID: %llx:%llx\n",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ error = nfs_start_lockd(server);
+ if (error < 0)
+ goto out_free_server;
+
+ nfs_server_insert_lists(server);
+ server->mount_time = jiffies;
+
+ nfs_free_fattr(fattr_fsinfo);
+ dprintk("<-- nfs_clone_server() = %p\n", server);
+ return server;
+
+out_free_server:
+ nfs_free_fattr(fattr_fsinfo);
+ nfs_free_server(server);
+ dprintk("<-- nfs_clone_server() = error %d\n", error);
+ return ERR_PTR(error);
+}
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *proc_fs_nfs;
+
+static int nfs_server_list_open(struct inode *inode, struct file *file);
+static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_server_list_stop(struct seq_file *p, void *v);
+static int nfs_server_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_server_list_ops = {
+ .start = nfs_server_list_start,
+ .next = nfs_server_list_next,
+ .stop = nfs_server_list_stop,
+ .show = nfs_server_list_show,
+};
+
+static const struct file_operations nfs_server_list_fops = {
+ .open = nfs_server_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .owner = THIS_MODULE,
+};
+
+static int nfs_volume_list_open(struct inode *inode, struct file *file);
+static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
+static void nfs_volume_list_stop(struct seq_file *p, void *v);
+static int nfs_volume_list_show(struct seq_file *m, void *v);
+
+static const struct seq_operations nfs_volume_list_ops = {
+ .start = nfs_volume_list_start,
+ .next = nfs_volume_list_next,
+ .stop = nfs_volume_list_stop,
+ .show = nfs_volume_list_show,
+};
+
+static const struct file_operations nfs_volume_list_fops = {
+ .open = nfs_volume_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which
+ * we're dealing
+ */
+static int nfs_server_list_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &nfs_server_list_ops);
+ if (ret < 0)
+ return ret;
+
+ m = file->private_data;
+ m->private = PDE(inode)->data;
+
+ return 0;
+}
+
+/*
+ * set up the iterator to start reading from the server list and return the first item
+ */
+static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+{
+ /* lock the list against modification */
+ spin_lock(&nfs_client_lock);
+ return seq_list_start_head(&nfs_client_list, *_pos);
+}
+
+/*
+ * move to next server
+ */
+static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &nfs_client_list, pos);
+}
+
+/*
+ * clean up after reading from the server list
+ */
+static void nfs_server_list_stop(struct seq_file *p, void *v)
+{
+ spin_unlock(&nfs_client_lock);
+}
+
+/*
+ * display a header line followed by one line per initialised client
+ */
+static int nfs_server_list_show(struct seq_file *m, void *v)
+{
+ struct nfs_client *clp;
+
+ /* display header on line 1 */
+ if (v == &nfs_client_list) { /* the iterator starts at the list head itself */
+ seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
+ return 0;
+ }
+
+ /* display one client per line on subsequent lines */
+ clp = list_entry(v, struct nfs_client, cl_share_link);
+
+ /* Check if the client is initialized */
+ if (clp->cl_cons_state != NFS_CS_READY)
+ return 0; /* emit nothing for clients that are not ready */
+
+ seq_printf(m, "v%u %s %s %3d %s\n",
+ clp->rpc_ops->version,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+ atomic_read(&clp->cl_count),
+ clp->cl_hostname);
+
+ return 0;
+}
+
+/*
+ * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes
+ */
+static int nfs_volume_list_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &nfs_volume_list_ops);
+ if (ret < 0)
+ return ret;
+
+ m = file->private_data;
+ m->private = PDE(inode)->data;
+
+ return 0;
+}
+
+/*
+ * set up the iterator to start reading from the volume list and return the first item
+ */
+static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+{
+ /* lock the list against modification */
+ spin_lock(&nfs_client_lock);
+ return seq_list_start_head(&nfs_volume_list, *_pos);
+}
+
+/*
+ * move to next volume
+ */
+static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &nfs_volume_list, pos);
+}
+
+/*
+ * clean up after reading from the volume list
+ */
+static void nfs_volume_list_stop(struct seq_file *p, void *v)
+{
+ spin_unlock(&nfs_client_lock);
+}
+
+/*
+ * display a header line followed by one line per volume
+ */
+static int nfs_volume_list_show(struct seq_file *m, void *v)
+{
+ struct nfs_server *server;
+ struct nfs_client *clp;
+ char dev[22], fsid[34]; /* "%u:%u" is at most 10+1+10 chars, "%llx:%llx" at most 16+1+16, plus NUL */
+
+ /* display header on line 1 */
+ if (v == &nfs_volume_list) { /* the iterator starts at the list head itself */
+ seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
+ return 0;
+ }
+ /* display one volume per line on subsequent lines */
+ server = list_entry(v, struct nfs_server, master_link);
+ clp = server->nfs_client;
+
+ snprintf(dev, sizeof(dev), "%u:%u",
+ MAJOR(server->s_dev), MINOR(server->s_dev));
+
+ snprintf(fsid, sizeof(fsid), "%llx:%llx",
+ (unsigned long long) server->fsid.major,
+ (unsigned long long) server->fsid.minor);
+
+ seq_printf(m, "v%u %s %s %-7s %-17s %s\n", /* widths are minimums: longer values print in full */
+ clp->rpc_ops->version,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
+ dev,
+ fsid,
+ nfs_server_fscache_state(server));
+
+ return 0;
+}
+
+/*
+ * initialise the /proc/fs/nfsfs/ directory
+ */
+int __init nfs_fs_proc_init(void)
+{
+ struct proc_dir_entry *p;
+
+ proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
+ if (!proc_fs_nfs)
+ goto error_0;
+
+ /* a file of servers with which we're dealing */
+ p = proc_create("servers", S_IFREG|S_IRUGO,
+ proc_fs_nfs, &nfs_server_list_fops);
+ if (!p)
+ goto error_1;
+
+ /* a file of volumes that we have mounted */
+ p = proc_create("volumes", S_IFREG|S_IRUGO,
+ proc_fs_nfs, &nfs_volume_list_fops);
+ if (!p)
+ goto error_2;
+ return 0;
+
+error_2:
+ remove_proc_entry("servers", proc_fs_nfs);
+error_1:
+ remove_proc_entry("fs/nfsfs", NULL);
+error_0:
+ return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/nfsfs/ directory
+ */
+void nfs_fs_proc_exit(void)
+{
+ remove_proc_entry("volumes", proc_fs_nfs);
+ remove_proc_entry("servers", proc_fs_nfs);
+ remove_proc_entry("fs/nfsfs", NULL);
+}
+
+#endif /* CONFIG_PROC_FS */
+
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+ "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
new file mode 100644
index 00000000000..ac889af8ccf
--- /dev/null
+++ b/fs/nfs/delegation.c
@@ -0,0 +1,716 @@
+/*
+ * linux/fs/nfs/delegation.c
+ *
+ * Copyright (C) 2004 Trond Myklebust
+ *
+ * NFS file delegation management
+ *
+ */
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+
+static void nfs_free_delegation(struct nfs_delegation *delegation)
+{
+ if (delegation->cred) {
+ put_rpccred(delegation->cred);
+ delegation->cred = NULL;
+ }
+ kfree_rcu(delegation, rcu);
+}
+
+/**
+ * nfs_mark_delegation_referenced - set delegation's REFERENCED flag
+ * @delegation: delegation to process
+ *
+ */
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
+}
+
+/**
+ * nfs_have_delegation - check if inode has a delegation
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs_have_delegation(struct inode *inode, fmode_t flags)
+{
+ struct nfs_delegation *delegation;
+ int ret = 0;
+
+ flags &= FMODE_READ|FMODE_WRITE;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL && (delegation->type & flags) == flags) {
+ nfs_mark_delegation_referenced(delegation);
+ ret = 1;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) /* recalls every POSIX/flock lock on the inode that belongs to ctx; returns 0 or the first negative status */
+{
+ struct inode *inode = state->inode;
+ struct file_lock *fl;
+ int status = 0;
+
+ if (inode->i_flock == NULL) /* unlocked peek: nothing to claim */
+ goto out;
+
+ /* Protect inode->i_flock using the file locks lock */
+ lock_flocks();
+ for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+ if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
+ continue;
+ if (nfs_file_open_context(fl->fl_file) != ctx) /* only locks taken through this open context */
+ continue;
+ unlock_flocks(); /* lock is dropped around the recall call */
+ status = nfs4_lock_delegation_recall(state, fl); /* NOTE(review): fl is used, and fl->fl_next followed afterwards, without the flocks lock held - confirm callers exclude concurrent lock changes */
+ if (status < 0)
+ goto out; /* lock already released on this path */
+ lock_flocks();
+ }
+ unlock_flocks();
+out:
+ return status;
+}
+
+static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ int err;
+
+again:
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ if (state == NULL)
+ continue;
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+ continue;
+ if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0)
+ continue;
+ get_nfs_open_context(ctx);
+ spin_unlock(&inode->i_lock);
+ err = nfs4_open_delegation_recall(ctx, state, stateid);
+ if (err >= 0)
+ err = nfs_delegation_claim_locks(ctx, state);
+ put_nfs_open_context(ctx);
+ if (err != 0)
+ return err;
+ goto again;
+ }
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+/**
+ * nfs_inode_reclaim_delegation - process a delegation reclaim request
+ * @inode: inode to process
+ * @cred: credential to use for request
+ * @res: new delegation state from server
+ *
+ */
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_openres *res)
+{
+ struct nfs_delegation *delegation;
+ struct rpc_cred *oldcred = NULL;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL) {
+ memcpy(delegation->stateid.data, res->delegation.data,
+ sizeof(delegation->stateid.data));
+ delegation->type = res->delegation_type;
+ delegation->maxsize = res->maxsize;
+ oldcred = delegation->cred;
+ delegation->cred = get_rpccred(cred);
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM,
+ &delegation->flags);
+ NFS_I(inode)->delegation_state = delegation->type;
+ spin_unlock(&delegation->lock);
+ put_rpccred(oldcred);
+ rcu_read_unlock();
+ } else {
+ /* We appear to have raced with a delegation return. */
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, res);
+ }
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+ int res = 0;
+
+ res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+ nfs_free_delegation(delegation);
+ return res;
+}
+
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL)
+ inode = igrab(delegation->inode);
+ spin_unlock(&delegation->lock);
+ return inode;
+}
+
+static struct nfs_delegation *
+nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+ struct nfs_server *server)
+{
+ struct nfs_delegation *delegation =
+ rcu_dereference_protected(nfsi->delegation,
+ lockdep_is_held(&server->nfs_client->cl_lock));
+
+ if (delegation == NULL)
+ goto nomatch;
+
+ spin_lock(&delegation->lock);
+ list_del_rcu(&delegation->super_list);
+ delegation->inode = NULL;
+ nfsi->delegation_state = 0;
+ rcu_assign_pointer(nfsi->delegation, NULL);
+ spin_unlock(&delegation->lock);
+ return delegation;
+nomatch:
+ return NULL;
+}
+
+static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
+ struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_delegation *delegation;
+
+ spin_lock(&clp->cl_lock);
+ delegation = nfs_detach_delegation_locked(nfsi, server);
+ spin_unlock(&clp->cl_lock);
+ return delegation;
+}
+
+/**
+ * nfs_inode_set_delegation - set up a delegation on an inode
+ * @inode: inode to which delegation applies
+ * @cred: cred to use for subsequent delegation processing
+ * @res: new delegation state from server
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation, *old_delegation;
+ struct nfs_delegation *freeme = NULL;
+ int status = 0;
+
+ delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
+ if (delegation == NULL)
+ return -ENOMEM;
+ memcpy(delegation->stateid.data, res->delegation.data,
+ sizeof(delegation->stateid.data));
+ delegation->type = res->delegation_type;
+ delegation->maxsize = res->maxsize;
+ delegation->change_attr = inode->i_version;
+ delegation->cred = get_rpccred(cred);
+ delegation->inode = inode;
+ delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ spin_lock_init(&delegation->lock);
+
+ spin_lock(&clp->cl_lock);
+ old_delegation = rcu_dereference_protected(nfsi->delegation,
+ lockdep_is_held(&clp->cl_lock));
+ if (old_delegation != NULL) {
+ if (memcmp(&delegation->stateid, &old_delegation->stateid,
+ sizeof(old_delegation->stateid)) == 0 &&
+ delegation->type == old_delegation->type) {
+ goto out;
+ }
+ /*
+ * Deal with broken servers that hand out two
+ * delegations for the same file.
+ */
+ dfprintk(FILE, "%s: server %s handed out "
+ "a duplicate delegation!\n",
+ __func__, clp->cl_hostname);
+ if (delegation->type <= old_delegation->type) {
+ freeme = delegation;
+ delegation = NULL;
+ goto out;
+ }
+ freeme = nfs_detach_delegation_locked(nfsi, server);
+ }
+ list_add_rcu(&delegation->super_list, &server->delegations);
+ nfsi->delegation_state = delegation->type;
+ rcu_assign_pointer(nfsi->delegation, delegation);
+ delegation = NULL;
+
+ /* Ensure we revalidate the attributes and page cache! */
+ spin_lock(&inode->i_lock);
+ nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+ spin_unlock(&inode->i_lock);
+
+out:
+ spin_unlock(&clp->cl_lock);
+ if (delegation != NULL)
+ nfs_free_delegation(delegation);
+ if (freeme != NULL)
+ nfs_do_return_delegation(inode, freeme, 0);
+ return status;
+}
+
+/*
+ * Basic procedure for returning a delegation to the server
+ */
+static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int err;
+
+ /*
+ * Guard against new delegated open/lock/unlock calls and against
+ * state recovery
+ */
+ down_write(&nfsi->rwsem);
+ err = nfs_delegation_claim_opens(inode, &delegation->stateid);
+ up_write(&nfsi->rwsem);
+ if (err)
+ goto out;
+
+ err = nfs_do_return_delegation(inode, delegation, issync);
+out:
+ return err;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ struct nfs_delegation *delegation;
+ struct nfs_server *server;
+ struct inode *inode;
+ int err = 0;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(delegation, &server->delegations,
+ super_list) {
+ if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
+ &delegation->flags))
+ continue; /* not marked for return */
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ continue; /* no inode reference obtained; the RETURN bit stays cleared */
+ delegation = nfs_detach_delegation(NFS_I(inode),
+ server);
+ rcu_read_unlock(); /* dropped before the flush and return calls below */
+
+ if (delegation != NULL) {
+ filemap_flush(inode->i_mapping);
+ err = __nfs_inode_return_delegation(inode,
+ delegation, 0);
+ }
+ iput(inode);
+ if (!err)
+ goto restart; /* the RCU section was left, so the walk starts over */
+ set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); /* re-flag the client so the remaining returns are retried later */
+ return err;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens
+ * @inode: inode to process
+ *
+ * Does not protect against delegation reclaims, therefore really only safe
+ * to be called from nfs4_clear_inode().
+ */
+void nfs_inode_return_delegation_noreclaim(struct inode *inode)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+
+ if (rcu_access_pointer(nfsi->delegation) != NULL) {
+ delegation = nfs_detach_delegation(nfsi, server);
+ if (delegation != NULL)
+ nfs_do_return_delegation(inode, delegation, 0);
+ }
+}
+
+/**
+ * nfs_inode_return_delegation - synchronously return a delegation
+ * @inode: inode to process
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_inode_return_delegation(struct inode *inode)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ int err = 0;
+
+ if (rcu_access_pointer(nfsi->delegation) != NULL) {
+ delegation = nfs_detach_delegation(nfsi, server);
+ if (delegation != NULL) {
+ nfs_wb_all(inode);
+ err = __nfs_inode_return_delegation(inode, delegation, 1);
+ }
+ }
+ return err;
+}
+
+static void nfs_mark_return_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
+/**
+ * nfs_super_return_all_delegations - return delegations for one superblock
+ * @sb: sb to process
+ *
+ * Sets NFS_DELEGATION_RETURN on every delegation attached to the
+ * superblock's nfs_server, then calls nfs_client_return_marked_delegations().
+ * If that call returns non-zero, the state manager is scheduled.
+ * Does nothing when the server has no nfs_client.
+ */
+void nfs_super_return_all_delegations(struct super_block *sb)
+{
+ struct nfs_server *server = NFS_SB(sb);
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs_delegation *delegation;
+
+ if (clp == NULL)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ spin_lock(&delegation->lock);
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ spin_unlock(&delegation->lock);
+ }
+ rcu_read_unlock();
+
+ if (nfs_client_return_marked_delegations(clp) != 0)
+ nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Mark for return every delegation on @server whose type overlaps @flags.
+ * A read/write delegation is left alone unless @flags includes FMODE_WRITE.
+ * The RCU list is walked without taking rcu_read_lock() here; the caller
+ * in this file holds it.
+ */
+static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
+					fmode_t flags)
+{
+	struct nfs_delegation *deleg;
+
+	list_for_each_entry_rcu(deleg, &server->delegations, super_list) {
+		bool rw_deleg = deleg->type == (FMODE_READ|FMODE_WRITE);
+
+		if (rw_deleg && !(flags & FMODE_WRITE))
+			continue;
+		if (!(deleg->type & flags))
+			continue;
+		nfs_mark_return_delegation(server, deleg);
+	}
+}
+
+/*
+ * Run nfs_mark_return_all_delegation_types() with @flags on every
+ * nfs_server found on @clp's cl_superblocks list, under rcu_read_lock().
+ */
+static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
+ fmode_t flags)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_mark_return_all_delegation_types(server, flags);
+ rcu_read_unlock();
+}
+
+/* Mark both read and write delegations of @clp for return. */
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+ nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
+}
+
+/*
+ * Schedule the state manager for @clp, but only if NFS4CLNT_DELEGRETURN
+ * is set in its cl_state.
+ */
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+ if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+ nfs4_schedule_state_manager(clp);
+}
+
+/*
+ * Detach the inode's delegation and, if one was attached, call
+ * nfs_inode_find_state_and_recover() with its stateid before freeing it.
+ */
+void nfs_remove_bad_delegation(struct inode *inode)
+{
+	struct nfs_delegation *bad;
+
+	bad = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode));
+	if (bad == NULL)
+		return;
+
+	nfs_inode_find_state_and_recover(inode, &bad->stateid);
+	nfs_free_delegation(bad);
+}
+
+/**
+ * nfs_expire_all_delegation_types
+ * @clp: client to process
+ * @flags: delegation types to expire
+ *
+ * Marks the matching delegations of @clp for return, then schedules the
+ * state manager if that set NFS4CLNT_DELEGRETURN.
+ */
+void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
+{
+ nfs_client_mark_return_all_delegation_types(clp, flags);
+ nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ * Same as nfs_expire_all_delegation_types() with both FMODE_READ and
+ * FMODE_WRITE selected.
+ */
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+ nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
+}
+
+/**
+ * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
+ * @clp: client to process
+ *
+ * Only marks the delegations for return; a NULL @clp is ignored.
+ */
+void nfs_handle_cb_pathdown(struct nfs_client *clp)
+{
+	if (clp != NULL)
+		nfs_client_mark_return_all_delegations(clp);
+}
+
+/*
+ * Clear NFS_DELEGATION_REFERENCED on every delegation of @server; any
+ * delegation that did not have the bit set is marked for return.
+ * The caller in this file holds rcu_read_lock() around the list walk.
+ */
+static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
+{
+	struct nfs_delegation *deleg;
+
+	list_for_each_entry_rcu(deleg, &server->delegations, super_list) {
+		if (!test_and_clear_bit(NFS_DELEGATION_REFERENCED, &deleg->flags))
+			nfs_mark_return_delegation(server, deleg);
+	}
+}
+
+/**
+ * nfs_expire_unreferenced_delegations - Eliminate unused delegations
+ * @clp: nfs_client to process
+ *
+ * Runs nfs_mark_return_unreferenced_delegations() on every server of
+ * @clp under rcu_read_lock(), then schedules the state manager if any
+ * delegation was marked.
+ */
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_mark_return_unreferenced_delegations(server);
+ rcu_read_unlock();
+
+ nfs_delegation_run_state_manager(clp);
+}
+
+/**
+ * nfs_async_inode_return_delegation - asynchronously return a delegation
+ * @inode: inode to process
+ * @stateid: state ID information from CB_RECALL arguments
+ *
+ * Returns zero once the delegation has been marked for return, or
+ * -ENOENT when the client's validate_stateid() op rejects @stateid for
+ * the inode's current delegation.
+ */
+int nfs_async_inode_return_delegation(struct inode *inode,
+				      const nfs4_stateid *stateid)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs_delegation *deleg;
+	int ret = -ENOENT;
+
+	rcu_read_lock();
+	deleg = rcu_dereference(NFS_I(inode)->delegation);
+	if (clp->cl_mvops->validate_stateid(deleg, stateid)) {
+		nfs_mark_return_delegation(server, deleg);
+		ret = 0;
+	}
+	rcu_read_unlock();
+
+	/* The state manager is only poked after a successful mark. */
+	if (ret == 0)
+		nfs_delegation_run_state_manager(clp);
+	return ret;
+}
+
+/*
+ * Look through @server's delegations for one whose inode has a filehandle
+ * equal to @fhandle.  Returns the result of igrab() on the first such
+ * inode for which igrab() succeeds, or NULL if there is none.  The caller
+ * in this file holds rcu_read_lock().
+ */
+static struct inode *
+nfs_delegation_find_inode_server(struct nfs_server *server,
+				 const struct nfs_fh *fhandle)
+{
+	struct nfs_delegation *deleg;
+
+	list_for_each_entry_rcu(deleg, &server->delegations, super_list) {
+		struct inode *found = NULL;
+
+		/* deleg->inode is only inspected with deleg->lock held. */
+		spin_lock(&deleg->lock);
+		if (deleg->inode != NULL &&
+		    nfs_compare_fh(fhandle, &NFS_I(deleg->inode)->fh) == 0)
+			found = igrab(deleg->inode);
+		spin_unlock(&deleg->lock);
+
+		if (found != NULL)
+			return found;
+	}
+	return NULL;
+}
+
+/**
+ * nfs_delegation_find_inode - retrieve the inode associated with a delegation
+ * @clp: client state handle
+ * @fhandle: filehandle from a delegation recall
+ *
+ * Searches each server on @clp's cl_superblocks list in turn and stops at
+ * the first match.
+ *
+ * Returns pointer to inode matching "fhandle," or NULL if a matching inode
+ * cannot be found.
+ */
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp,
+ const struct nfs_fh *fhandle)
+{
+ struct nfs_server *server;
+ struct inode *res = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ res = nfs_delegation_find_inode_server(server, fhandle);
+ if (res != NULL)
+ break;
+ }
+ rcu_read_unlock();
+ return res;
+}
+
+/*
+ * Set NFS_DELEGATION_NEED_RECLAIM on every delegation of @server.
+ * The caller in this file holds rcu_read_lock() around the list walk.
+ */
+static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
+{
+ struct nfs_delegation *delegation;
+
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+ set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+}
+
+/**
+ * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed
+ * @clp: nfs_client to process
+ *
+ * Walks every server on @clp's cl_superblocks list under rcu_read_lock().
+ */
+void nfs_delegation_mark_reclaim(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ nfs_delegation_mark_reclaim_server(server);
+ rcu_read_unlock();
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ * Detaches and frees each delegation that still has
+ * NFS_DELEGATION_NEED_RECLAIM set.  The RCU read lock is dropped before a
+ * delegation is freed, so the whole walk restarts from the top after every
+ * delegation that was handled.
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+ struct nfs_delegation *delegation;
+ struct nfs_server *server;
+ struct inode *inode;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry_rcu(delegation, &server->delegations,
+ super_list) {
+ if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ continue;
+ /* The detach result may be NULL; only non-NULL is freed below. */
+ delegation = nfs_detach_delegation(NFS_I(inode),
+ server);
+ rcu_read_unlock();
+
+ if (delegation != NULL)
+ nfs_free_delegation(delegation);
+ iput(inode);
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * nfs_delegations_present - check for existence of delegations
+ * @clp: client state handle
+ *
+ * Returns one if there are any nfs_delegation structures attached
+ * to this nfs_client, i.e. if any server on its cl_superblocks list has a
+ * non-empty delegations list; otherwise returns zero.
+ */
+int nfs_delegations_present(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ if (!list_empty(&server->delegations)) {
+ ret = 1;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * nfs4_copy_delegation_stateid - Copy inode's state ID information
+ * @dst: stateid data structure to fill in
+ * @inode: inode to check
+ *
+ * Returns one and fills in "dst->data" if inode had a delegation,
+ * otherwise zero is returned.
+ */
+int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
+{
+	struct nfs_delegation *deleg;
+	int found;
+
+	/* The stateid bytes are copied out while still inside the RCU section. */
+	rcu_read_lock();
+	deleg = rcu_dereference(NFS_I(inode)->delegation);
+	found = (deleg != NULL);
+	if (found)
+		memcpy(dst->data, deleg->stateid.data, sizeof(dst->data));
+	rcu_read_unlock();
+
+	return found;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
new file mode 100644
index 00000000000..691a7960918
--- /dev/null
+++ b/fs/nfs/delegation.h
@@ -0,0 +1,80 @@
+/*
+ * linux/fs/nfs/delegation.h
+ *
+ * Copyright (c) Trond Myklebust
+ *
+ * Definitions pertaining to NFS delegated files
+ */
+#ifndef FS_NFS_DELEGATION_H
+#define FS_NFS_DELEGATION_H
+
+#if defined(CONFIG_NFS_V4)
+/*
+ * NFSv4 delegation
+ */
+struct nfs_delegation {
+ struct list_head super_list; /* entry on nfs_server->delegations, walked under RCU */
+ struct rpc_cred *cred;
+ struct inode *inode; /* inode holding the delegation; checked for NULL under ->lock */
+ nfs4_stateid stateid; /* delegation state ID */
+ fmode_t type; /* FMODE_READ and/or FMODE_WRITE */
+ loff_t maxsize;
+ __u64 change_attr;
+ unsigned long flags; /* NFS_DELEGATION_* bit numbers from the enum below */
+ spinlock_t lock;
+ struct rcu_head rcu; /* presumably for RCU-deferred freeing; confirm in delegation.c */
+};
+
+/* Bit numbers used with set_bit()/test_bit() on nfs_delegation.flags. */
+enum {
+ NFS_DELEGATION_NEED_RECLAIM = 0,
+ NFS_DELEGATION_RETURN,
+ NFS_DELEGATION_REFERENCED,
+};
+
+int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
+void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
+int nfs_inode_return_delegation(struct inode *inode);
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_return_delegation_noreclaim(struct inode *inode);
+
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
+void nfs_super_return_all_delegations(struct super_block *sb);
+void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
+void nfs_handle_cb_pathdown(struct nfs_client *clp);
+int nfs_client_return_marked_delegations(struct nfs_client *clp);
+int nfs_delegations_present(struct nfs_client *clp);
+void nfs_remove_bad_delegation(struct inode *inode);
+
+void nfs_delegation_mark_reclaim(struct nfs_client *clp);
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
+
+/* NFSv4 delegation-related procedures */
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
+int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
+int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs_have_delegation(struct inode *inode, fmode_t flags);
+
+#else
+/* Stub used when CONFIG_NFS_V4 is not defined: never reports a delegation. */
+static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
+{
+ return 0;
+}
+
+/* Stub used when CONFIG_NFS_V4 is not defined: nothing to return, report success. */
+static inline int nfs_inode_return_delegation(struct inode *inode)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Returns 1 when the inode has a read delegation and NFS_INO_REVAL_FORCED
+ * is not set in its cache_validity; 0 otherwise.
+ */
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+	if (!nfs_have_delegation(inode, FMODE_READ))
+		return 0;
+	return !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
+#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
new file mode 100644
index 00000000000..fd9a872fada
--- /dev/null
+++ b/fs/nfs/dir.c
@@ -0,0 +1,2349 @@
+/*
+ * linux/fs/nfs/dir.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs directory handling functions
+ *
+ * 10 Apr 1996 Added silly rename for unlink --okir
+ * 28 Sep 1996 Improved directory cache --okir
+ * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de
+ * Re-implemented silly rename for unlink, newly implemented
+ * silly rename for nfs_rename() following the suggestions
+ * of Olaf Kirch (okir) found in this file.
+ * Following Linus comments on my original hack, this version
+ * depends only on the dcache stuff and doesn't touch the inode
+ * layer (iput() and friends).
+ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM
+ */
+
+#include <linux/time.h>
+#include <linux/errno.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/kmemleak.h>
+#include <linux/xattr.h>
+
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+
+/* #define NFS_DEBUG_VERBOSE 1 */
+
+static int nfs_opendir(struct inode *, struct file *);
+static int nfs_closedir(struct inode *, struct file *);
+static int nfs_readdir(struct file *, void *, filldir_t);
+static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
+static int nfs_mkdir(struct inode *, struct dentry *, umode_t);
+static int nfs_rmdir(struct inode *, struct dentry *);
+static int nfs_unlink(struct inode *, struct dentry *);
+static int nfs_symlink(struct inode *, struct dentry *, const char *);
+static int nfs_link(struct dentry *, struct inode *, struct dentry *);
+static int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+static int nfs_rename(struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
+static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
+static loff_t nfs_llseek_dir(struct file *, loff_t, int);
+static void nfs_readdir_clear_array(struct page*);
+
+/* File operations for NFS directories, wired to the handlers declared above. */
+const struct file_operations nfs_dir_operations = {
+ .llseek = nfs_llseek_dir,
+ .read = generic_read_dir,
+ .readdir = nfs_readdir,
+ .open = nfs_opendir,
+ .release = nfs_closedir,
+ .fsync = nfs_fsync_dir,
+};
+
+/* Base directory inode operations; this table has no xattr handlers. */
+const struct inode_operations nfs_dir_inode_operations = {
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+/* ->freepage releases the name strings stored in a readdir cache page. */
+const struct address_space_operations nfs_dir_aops = {
+ .freepage = nfs_readdir_clear_array,
+};
+
+#ifdef CONFIG_NFS_V3
+/* Same as nfs_dir_inode_operations plus the nfs3_* xattr handlers. */
+const struct inode_operations nfs3_dir_inode_operations = {
+ .create = nfs_create,
+ .lookup = nfs_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .listxattr = nfs3_listxattr,
+ .getxattr = nfs3_getxattr,
+ .setxattr = nfs3_setxattr,
+ .removexattr = nfs3_removexattr,
+};
+#endif /* CONFIG_NFS_V3 */
+
+#ifdef CONFIG_NFS_V4
+
+static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int nfs_open_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd);
+/*
+ * NFSv4 table: ->create and ->lookup use nfs_open_create() and
+ * nfs_atomic_lookup(), and the xattr entries use the generic_* handlers.
+ */
+const struct inode_operations nfs4_dir_inode_operations = {
+ .create = nfs_open_create,
+ .lookup = nfs_atomic_lookup,
+ .link = nfs_link,
+ .unlink = nfs_unlink,
+ .symlink = nfs_symlink,
+ .mkdir = nfs_mkdir,
+ .rmdir = nfs_rmdir,
+ .mknod = nfs_mknod,
+ .rename = nfs_rename,
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .getxattr = generic_getxattr,
+ .setxattr = generic_setxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+};
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Allocate and initialise a per-open directory context.  The context takes
+ * its own reference on @cred via get_rpccred() and snapshots the
+ * directory's current attr_gencount.  Returns ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ */
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
+{
+	struct nfs_open_dir_context *dir_ctx;
+
+	dir_ctx = kmalloc(sizeof(*dir_ctx), GFP_KERNEL);
+	if (dir_ctx == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	dir_ctx->duped = 0;
+	dir_ctx->attr_gencount = NFS_I(dir)->attr_gencount;
+	dir_ctx->dir_cookie = 0;
+	dir_ctx->dup_cookie = 0;
+	dir_ctx->cred = get_rpccred(cred);
+	return dir_ctx;
+}
+
+/* Drop the credential reference held by @ctx, then free @ctx itself. */
+static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
+{
+ put_rpccred(ctx->cred);
+ kfree(ctx);
+}
+
+/*
+ * Open a directory: obtain a credential from rpc_lookup_cred(), allocate
+ * the per-open directory context and store it in filp->private_data.
+ * Returns 0 on success or a negative errno.
+ */
+static int
+nfs_opendir(struct inode *inode, struct file *filp)
+{
+ int res = 0;
+ struct nfs_open_dir_context *ctx;
+ struct rpc_cred *cred;
+
+ dfprintk(FILE, "NFS: open dir(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+
+ cred = rpc_lookup_cred();
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
+ ctx = alloc_nfs_open_dir_context(inode, cred);
+ if (IS_ERR(ctx)) {
+ res = PTR_ERR(ctx);
+ goto out;
+ }
+ filp->private_data = ctx;
+ if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
+ /* This is a mountpoint, so d_revalidate will never
+ * have been called, so we need to refresh the
+ * inode (for close-open consistency) ourselves.
+ */
+ __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ }
+out:
+ /* The context took its own reference in alloc_nfs_open_dir_context(). */
+ put_rpccred(cred);
+ return res;
+}
+
+/* ->release handler: free the directory context stored by nfs_opendir(). */
+static int
+nfs_closedir(struct inode *inode, struct file *filp)
+{
+ put_nfs_open_dir_context(filp->private_data);
+ return 0;
+}
+
+/* One cached directory entry, filled in by nfs_readdir_add_to_array(). */
+struct nfs_cache_array_entry {
+ u64 cookie; /* copied from nfs_entry->prev_cookie */
+ u64 ino;
+ struct qstr string; /* name is a kmemdup() copy, freed by nfs_readdir_clear_array() */
+ unsigned char d_type;
+};
+
+/* Header placed at the start of a readdir cache page, followed by the entries. */
+struct nfs_cache_array {
+ unsigned int size; /* number of entries stored in array[] */
+ int eof_index; /* -1 until end of directory is seen, then set to size */
+ u64 last_cookie; /* nfs_entry->cookie of the most recently added entry */
+ struct nfs_cache_array_entry array[0];
+};
+
+typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
+/* Per-call state passed between the readdir helpers in this file. */
+typedef struct {
+ struct file *file;
+ struct page *page; /* cache page currently being searched or emitted */
+ unsigned long page_index; /* index passed to read_cache_page() */
+ u64 *dir_cookie; /* cookie to search for; 0 selects the search by f_pos */
+ u64 last_cookie; /* start cookie for the next ->readdir call */
+ loff_t current_index; /* entry number of the first entry on the current page */
+ decode_dirent_t decode;
+
+ unsigned long timestamp; /* jiffies sampled before the last ->readdir call */
+ unsigned long gencount; /* attribute generation sampled with timestamp */
+ unsigned int cache_entry_index; /* array index where the search matched */
+ unsigned int plus:1; /* passed to ->readdir and to decode; cleared on -ENOTSUPP */
+ unsigned int eof:1;
+} nfs_readdir_descriptor_t;
+
+/*
+ * Map @page with kmap() and return it as a cache array.  Returns
+ * ERR_PTR(-EIO) for a NULL page and ERR_PTR(-ENOMEM) if kmap() yields NULL.
+ * On success the caller must undo the mapping with
+ * nfs_readdir_release_array(page).
+ */
+static
+struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
+{
+	void *mapped;
+
+	if (page == NULL)
+		return ERR_PTR(-EIO);
+
+	mapped = kmap(page);
+	return mapped != NULL ? mapped : ERR_PTR(-ENOMEM);
+}
+
+/* Undo the kmap() done by nfs_readdir_get_array(). */
+static
+void nfs_readdir_release_array(struct page *page)
+{
+ kunmap(page);
+}
+
+/*
+ * Free the name strings that nfs_readdir_add_to_array() duplicated (through
+ * nfs_readdir_make_qstr()) for every entry stored in @page.  Also installed
+ * as the ->freepage callback in nfs_dir_aops.
+ */
+static
+void nfs_readdir_clear_array(struct page *page)
+{
+ struct nfs_cache_array *array;
+ int i;
+
+ array = kmap_atomic(page, KM_USER0);
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].string.name);
+ kunmap_atomic(array, KM_USER0);
+}
+
+/*
+ * Fill @string with a kmemdup() copy of @name, its length and its hash.
+ * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
+ *
+ * the caller is responsible for freeing qstr.name
+ * when called by nfs_readdir_add_to_array, the strings will be freed in
+ * nfs_readdir_clear_array()
+ */
+static
+int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+{
+ string->len = len;
+ string->name = kmemdup(name, len, GFP_KERNEL);
+ if (string->name == NULL)
+ return -ENOMEM;
+ /*
+ * Avoid a kmemleak false positive. The pointer to the name is stored
+ * in a page cache page which kmemleak does not scan.
+ */
+ kmemleak_not_leak(string->name);
+ string->hash = full_name_hash(name, len);
+ return 0;
+}
+
+/*
+ * Append @entry to the cache array held in @page.
+ *
+ * Returns 0 on success, -ENOSPC when the next slot would extend past the
+ * end of the page, or the error from mapping the page or copying the name.
+ * The array size and last_cookie are only updated once the name copy has
+ * succeeded; eof_index is set when @entry carries the eof flag.
+ */
+static
+int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+{
+ struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array_entry *cache_entry;
+ int ret;
+
+ if (IS_ERR(array))
+ return PTR_ERR(array);
+
+ cache_entry = &array->array[array->size];
+
+ /* Check that this entry lies within the page bounds */
+ ret = -ENOSPC;
+ if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+ goto out;
+
+ cache_entry->cookie = entry->prev_cookie;
+ cache_entry->ino = entry->ino;
+ cache_entry->d_type = entry->d_type;
+ ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
+ if (ret)
+ goto out;
+ array->last_cookie = entry->cookie;
+ array->size++;
+ if (entry->eof != 0)
+ array->eof_index = array->size;
+out:
+ nfs_readdir_release_array(page);
+ return ret;
+}
+
+/*
+ * Locate the entry for file->f_pos by index, relative to
+ * desc->current_index (the entry number of this page's first entry).
+ *
+ * Returns 0 and sets *desc->dir_cookie and desc->cache_entry_index when
+ * the position falls inside @array; -EAGAIN when it lies beyond this page
+ * and the page is not marked EOF; otherwise sets desc->eof and returns
+ * -EBADCOOKIE.
+ */
+static
+int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+ loff_t diff = desc->file->f_pos - desc->current_index;
+ unsigned int index;
+
+ if (diff < 0)
+ goto out_eof;
+ if (diff >= array->size) {
+ if (array->eof_index >= 0)
+ goto out_eof;
+ return -EAGAIN;
+ }
+
+ index = (unsigned int)diff;
+ *desc->dir_cookie = array->array[index].cookie;
+ desc->cache_entry_index = index;
+ return 0;
+out_eof:
+ desc->eof = 1;
+ return -EBADCOOKIE;
+}
+
+/*
+ * Scan @array for the entry whose cookie equals *desc->dir_cookie.
+ *
+ * On a match, file->f_pos becomes the entry's absolute index
+ * (desc->current_index + i), desc->cache_entry_index becomes i and 0 is
+ * returned.
+ *
+ * Loop detection: if the directory's attr_gencount changed or its cached
+ * attributes/data are flagged invalid, the duplicate tracking in the open
+ * context is reset.  Otherwise, a match that would move f_pos backwards is
+ * remembered in ctx->dup_cookie; if the same cookie moves f_pos backwards
+ * again after nfs_do_filldir() has emitted entries (ctx->duped > 0), a
+ * rate-limited notice is logged and -ELOOP is returned.
+ *
+ * Without a match: -EAGAIN if the page is not marked EOF, else
+ * -EBADCOOKIE (with desc->eof set when the cookie equals last_cookie).
+ */
+static
+int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+{
+	int i;
+	loff_t new_pos;
+	int status = -EAGAIN;
+
+	for (i = 0; i < array->size; i++) {
+		if (array->array[i].cookie == *desc->dir_cookie) {
+			struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+			struct nfs_open_dir_context *ctx = desc->file->private_data;
+
+			new_pos = desc->current_index + i;
+			if (ctx->attr_gencount != nfsi->attr_gencount
+			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+				ctx->duped = 0;
+				ctx->attr_gencount = nfsi->attr_gencount;
+			} else if (new_pos < desc->file->f_pos) {
+				if (ctx->duped > 0
+				    && ctx->dup_cookie == *desc->dir_cookie) {
+					if (printk_ratelimit()) {
+						/* Trailing spaces keep the concatenated sentences apart. */
+						pr_notice("NFS: directory %s/%s contains a readdir loop. "
+								"Please contact your server vendor. "
+								"The file: %s has duplicate cookie %llu\n",
+								desc->file->f_dentry->d_parent->d_name.name,
+								desc->file->f_dentry->d_name.name,
+								array->array[i].string.name,
+								*desc->dir_cookie);
+					}
+					status = -ELOOP;
+					goto out;
+				}
+				ctx->dup_cookie = *desc->dir_cookie;
+				ctx->duped = -1;
+			}
+			desc->file->f_pos = new_pos;
+			desc->cache_entry_index = i;
+			return 0;
+		}
+	}
+	if (array->eof_index >= 0) {
+		status = -EBADCOOKIE;
+		if (*desc->dir_cookie == array->last_cookie)
+			desc->eof = 1;
+	}
+out:
+	return status;
+}
+
+/*
+ * Search desc->page for the current position: by index when
+ * *desc->dir_cookie is 0, by cookie otherwise.  On -EAGAIN the descriptor
+ * is advanced to the next page (last_cookie, current_index, page_index)
+ * so the caller can simply retry.
+ */
+static
+int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+{
+ struct nfs_cache_array *array;
+ int status;
+
+ array = nfs_readdir_get_array(desc->page);
+ if (IS_ERR(array)) {
+ status = PTR_ERR(array);
+ goto out;
+ }
+
+ if (*desc->dir_cookie == 0)
+ status = nfs_readdir_search_for_pos(array, desc);
+ else
+ status = nfs_readdir_search_for_cookie(array, desc);
+
+ if (status == -EAGAIN) {
+ desc->last_cookie = array->last_cookie;
+ desc->current_index += array->size;
+ desc->page_index++;
+ }
+ nfs_readdir_release_array(desc->page);
+out:
+ return status;
+}
+
+/*
+ * Fill a page with xdr information before transferring to the cache page.
+ *
+ * Calls the protocol's ->readdir starting at entry->cookie.  If the call
+ * fails with -ENOTSUPP while desc->plus is set, READDIRPLUS is disabled
+ * for the server and inode and the call is retried once without it.
+ * On success the jiffies/generation values sampled just before the call
+ * are stored in @desc.  Returns the ->readdir result.
+ */
+static
+int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
+ struct nfs_entry *entry, struct file *file, struct inode *inode)
+{
+ struct nfs_open_dir_context *ctx = file->private_data;
+ struct rpc_cred *cred = ctx->cred;
+ unsigned long timestamp, gencount;
+ int error;
+
+ again:
+ timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
+ error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
+ NFS_SERVER(inode)->dtsize, desc->plus);
+ if (error < 0) {
+ /* We requested READDIRPLUS, but the server doesn't grok it */
+ if (error == -ENOTSUPP && desc->plus) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
+ clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+ desc->plus = 0;
+ goto again;
+ }
+ goto error;
+ }
+ desc->timestamp = timestamp;
+ desc->gencount = gencount;
+error:
+ return error;
+}
+
+/*
+ * Decode one directory entry with desc->decode and stamp its fattr with
+ * the timestamp and generation count recorded in @desc.  Returns the
+ * decoder's error unchanged, or 0 on success.
+ */
+static int xdr_decode(nfs_readdir_descriptor_t *desc,
+ struct nfs_entry *entry, struct xdr_stream *xdr)
+{
+ int error;
+
+ error = desc->decode(xdr, entry, desc->plus);
+ if (error)
+ return error;
+ entry->fattr->time_start = desc->timestamp;
+ entry->fattr->gencount = desc->gencount;
+ return 0;
+}
+
+/*
+ * Returns 1 if @dentry has an inode whose filehandle equals the one in
+ * @entry, and 0 otherwise (including for a negative dentry).
+ */
+static
+int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
+{
+	if (dentry->d_inode == NULL)
+		return 0;
+	return nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) == 0;
+}
+
+/*
+ * Use a decoded readdir entry to populate the dcache under @parent.
+ *
+ * "." and ".." are skipped.  If a dentry with that name already exists
+ * and refers to the same file, its inode attributes are refreshed;
+ * if it refers to a different file it is dropped and a new dentry is
+ * allocated, given an inode from nfs_fhget() and hooked up with
+ * d_materialise_unique().  Failures are silent: the function returns
+ * without reporting them.
+ */
+static
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
+{
+ struct qstr filename = {
+ .len = entry->len,
+ .name = entry->name,
+ };
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+
+ if (filename.name[0] == '.') {
+ if (filename.len == 1)
+ return;
+ if (filename.len == 2 && filename.name[1] == '.')
+ return;
+ }
+ filename.hash = full_name_hash(filename.name, filename.len);
+
+ dentry = d_lookup(parent, &filename);
+ if (dentry != NULL) {
+ if (nfs_same_file(dentry, entry)) {
+ nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ goto out;
+ } else {
+ d_drop(dentry);
+ dput(dentry);
+ }
+ }
+
+ dentry = d_alloc(parent, &filename);
+ if (dentry == NULL)
+ return;
+
+ inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+ if (IS_ERR(inode))
+ goto out;
+
+ alias = d_materialise_unique(dentry, inode);
+ if (IS_ERR(alias))
+ goto out;
+ else if (alias) {
+ nfs_set_verifier(alias, nfs_save_change_attribute(dir));
+ dput(alias);
+ } else
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+out:
+ dput(dentry);
+}
+
+/*
+ * Perform conversion from xdr to cache array.
+ *
+ * Decodes entries from @xdr_pages (@buflen bytes) one at a time and appends
+ * each to the cache array in @page, priming the dcache as well when
+ * desc->plus is set.  A decoder result of -EAGAIN ends the loop and is
+ * reported as 0.  If nothing was decoded, or the status is -EBADCOOKIE
+ * with eof set, the array is marked EOF.  A scratch page is allocated for
+ * the xdr stream and released before returning.
+ */
+static
+int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+ struct page **xdr_pages, struct page *page, unsigned int buflen)
+{
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ struct nfs_cache_array *array;
+ unsigned int count = 0;
+ int status;
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (scratch == NULL)
+ return -ENOMEM;
+
+ xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ do {
+ status = xdr_decode(desc, entry, &stream);
+ if (status != 0) {
+ if (status == -EAGAIN)
+ status = 0;
+ break;
+ }
+
+ count++;
+
+ if (desc->plus != 0)
+ nfs_prime_dcache(desc->file->f_path.dentry, entry);
+
+ status = nfs_readdir_add_to_array(entry, page);
+ if (status != 0)
+ break;
+ } while (!entry->eof);
+
+ if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
+ array = nfs_readdir_get_array(page);
+ if (!IS_ERR(array)) {
+ array->eof_index = array->size;
+ status = 0;
+ nfs_readdir_release_array(page);
+ } else
+ status = PTR_ERR(array);
+ }
+
+ put_page(scratch);
+ return status;
+}
+
+/* Drop one reference on each of the first @npages pages in @pages. */
+static
+void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+{
+	struct page **end = pages + npages;
+
+	while (pages < end)
+		put_page(*pages++);
+}
+
+/*
+ * Release the pages obtained from nfs_readdir_large_page().
+ * @ptr is not used by this function.
+ */
+static
+void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+ unsigned int npages)
+{
+ nfs_readdir_free_pagearray(pages, npages);
+}
+
+/*
+ * nfs_readdir_large_page will allocate pages that must be freed with a call
+ * to nfs_readdir_free_large_page
+ *
+ * Fills pages[0..npages-1] with freshly allocated pages.  Returns 0 on
+ * success; on allocation failure the pages allocated so far are released
+ * and -ENOMEM is returned.
+ */
+static
+int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+{
+ unsigned int i;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = alloc_page(GFP_KERNEL);
+ if (page == NULL)
+ goto out_freepages;
+ pages[i] = page;
+ }
+ return 0;
+
+out_freepages:
+ nfs_readdir_free_pagearray(pages, i);
+ return -ENOMEM;
+}
+
+/*
+ * Fill the cache array in @page with directory entries read from the
+ * server, starting at desc->last_cookie.
+ *
+ * The array header is zeroed and eof_index set to -1, then ->readdir
+ * replies are fetched and converted until the array is marked EOF, an
+ * error occurs, or the page is full (-ENOSPC from the conversion is
+ * treated as success).  Returns 0 or a negative errno.
+ */
+static
+int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+{
+ struct page *pages[NFS_MAX_READDIR_PAGES];
+ void *pages_ptr = NULL;
+ struct nfs_entry entry;
+ struct file *file = desc->file;
+ struct nfs_cache_array *array;
+ int status = -ENOMEM;
+ unsigned int array_size = ARRAY_SIZE(pages);
+
+ entry.prev_cookie = 0;
+ entry.cookie = desc->last_cookie;
+ entry.eof = 0;
+ entry.fh = nfs_alloc_fhandle();
+ entry.fattr = nfs_alloc_fattr();
+ entry.server = NFS_SERVER(inode);
+ if (entry.fh == NULL || entry.fattr == NULL)
+ goto out;
+
+ array = nfs_readdir_get_array(page);
+ if (IS_ERR(array)) {
+ status = PTR_ERR(array);
+ goto out;
+ }
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+
+ status = nfs_readdir_large_page(pages, array_size);
+ if (status < 0)
+ goto out_release_array;
+ do {
+ unsigned int pglen;
+ status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
+
+ if (status < 0)
+ break;
+ /* A non-negative ->readdir result is passed on as the reply length. */
+ pglen = status;
+ status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+ if (status < 0) {
+ if (status == -ENOSPC)
+ status = 0;
+ break;
+ }
+ } while (array->eof_index < 0);
+
+ nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+out_release_array:
+ nfs_readdir_release_array(page);
+out:
+ nfs_free_fattr(entry.fattr);
+ nfs_free_fhandle(entry.fh);
+ return status;
+}
+
+/*
+ * Page-cache filler for directories.  The xdr reply is converted once into
+ * an array stored in the page, which later lookups search directly; this
+ * needs fewer cache pages because more entries fit on each one.
+ *
+ * On success the page is marked up to date and every later page of the
+ * mapping is invalidated.  The page is unlocked on both the success and
+ * the error path.  Returns 0 or the negative errno from the conversion.
+ */
+static
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
+{
+	struct inode *dir = desc->file->f_path.dentry->d_inode;
+	int err;
+
+	err = nfs_readdir_xdr_to_array(desc, page, dir);
+	if (err >= 0) {
+		SetPageUptodate(page);
+
+		if (invalidate_inode_pages2_range(dir->i_mapping, page->index + 1, -1) < 0) {
+			/* Should never happen */
+			nfs_zap_mapping(dir, dir->i_mapping);
+		}
+		err = 0;
+	}
+	unlock_page(page);
+	return err;
+}
+
+/*
+ * Drop the reference on desc->page and clear the pointer.  A page with no
+ * mapping has its name strings freed here first.
+ */
+static
+void cache_page_release(nfs_readdir_descriptor_t *desc)
+{
+ if (!desc->page->mapping)
+ nfs_readdir_clear_array(desc->page);
+ page_cache_release(desc->page);
+ desc->page = NULL;
+}
+
+/*
+ * Fetch page desc->page_index of the directory's mapping through
+ * read_cache_page(), with nfs_readdir_filler() as the filler and @desc as
+ * its argument.
+ */
+static
+struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+{
+ return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+ desc->page_index, (filler_t *)nfs_readdir_filler, desc);
+}
+
+/*
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
+ *
+ * On success desc->page stays referenced for the caller; on any non-zero
+ * search result the page is released again before returning.
+ */
+static
+int find_cache_page(nfs_readdir_descriptor_t *desc)
+{
+ int res;
+
+ desc->page = get_cache_page(desc);
+ if (IS_ERR(desc->page))
+ return PTR_ERR(desc->page);
+
+ res = nfs_readdir_search_array(desc);
+ if (res != 0)
+ cache_page_release(desc);
+ return res;
+}
+
+/*
+ * Search for desc->dir_cookie from the beginning of the page cache.
+ *
+ * When starting at page 0 the running index and last cookie are reset.
+ * find_cache_page() is retried for as long as it returns -EAGAIN, which
+ * is the result for which nfs_readdir_search_array() advances page_index.
+ */
+static inline
+int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
+{
+ int res;
+
+ if (desc->page_index == 0) {
+ desc->current_index = 0;
+ desc->last_cookie = 0;
+ }
+ do {
+ res = find_cache_page(desc);
+ } while (res == -EAGAIN);
+ return res;
+}
+
+/*
+ * Once we've found the start of the dirent within a page: fill 'er up...
+ *
+ * Feeds entries from desc->cache_entry_index onwards to @filldir,
+ * advancing file->f_pos and *desc->dir_cookie after each accepted entry.
+ * Stops and sets desc->eof when @filldir returns a negative value; also
+ * sets desc->eof when the page is marked EOF.  The cache page is released
+ * before returning.  Returns 0, or the error from mapping the page.
+ */
+static
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
+ filldir_t filldir)
+{
+ struct file *file = desc->file;
+ int i = 0;
+ int res = 0;
+ struct nfs_cache_array *array = NULL;
+ struct nfs_open_dir_context *ctx = file->private_data;
+
+ array = nfs_readdir_get_array(desc->page);
+ if (IS_ERR(array)) {
+ res = PTR_ERR(array);
+ goto out;
+ }
+
+ for (i = desc->cache_entry_index; i < array->size; i++) {
+ struct nfs_cache_array_entry *ent;
+
+ ent = &array->array[i];
+ if (filldir(dirent, ent->string.name, ent->string.len,
+ file->f_pos, nfs_compat_user_ino64(ent->ino),
+ ent->d_type) < 0) {
+ desc->eof = 1;
+ break;
+ }
+ file->f_pos++;
+ /* The next cookie is the following entry's, or last_cookie at the end. */
+ if (i < (array->size-1))
+ *desc->dir_cookie = array->array[i+1].cookie;
+ else
+ *desc->dir_cookie = array->last_cookie;
+ /* A recorded backwards match (-1) becomes 1 once an entry is emitted. */
+ if (ctx->duped != 0)
+ ctx->duped = 1;
+ }
+ if (array->eof_index >= 0)
+ desc->eof = 1;
+
+ nfs_readdir_release_array(desc->page);
+out:
+ cache_page_release(desc);
+ dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
+ (unsigned long long)*desc->dir_cookie, res);
+ return res;
+}
+
+/*
+ * If we cannot find a cookie in our cache, we suspect that this is
+ * because it points to a deleted file, so we ask the server to return
+ * whatever it thinks is the next entry. We then feed this to filldir.
+ * If all goes well, we should then be able to find our way round the
+ * cache on the next call to readdir_search_pagecache();
+ *
+ * NOTE: we cannot add the anonymous page to the pagecache because
+ * the data it contains might not be page aligned. Besides,
+ * we should already have a complete representation of the
+ * directory in the page cache by the time we get here.
+ *
+ * Returns the result of nfs_do_filldir(), -ENOMEM if no page could be
+ * allocated, or the error from reading the entries.  desc->page_index is
+ * reset to 0 and the open context's duplicate tracking is cleared.
+ */
+static inline
+int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
+ filldir_t filldir)
+{
+ struct page *page = NULL;
+ int status;
+ struct inode *inode = desc->file->f_path.dentry->d_inode;
+ struct nfs_open_dir_context *ctx = desc->file->private_data;
+
+ dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
+ (unsigned long long)*desc->dir_cookie);
+
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ desc->page_index = 0;
+ desc->last_cookie = *desc->dir_cookie;
+ desc->page = page;
+ ctx->duped = 0;
+
+ status = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (status < 0)
+ goto out_release;
+
+ /* nfs_do_filldir() releases desc->page itself on every path. */
+ status = nfs_do_filldir(desc, dirent, filldir);
+
+ out:
+ dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
+ __func__, status);
+ return status;
+ out_release:
+ cache_page_release(desc);
+ goto out;
+}
+
+/* The file offset position represents the dirent entry number. A
+ last cookie cache takes care of the common case of reading the
+ whole directory.
+
+ The descriptor is built on the stack and points at the dir_cookie
+ stored in the open context (filp->private_data). Entries are handed
+ to filldir page by page until desc->eof is set or an error occurs.
+
+ Returns 0 on success (positive internal statuses are folded to 0)
+ or a negative errno.
+ */
+static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ nfs_readdir_descriptor_t my_desc,
+ *desc = &my_desc;
+ struct nfs_open_dir_context *dir_ctx = filp->private_data;
+ int res;
+
+ dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ (long long)filp->f_pos);
+ nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
+
+ /*
+ * filp->f_pos points to the dirent entry number.
+ * *desc->dir_cookie has the cookie for the next entry. We have
+ * to either find the entry with the appropriate number or
+ * revalidate the cookie.
+ */
+ memset(desc, 0, sizeof(*desc));
+
+ desc->file = filp;
+ desc->dir_cookie = &dir_ctx->dir_cookie;
+ desc->decode = NFS_PROTO(inode)->decode_dirent;
+ desc->plus = NFS_USE_READDIRPLUS(inode);
+
+ nfs_block_sillyrename(dentry); /* undone by nfs_unblock_sillyrename() at out: */
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
+ if (res < 0)
+ goto out;
+
+ do {
+ res = readdir_search_pagecache(desc);
+
+ if (res == -EBADCOOKIE) {
+ res = 0;
+ /* This means either end of directory */
+ if (*desc->dir_cookie && desc->eof == 0) {
+ /* Or that the server has 'lost' a cookie */
+ res = uncached_readdir(desc, dirent, filldir);
+ if (res == 0)
+ continue; /* re-tests !desc->eof, then searches the cache again */
+ }
+ break;
+ }
+ if (res == -ETOOSMALL && desc->plus) {
+ clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+ nfs_zap_caches(inode);
+ desc->page_index = 0;
+ desc->plus = 0; /* retry from the first page with desc->plus cleared */
+ desc->eof = 0;
+ continue;
+ }
+ if (res < 0)
+ break;
+
+ res = nfs_do_filldir(desc, dirent, filldir);
+ if (res < 0)
+ break;
+ } while (!desc->eof);
+out:
+ nfs_unblock_sillyrename(dentry);
+ if (res > 0)
+ res = 0; /* only 0 or a negative errno is returned */
+ dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ res);
+ return res;
+}
+
+/*
+ * Reposition the directory stream.
+ *
+ * Only origin 0 (absolute offset) and origin 1 (offset relative to the
+ * current f_pos) are accepted; any other origin, or a negative resulting
+ * position, yields -EINVAL. When the position really changes, the open
+ * context's dir_cookie and duped fields are reset to zero. The update is
+ * done under the directory's i_mutex.
+ *
+ * Returns the new position, or -EINVAL.
+ */
+static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
+{
+	struct nfs_open_dir_context *dir_ctx = filp->private_data;
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+
+	dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name,
+			offset, origin);
+
+	mutex_lock(&inode->i_mutex);
+	if (origin == 1)
+		offset += filp->f_pos;
+	if ((origin != 0 && origin != 1) || offset < 0) {
+		offset = -EINVAL;
+	} else if (offset != filp->f_pos) {
+		filp->f_pos = offset;
+		dir_ctx->dir_cookie = 0;
+		dir_ctx->duped = 0;
+	}
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+/*
+ * All directory operations under NFS are synchronous, so there is
+ * nothing to flush here: fsync() on a directory only bumps the
+ * NFSIOS_VFSFSYNC statistic (under i_mutex) and reports success.
+ */
+static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+
+	dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
+			dentry->d_parent->d_name.name, dentry->d_name.name,
+			datasync);
+
+	mutex_lock(&inode->i_mutex);
+	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+	mutex_unlock(&inode->i_mutex);
+
+	return 0;
+}
+
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir: pointer to directory inode
+ *
+ * Increments the directory's cache_change_attribute. This forces the
+ * revalidation code in nfs_lookup_revalidate() to do a full lookup on
+ * all child dentries of 'dir' whenever a change occurs on the server
+ * that might have invalidated our dcache.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	nfsi->cache_change_attribute += 1;
+}
+
+/*
+ * Decide whether the parent directory is unchanged since 'dentry' was
+ * last verified. If it has changed, the dentries are considered
+ * untrustworthy and may need to be looked up again.
+ *
+ * Returns 1 when the dentry can be trusted (always the case for a root
+ * dentry), 0 otherwise.
+ */
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+{
+	int trusted = 0;
+
+	if (IS_ROOT(dentry))
+		return 1;
+
+	/*
+	 * The checks are evaluated strictly left to right: the second
+	 * verifier comparison only happens after
+	 * nfsi->cache_change_attribute has been revalidated.
+	 */
+	if (!(NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) &&
+	    nfs_verify_change_attribute(dir, dentry->d_time) &&
+	    nfs_revalidate_inode(NFS_SERVER(dir), dir) >= 0 &&
+	    nfs_verify_change_attribute(dir, dentry->d_time))
+		trusted = 1;
+
+	return trusted;
+}
+
+/*
+ * Extract the intent bits selected by 'mask' from nd->flags.
+ *
+ * Note that the current set of intents only apply to the very last
+ * component of the path and none of them is set before that last
+ * component.
+ *
+ * 'nd' must not be NULL.
+ */
+static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
+						    unsigned int mask)
+{
+	unsigned int intent = nd->flags & mask;
+
+	return intent;
+}
+
+/*
+ * Use intent information to check whether or not we're going to do
+ * an O_EXCL create using this path component.
+ *
+ * Always 0 for protocol version 2 and when no intent data is supplied;
+ * otherwise 1 if LOOKUP_EXCL is set in nd->flags, else 0.
+ */
+static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
+{
+	if (NFS_PROTO(dir)->version == 2 || nd == NULL)
+		return 0;
+	return nfs_lookup_check_intent(nd, LOOKUP_EXCL) != 0;
+}
+
+/*
+ * Inode and filehandle revalidation for lookups.
+ *
+ * Revalidation is forced (via __nfs_revalidate_inode()) when the VFS
+ * sets LOOKUP_REVAL, or when the intent says we are about to open a
+ * regular file or directory and the "nocto" mount flag is not set.
+ * With intent data but neither condition, nothing is done. Without
+ * intent data the ordinary nfs_revalidate_inode() is used. Automount
+ * inodes are never revalidated here.
+ *
+ * Returns 0, or the status of the revalidation call.
+ */
+static inline
+int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	if (IS_AUTOMOUNT(inode))
+		return 0;
+
+	if (nd == NULL)
+		return nfs_revalidate_inode(server, inode);
+
+	/* VFS wants an on-the-wire revalidation */
+	if (nd->flags & LOOKUP_REVAL)
+		return __nfs_revalidate_inode(server, inode);
+
+	/* This is an open(2) */
+	if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 &&
+	    !(server->flags & NFS_MOUNT_NOCTO) &&
+	    (S_ISREG(inode->i_mode) ||
+	     S_ISDIR(inode->i_mode)))
+		return __nfs_revalidate_inode(server, inode);
+
+	return 0;
+}
+
+/*
+ * Decide whether a negative dentry has to be revalidated.
+ *
+ * A negative dentry is trusted for as long as the parent directory's
+ * verifier still matches (see nfs_check_verifier()), unless the mount
+ * has NFS_MOUNT_LOOKUP_CACHE_NONEG set, in which case it always needs
+ * revalidation. It is never revalidated when the intent is to create
+ * a new file.
+ *
+ * Returns non-zero if revalidation is needed, 0 otherwise.
+ */
+static inline
+int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
+		       struct nameidata *nd)
+{
+	int creating = nd != NULL &&
+		       nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0;
+
+	/* Don't revalidate a negative dentry if we're creating a new file */
+	if (creating)
+		return 0;
+
+	return (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) ||
+	       !nfs_check_verifier(dir, dentry);
+}
+
+/*
+ * This is called every time the dcache has a lookup hit,
+ * and we should check whether we can really trust that
+ * lookup.
+ *
+ * NOTE! The hit can be a negative hit too, don't assume
+ * we have an inode!
+ *
+ * If the parent directory is seen to have changed, we throw out the
+ * cached dentry and do a new lookup.
+ *
+ * @dentry: the cached dentry to validate
+ * @nd: lookup intent data; may be NULL, as every helper called below
+ *	accepts a NULL nd
+ *
+ * Returns 1 if the dentry may be used, 0 if it was judged invalid (it
+ * is then unhashed with d_drop()), -ECHILD when called in RCU-walk
+ * mode, or -ENOMEM if the lookup buffers could not be allocated.
+ */
+static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *dir;
+	struct inode *inode;
+	struct dentry *parent;
+	struct nfs_fh *fhandle = NULL;
+	struct nfs_fattr *fattr = NULL;
+	int error;
+
+	/* Not usable in RCU-walk mode: ask the VFS to retry in ref-walk */
+	if (nd != NULL && (nd->flags & LOOKUP_RCU))
+		return -ECHILD;
+
+	parent = dget_parent(dentry);
+	dir = parent->d_inode;
+	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
+	inode = dentry->d_inode;
+
+	if (!inode) {
+		if (nfs_neg_need_reval(dir, dentry, nd))
+			goto out_bad;
+		goto out_valid;
+	}
+
+	if (is_bad_inode(inode)) {
+		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
+				__func__, dentry->d_parent->d_name.name,
+				dentry->d_name.name);
+		goto out_bad;
+	}
+
+	if (nfs_have_delegation(inode, FMODE_READ))
+		goto out_set_verifier;
+
+	/* Force a full look up iff the parent directory has changed */
+	if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
+		if (nfs_lookup_verify_inode(inode, nd))
+			goto out_zap_parent;
+		goto out_valid;
+	}
+
+	if (NFS_STALE(inode))
+		goto out_bad;
+
+	error = -ENOMEM;
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fhandle == NULL || fattr == NULL)
+		goto out_error;
+
+	/* Ask the server and compare the answer with what we have cached */
+	error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+	if (error)
+		goto out_bad;
+	if (nfs_compare_fh(NFS_FH(inode), fhandle))
+		goto out_bad;
+	if ((error = nfs_refresh_inode(inode, fattr)) != 0)
+		goto out_bad;
+
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+out_set_verifier:
+	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ out_valid:
+	dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
+			__func__, dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	return 1;
+out_zap_parent:
+	nfs_zap_caches(dir);
+ out_bad:
+	/*
+	 * Release the lookup buffers before anything else: the directory
+	 * cases below may still jump to out_valid, which does not free
+	 * them. Both pointers may be NULL here.
+	 */
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	nfs_mark_for_revalidate(dir);
+	if (inode && S_ISDIR(inode->i_mode)) {
+		/* Purge readdir caches. */
+		nfs_zap_caches(inode);
+		/* If we have submounts, don't unhash ! */
+		if (have_submounts(dentry))
+			goto out_valid;
+		if (dentry->d_flags & DCACHE_DISCONNECTED)
+			goto out_valid;
+		shrink_dcache_parent(dentry);
+	}
+	d_drop(dentry);
+	dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
+			__func__, dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	return 0;
+out_error:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
+			__func__, dentry->d_parent->d_name.name,
+			dentry->d_name.name, error);
+	return error;
+}
+
+/*
+ * This is called from dput() when d_count is going to 0.
+ *
+ * Returns 1 to have the dentry unhashed, 0 to keep it cached.
+ */
+static int nfs_dentry_delete(const struct dentry *dentry)
+{
+	int unhash = 0;
+
+	dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		dentry->d_flags);
+
+	if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) {
+		/* Unhash any dentry with a stale inode */
+		unhash = 1;
+	} else if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		/* Unhash it, so that ->d_iput() would be called */
+		unhash = 1;
+	} else if (!(dentry->d_sb->s_flags & MS_ACTIVE)) {
+		/* Unhash it, so that ancestors of killed async unlink
+		 * files will be cleaned up during umount */
+		unhash = 1;
+	}
+
+	return unhash;
+}
+
+/*
+ * Decrement the inode's link count under inode->i_lock, but only if the
+ * count is currently positive.
+ */
+static void nfs_drop_nlink(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	if (inode->i_nlink > 0) {
+		drop_nlink(inode);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Called when the dentry loses inode.
+ * We use it to clean up silly-renamed files; the inode reference is
+ * dropped with iput() in every case.
+ */
+static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		/* drop any readdir cache as it could easily be old */
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+	}
+
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		/* silly-renamed: finish the deferred unlink */
+		drop_nlink(inode);
+		nfs_complete_unlink(dentry, inode);
+	}
+
+	iput(inode);
+}
+
+/*
+ * Release private data still attached to the dentry. A leftover
+ * d_fsdata on a silly-renamed dentry is not freed and only triggers a
+ * warning; otherwise it is freed with kfree().
+ */
+static void nfs_d_release(struct dentry *dentry)
+{
+	/* free cached devname value, if it survived that far */
+	if (unlikely(dentry->d_fsdata)) {
+		if (!(dentry->d_flags & DCACHE_NFSFS_RENAMED))
+			kfree(dentry->d_fsdata);
+		else
+			WARN_ON(1);
+	}
+}
+
+/* Dentry callbacks that revalidate through nfs_lookup_revalidate() */
+const struct dentry_operations nfs_dentry_operations = {
+	.d_revalidate	= nfs_lookup_revalidate,
+	.d_automount	= nfs_d_automount,
+	.d_delete	= nfs_dentry_delete,
+	.d_iput		= nfs_dentry_iput,
+	.d_release	= nfs_d_release,
+};
+/*
+ * Look up dentry->d_name in directory 'dir' on the server and bind the
+ * result to the dentry.
+ *
+ * A name longer than the server's namelen fails with -ENAMETOOLONG. For
+ * an exclusive create the RPC is skipped and the dentry is instantiated
+ * negative without being hashed. -ENOENT from the server produces a
+ * negative dentry (inode stays NULL).
+ *
+ * Returns NULL when the passed-in dentry was used, the dentry returned
+ * by d_materialise_unique() when that is non-NULL, or an ERR_PTR().
+ */
+static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+{
+ struct dentry *res;
+ struct dentry *parent;
+ struct inode *inode = NULL;
+ struct nfs_fh *fhandle = NULL;
+ struct nfs_fattr *fattr = NULL;
+ int error;
+
+ dfprintk(VFS, "NFS: lookup(%s/%s)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name);
+ nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
+
+ res = ERR_PTR(-ENAMETOOLONG);
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ goto out;
+
+ /*
+ * If we're doing an exclusive create, optimize away the lookup
+ * but don't hash the dentry.
+ */
+ if (nfs_is_exclusive_create(dir, nd)) {
+ d_instantiate(dentry, NULL);
+ res = NULL;
+ goto out;
+ }
+
+ res = ERR_PTR(-ENOMEM);
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ if (fhandle == NULL || fattr == NULL)
+ goto out; /* whichever buffer was allocated is freed at out: */
+
+ parent = dentry->d_parent;
+ /* Protect against concurrent sillydeletes */
+ nfs_block_sillyrename(parent);
+ error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+ if (error == -ENOENT)
+ goto no_entry; /* inode is still NULL here */
+ if (error < 0) {
+ res = ERR_PTR(error);
+ goto out_unblock_sillyrename;
+ }
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ res = ERR_CAST(inode);
+ if (IS_ERR(res))
+ goto out_unblock_sillyrename;
+
+no_entry:
+ res = d_materialise_unique(dentry, inode);
+ if (res != NULL) {
+ if (IS_ERR(res))
+ goto out_unblock_sillyrename;
+ dentry = res; /* the verifier below is set on the returned dentry */
+ }
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+out_unblock_sillyrename:
+ nfs_unblock_sillyrename(parent);
+out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ return res;
+}
+
+#ifdef CONFIG_NFS_V4
+static int nfs_open_revalidate(struct dentry *, struct nameidata *);
+
+/* Same callbacks as nfs_dentry_operations, but revalidating via nfs_open_revalidate() */
+const struct dentry_operations nfs4_dentry_operations = {
+	.d_revalidate	= nfs_open_revalidate,
+	.d_automount	= nfs_d_automount,
+	.d_delete	= nfs_dentry_delete,
+	.d_iput		= nfs_dentry_iput,
+	.d_release	= nfs_d_release,
+};
+
+/*
+ * Use intent information to determine whether we need to substitute
+ * the NFSv4-style stateful OPEN for the LOOKUP call.
+ *
+ * Returns 1 if an atomic open should be attempted, 0 otherwise
+ * (including when no intent data is supplied).
+ */
+static int is_atomic_open(struct nameidata *nd)
+{
+	if (nd == NULL)
+		return 0;
+	if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0)
+		return 0;
+
+	/* NFS does not (yet) have a stateful open for directories */
+	if (nd->flags & LOOKUP_DIRECTORY)
+		return 0;
+
+	/* Are we trying to write to a read only partition? */
+	if (__mnt_is_readonly(nd->path.mnt) &&
+	    (nd->intent.open.flags & (O_CREAT|O_TRUNC|O_ACCMODE)))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Translate open(2) flags into an fmode_t: the FMODE_EXEC bit is copied
+ * from 'flags', FMODE_READ is added unless the access mode is O_WRONLY,
+ * and FMODE_WRITE is added unless the access mode is O_RDONLY.
+ */
+static fmode_t flags_to_mode(int flags)
+{
+	const int accmode = flags & O_ACCMODE;
+	fmode_t mode = (__force fmode_t)flags & FMODE_EXEC;
+
+	if (accmode != O_WRONLY)
+		mode |= FMODE_READ;
+	if (accmode != O_RDONLY)
+		mode |= FMODE_WRITE;
+
+	return mode;
+}
+
+/*
+ * Allocate an open context for 'dentry', with the mode derived from
+ * 'open_flags' by flags_to_mode(). The result of
+ * alloc_nfs_open_context() is returned unchanged.
+ */
+static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
+{
+	fmode_t fmode = flags_to_mode(open_flags);
+
+	return alloc_nfs_open_context(dentry, fmode);
+}
+
+/*
+ * Open callback handed to lookup_instantiate_filp(): sets the fscache
+ * inode cookie for the file and always reports success.
+ */
+static int do_open(struct inode *inode, struct file *filp)
+{
+	nfs_fscache_set_inode_cookie(inode, filp);
+
+	return 0;
+}
+
+/*
+ * Instantiate the struct file for an open intent and attach 'ctx' to it.
+ *
+ * The reference on 'ctx' held by the caller is dropped in every case.
+ * Returns a negative errno if the execute check or the file
+ * instantiation fails; otherwise the (non-negative) result of the
+ * execute check, or 0 when no such check was needed.
+ */
+static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx)
+{
+	struct file *filp;
+	int ret = 0;
+
+	/* If the open_intent is for execute, we have an extra check to make */
+	if (ctx->mode & FMODE_EXEC)
+		ret = nfs_may_open(ctx->dentry->d_inode,
+				ctx->cred,
+				nd->intent.open.flags);
+
+	if (ret >= 0) {
+		filp = lookup_instantiate_filp(nd, ctx->dentry, do_open);
+		if (!IS_ERR(filp))
+			nfs_file_set_open_context(filp, ctx);
+		else
+			ret = PTR_ERR(filp);
+	}
+
+	put_nfs_open_context(ctx);
+	return ret;
+}
+/*
+ * Lookup that opens the file on the server as part of the lookup when
+ * the intent allows it (see is_atomic_open()); otherwise, and for a few
+ * server errors, it falls back to nfs_lookup().
+ *
+ * Returns NULL when the passed-in dentry was used, the dentry returned
+ * by d_add_unique() when that is non-NULL, or an ERR_PTR().
+ */
+static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ struct nfs_open_context *ctx;
+ struct iattr attr;
+ struct dentry *res = NULL;
+ struct inode *inode;
+ int open_flags;
+ int err;
+
+ dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n",
+ dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+ /* Check that we are indeed trying to open this file */
+ if (!is_atomic_open(nd))
+ goto no_open;
+
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
+ res = ERR_PTR(-ENAMETOOLONG);
+ goto out;
+ }
+
+ /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
+ * the dentry. */
+ if (nd->flags & LOOKUP_EXCL) {
+ d_instantiate(dentry, NULL);
+ goto out; /* res is still NULL */
+ }
+
+ open_flags = nd->intent.open.flags;
+
+ ctx = create_nfs_open_context(dentry, open_flags);
+ res = ERR_CAST(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+
+ if (nd->flags & LOOKUP_CREATE) {
+ attr.ia_mode = nd->intent.open.create_mode;
+ attr.ia_valid = ATTR_MODE;
+ attr.ia_mode &= ~current_umask();
+ } else {
+ open_flags &= ~(O_EXCL | O_CREAT); /* not creating: strip the create flags */
+ attr.ia_valid = 0;
+ }
+
+ /* Open the file on the server */
+ nfs_block_sillyrename(dentry->d_parent);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
+ if (IS_ERR(inode)) {
+ nfs_unblock_sillyrename(dentry->d_parent);
+ put_nfs_open_context(ctx);
+ switch (PTR_ERR(inode)) {
+ /* Make a negative dentry */
+ case -ENOENT:
+ d_add(dentry, NULL);
+ res = NULL;
+ goto out;
+ /* This turned out not to be a regular file */
+ case -EISDIR:
+ case -ENOTDIR:
+ goto no_open;
+ case -ELOOP:
+ if (!(nd->intent.open.flags & O_NOFOLLOW))
+ goto no_open;
+ /* case -EINVAL: */
+ default: /* also reached from -ELOOP when O_NOFOLLOW is set */
+ res = ERR_CAST(inode);
+ goto out;
+ }
+ }
+ res = d_add_unique(dentry, inode);
+ nfs_unblock_sillyrename(dentry->d_parent);
+ if (res != NULL) {
+ dput(ctx->dentry); /* re-point the open context at the dentry actually used */
+ ctx->dentry = dget(res);
+ dentry = res;
+ }
+ err = nfs_intent_set_file(nd, ctx); /* drops the ctx reference in every case */
+ if (err < 0) {
+ if (res != NULL)
+ dput(res);
+ return ERR_PTR(err);
+ }
+out:
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ return res;
+no_open:
+ return nfs_lookup(dir, dentry, nd);
+}
+/*
+ * d_revalidate callback used by nfs4_dentry_operations.
+ *
+ * When the intent is an atomic open of a regular file, the dentry is
+ * revalidated by opening the file on the server; in every other case
+ * the work is delegated to nfs_lookup_revalidate().
+ *
+ * Returns 1 if the dentry is valid, 0 if it is not (or was dropped),
+ * -ECHILD in RCU-walk mode, or another negative errno.
+ */
+static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ struct dentry *parent = NULL;
+ struct inode *inode;
+ struct inode *dir;
+ struct nfs_open_context *ctx;
+ int openflags, ret = 0;
+
+ if (nd->flags & LOOKUP_RCU) /* NOTE(review): nd is dereferenced here, yet is_atomic_open() below accepts nd == NULL - confirm callers never pass NULL */
+ return -ECHILD;
+
+ inode = dentry->d_inode;
+ if (!is_atomic_open(nd) || d_mountpoint(dentry))
+ goto no_open;
+
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+
+ /* We can't create new files in nfs_open_revalidate(), so we
+ * optimize away revalidation of negative dentries.
+ */
+ if (inode == NULL) {
+ if (!nfs_neg_need_reval(dir, dentry, nd))
+ ret = 1;
+ goto out;
+ }
+
+ /* NFS only supports OPEN on regular files */
+ if (!S_ISREG(inode->i_mode))
+ goto no_open_dput;
+ openflags = nd->intent.open.flags;
+ /* We cannot do exclusive creation on a positive dentry */
+ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
+ goto no_open_dput;
+ /* We can't create new files, or truncate existing ones here */
+ openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
+
+ ctx = create_nfs_open_context(dentry, openflags);
+ ret = PTR_ERR(ctx);
+ if (IS_ERR(ctx))
+ goto out;
+ /*
+ * Note: we're not holding inode->i_mutex and so may be racing with
+ * operations that change the directory. We therefore save the
+ * change attribute *before* we do the RPC call.
+ *
+ * NOTE(review): in the code below nfs_save_change_attribute() is
+ * only called after open_context() has returned - confirm whether
+ * this comment or the code is the intended behaviour.
+ */
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ switch (ret) {
+ case -EPERM:
+ case -EACCES:
+ case -EDQUOT:
+ case -ENOSPC:
+ case -EROFS:
+ goto out_put_ctx; /* returned as-is, dentry is not dropped */
+ default:
+ goto out_drop;
+ }
+ }
+ iput(inode);
+ if (inode != dentry->d_inode) /* pointer comparison only; inode is not dereferenced after iput() */
+ goto out_drop;
+
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ ret = nfs_intent_set_file(nd, ctx);
+ if (ret >= 0)
+ ret = 1;
+out:
+ dput(parent);
+ return ret;
+out_drop:
+ d_drop(dentry);
+ ret = 0;
+out_put_ctx:
+ put_nfs_open_context(ctx);
+ goto out;
+
+no_open_dput:
+ dput(parent);
+no_open:
+ return nfs_lookup_revalidate(dentry, nd);
+}
+
+/*
+ * Create a file through an open context.
+ *
+ * The open flags come from the intent data when 'nd' is supplied and
+ * default to O_CREAT|O_EXCL otherwise. If the context allocation or
+ * the create itself fails, the dentry is dropped. With intent data the
+ * context is handed to nfs_intent_set_file(); without it the context
+ * is simply released.
+ *
+ * Returns 0 on success or an error code.
+ */
+static int nfs_open_create(struct inode *dir, struct dentry *dentry,
+		umode_t mode, struct nameidata *nd)
+{
+	struct nfs_open_context *ctx = NULL;
+	struct iattr attr;
+	int open_flags = O_CREAT|O_EXCL;
+	int error;
+
+	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+
+	if (nd)
+		open_flags = nd->intent.open.flags;
+
+	ctx = create_nfs_open_context(dentry, open_flags);
+	if (IS_ERR(ctx)) {
+		error = PTR_ERR(ctx);
+		d_drop(dentry);
+		return error;
+	}
+
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx);
+	if (error != 0) {
+		put_nfs_open_context(ctx);
+		d_drop(dentry);
+		return error;
+	}
+
+	if (!nd) {
+		put_nfs_open_context(ctx);
+		return 0;
+	}
+
+	/* nfs_intent_set_file() releases ctx itself */
+	error = nfs_intent_set_file(nd, ctx);
+	if (error < 0)
+		return error;
+	return 0;
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Code common to create, mkdir, and mknod.
+ *
+ * Binds a new inode, built from 'fhandle' and 'fattr', to 'dentry'.
+ * The dentry is unhashed first. If it already has an inode, nothing
+ * more is done. An empty file handle is filled in by a LOOKUP and
+ * missing attributes by a GETATTR before the inode is obtained with
+ * nfs_fhget().
+ *
+ * Returns 0 on success; on failure the parent directory is marked for
+ * revalidation and the error code is returned.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+ int error = -EACCES; /* reassigned before every path that returns it */
+
+ d_drop(dentry);
+
+ /* We may have been initialized further down */
+ if (dentry->d_inode)
+ goto out;
+ if (fhandle->size == 0) {
+ error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
+ if (error)
+ goto out_error;
+ }
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_server *server = NFS_SB(dentry->d_sb);
+ error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
+ if (error < 0)
+ goto out_error;
+ }
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+ error = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_error;
+ d_add(dentry, inode); /* rehashes the dentry dropped above */
+out:
+ dput(parent);
+ return 0;
+out_error:
+ nfs_mark_for_revalidate(dir);
+ dput(parent);
+ return error;
+}
+
+/*
+ * Create a regular file.
+ *
+ * Following a failed create operation, we drop the dentry rather
+ * than retain a negative dentry. This avoids a problem in the event
+ * that the operation succeeded on the server, but an error in the
+ * reply path made it appear to have failed.
+ *
+ * The open flags are taken from the intent data when 'nd' is given,
+ * and default to O_CREAT|O_EXCL otherwise. Returns 0 or an error code.
+ */
+static int nfs_create(struct inode *dir, struct dentry *dentry,
+		umode_t mode, struct nameidata *nd)
+{
+	int open_flags = O_CREAT|O_EXCL;
+	struct iattr attr;
+	int error;
+
+	dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+
+	if (nd)
+		open_flags = nd->intent.open.flags;
+
+	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
+	if (error != 0)
+		d_drop(dentry);
+	return error;
+}
+
+/*
+ * Create a special file. An 'rdev' rejected by new_valid_dev() fails
+ * with -EINVAL before anything is sent to the server.
+ *
+ * As with create, the dentry is dropped when the operation fails; see
+ * comments for nfs_proc_create regarding failed operations.
+ */
+static int
+nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+	struct iattr attr;
+	int status;
+
+	dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	attr.ia_mode = mode;
+	attr.ia_valid = ATTR_MODE;
+
+	status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
+	if (status != 0)
+		d_drop(dentry);
+	return status;
+}
+
+/*
+ * Create a directory; S_IFDIR is added to the requested mode.
+ *
+ * As with create, the dentry is dropped when the operation fails; see
+ * comments for nfs_proc_create regarding failed operations.
+ */
+static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct iattr attr;
+	int error;
+
+	dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+	attr.ia_valid = ATTR_MODE;
+	attr.ia_mode = mode | S_IFDIR;
+
+	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+	if (error != 0)
+		d_drop(dentry);
+	return error;
+}
+
+/*
+ * The server reported -ENOENT for this name: if the dentry still has an
+ * inode and is still hashed, delete it.
+ */
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+	if (dentry->d_inode == NULL)
+		return;
+	if (d_unhashed(dentry))
+		return;
+	d_delete(dentry);
+}
+
+/*
+ * Remove a directory on the server. On success the link count of the
+ * dentry's inode (if any) is cleared; on -ENOENT a still-cached dentry
+ * is deleted. Returns the status of the rmdir operation.
+ */
+static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int error;
+
+	dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
+			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+
+	error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
+	if (error == -ENOENT) {
+		nfs_dentry_handle_enoent(dentry);
+	} else if (error == 0 && dentry->d_inode != NULL) {
+		/* Ensure the VFS deletes this inode */
+		clear_nlink(dentry->d_inode);
+	}
+
+	return error;
+}
+
+/*
+ * Remove a file on the server.
+ *
+ * A silly-renamed dentry is not removed here; 0 is returned so that the
+ * caller simply calls d_delete(). For a positive dentry the delegation
+ * is returned before the REMOVE, the link count is dropped on success
+ * and the inode is marked for revalidation. -ENOENT from the server
+ * causes a still-cached dentry to be deleted.
+ *
+ * Returns the status of the remove operation.
+ */
+static int nfs_safe_remove(struct dentry *dentry)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	dfprintk(VFS, "NFS: safe_remove(%s/%s)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	/* If the dentry was sillyrenamed, we simply call d_delete() */
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		return 0;
+
+	if (inode == NULL) {
+		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+	} else {
+		nfs_inode_return_delegation(inode);
+		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+		/* The VFS may want to delete this inode */
+		if (error == 0)
+			nfs_drop_nlink(inode);
+		nfs_mark_for_revalidate(inode);
+	}
+
+	if (error == -ENOENT)
+		nfs_dentry_handle_enoent(dentry);
+	return error;
+}
+
+/*
+ * Unlink a file.
+ *
+ * If the dentry has other users (d_count > 1) a silly rename is done
+ * instead; in case sillyrename() returns -EBUSY, the inode belongs to
+ * an active ".nfs..." file and we return -EBUSY.
+ *
+ * Otherwise the dentry is unhashed and the file is removed with
+ * nfs_safe_remove(). If that fails with anything other than -ENOENT,
+ * a dentry unhashed here is hashed again.
+ */
+static int nfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int need_rehash;
+	int error;
+
+	dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
+		dir->i_ino, dentry->d_name.name);
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_count > 1) {
+		spin_unlock(&dentry->d_lock);
+		/* Start asynchronous writeout of the inode */
+		write_inode_now(dentry->d_inode, 0);
+		return nfs_sillyrename(dir, dentry);
+	}
+	need_rehash = !d_unhashed(dentry);
+	if (need_rehash)
+		__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+
+	error = nfs_safe_remove(dentry);
+	if (error == 0 || error == -ENOENT)
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+	else if (need_rehash)
+		d_rehash(dentry);
+	return error;
+}
+
+/*
+ * To create a symbolic link, most file systems instantiate a new inode,
+ * add a page to it containing the path, then write it out to the disk
+ * using prepare_write/commit_write.
+ *
+ * Unfortunately the NFS client can't create the in-core inode first
+ * because it needs a file handle to create an in-core inode (see
+ * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the
+ * symlink request has completed on the server.
+ *
+ * So instead we allocate a raw page, copy the symname into it, then do
+ * the SYMLINK request with the page as the buffer. If it succeeds, we
+ * now have a new file handle and can instantiate an in-core NFS inode
+ * and move the raw page into its mapping.
+ *
+ * Returns 0 on success, -ENAMETOOLONG if symname is longer than
+ * PAGE_SIZE, -ENOMEM if no page could be allocated, or the error from
+ * the SYMLINK operation (in which case the dentry is dropped).
+ */
+static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+ struct pagevec lru_pvec;
+ struct page *page;
+ char *kaddr;
+ struct iattr attr;
+ unsigned int pathlen = strlen(symname);
+ int error;
+
+ dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
+ dir->i_ino, dentry->d_name.name, symname);
+
+ if (pathlen > PAGE_SIZE)
+ return -ENAMETOOLONG;
+
+ attr.ia_mode = S_IFLNK | S_IRWXUGO;
+ attr.ia_valid = ATTR_MODE;
+
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ return -ENOMEM;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(kaddr, symname, pathlen);
+ if (pathlen < PAGE_SIZE)
+ memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); /* zero the rest of the page */
+ kunmap_atomic(kaddr, KM_USER0);
+
+ error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ if (error != 0) {
+ dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
+ dir->i_sb->s_id, dir->i_ino,
+ dentry->d_name.name, symname, error);
+ d_drop(dentry);
+ __free_page(page);
+ return error;
+ }
+
+ /*
+ * No big deal if we can't add this page to the page cache here.
+ * READLINK will get the missing page from the server if needed.
+ */
+ pagevec_init(&lru_pvec, 0);
+ if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+ GFP_KERNEL)) {
+ pagevec_add(&lru_pvec, page);
+ pagevec_lru_add_file(&lru_pvec);
+ SetPageUptodate(page);
+ unlock_page(page);
+ } else
+ __free_page(page); /* insertion failed: still a success for the caller */
+
+ return 0;
+}
+
+/*
+ * Create a hard link named by 'dentry' in 'dir' to the inode behind
+ * 'old_dentry'. Any delegation on the inode is returned first and the
+ * new dentry is unhashed before the LINK call. On success an extra
+ * inode reference is taken and the dentry is added back with that
+ * inode. Returns the status of the link operation.
+ */
+static int
+nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	int error;
+
+	dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n",
+		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	nfs_inode_return_delegation(inode);
+
+	d_drop(dentry);
+	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
+	if (error != 0)
+		return error;
+
+	ihold(inode);
+	d_add(dentry, inode);
+	return 0;
+}
+
+/*
+ * RENAME
+ * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
+ * different file handle for the same inode after a rename (e.g. when
+ * moving to a different directory). A fail-safe method to do so would
+ * be to look up old_dir/old_name, create a link to new_dir/new_name and
+ * rename the old file using the sillyrename stuff. This way, the original
+ * file in old_dir will go away when the last process iput()s the inode.
+ *
+ * FIXED.
+ *
+ * It actually works quite well. One needs to have the possibility for
+ * at least one ".nfs..." file in each directory the file ever gets
+ * moved or linked to which happens automagically with the new
+ * implementation that only depends on the dcache stuff instead of
+ * using the inode layer
+ *
+ * Unfortunately, things are a little more complicated than indicated
+ * above. For a cross-directory move, we want to make sure we can get
+ * rid of the old inode after the operation. This means there must be
+ * no pending writes (if it's a file), and the use count must be 1.
+ * If these conditions are met, we can drop the dentries before doing
+ * the rename.
+ *
+ * Returns 0 on success, the error from the RENAME operation, or -EBUSY
+ * when a busy target could not be moved out of the way (the dentry copy
+ * could not be allocated or the silly-rename failed).
+ */
+static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct dentry *dentry = NULL, *rehash = NULL;
+ int error = -EBUSY;
+
+ dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
+ old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
+ new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
+ new_dentry->d_count);
+
+ /*
+ * For non-directories, check whether the target is busy and if so,
+ * make a copy of the dentry and then do a silly-rename. If the
+ * silly-rename succeeds, the copied dentry is hashed and becomes
+ * the new target.
+ */
+ if (new_inode && !S_ISDIR(new_inode->i_mode)) {
+ /*
+ * To prevent any new references to the target during the
+ * rename, we unhash the dentry in advance.
+ */
+ if (!d_unhashed(new_dentry)) {
+ d_drop(new_dentry);
+ rehash = new_dentry; /* hashed again at out: unless replaced below */
+ }
+
+ if (new_dentry->d_count > 2) {
+ int err;
+
+ /* copy the target dentry's name */
+ dentry = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!dentry)
+ goto out; /* error is still -EBUSY */
+
+ /* silly-rename the existing target ... */
+ err = nfs_sillyrename(new_dir, new_dentry);
+ if (err)
+ goto out; /* err is discarded; -EBUSY is returned */
+
+ new_dentry = dentry;
+ rehash = NULL;
+ new_inode = NULL;
+ }
+ }
+
+ nfs_inode_return_delegation(old_inode);
+ if (new_inode != NULL)
+ nfs_inode_return_delegation(new_inode);
+
+ error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
+ new_dir, &new_dentry->d_name);
+ nfs_mark_for_revalidate(old_inode);
+out:
+ if (rehash)
+ d_rehash(rehash);
+ if (!error) {
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode); /* the replaced target loses a link */
+ d_move(old_dentry, new_dentry);
+ nfs_set_verifier(new_dentry,
+ nfs_save_change_attribute(new_dir));
+ } else if (error == -ENOENT)
+ nfs_dentry_handle_enoent(old_dentry);
+
+ /* new dentry created? */
+ if (dentry)
+ dput(dentry);
+ return error;
+}
+
+static DEFINE_SPINLOCK(nfs_access_lru_lock); /* held by the shrinker while it walks nfs_access_lru_list */
+static LIST_HEAD(nfs_access_lru_list); /* nfs_inodes, linked through access_cache_inode_lru */
+static atomic_long_t nfs_access_nr_entries; /* decremented for each entry freed; read by the shrinker */
+
+/*
+ * Dispose of one access cache entry: drop its credential reference,
+ * free the entry and decrement the global entry counter. The decrement
+ * is bracketed by memory barriers.
+ */
+static void nfs_access_free_entry(struct nfs_access_entry *entry)
+{
+	put_rpccred(entry->cred);
+	kfree(entry);
+
+	smp_mb__before_atomic_dec();
+	atomic_long_dec(&nfs_access_nr_entries);
+	smp_mb__after_atomic_dec();
+}
+
+/*
+ * Free every access cache entry queued on @head.  Each entry is unlinked
+ * from the list before it is released, so @head is empty on return.
+ */
+static void nfs_access_free_list(struct list_head *head)
+{
+	struct nfs_access_entry *victim, *following;
+
+	list_for_each_entry_safe(victim, following, head, lru) {
+		list_del(&victim->lru);
+		nfs_access_free_entry(victim);
+	}
+}
+
+/*
+ * Shrinker callback for the NFS access cache.
+ *
+ * Visits up to sc->nr_to_scan inodes on the global LRU and evicts the
+ * entry at the head of each inode's own entry list (entries are added
+ * and refreshed at the tail, so the head is the least recently used).
+ * Evicted entries are collected on a private list and freed only after
+ * both locks have been dropped.
+ *
+ * If sc->gfp_mask does not include all GFP_KERNEL bits, nothing is
+ * scanned: returns 0 when nr_to_scan is 0 and -1 otherwise.  In all other
+ * cases returns (number of entries / 100) * sysctl_vfs_cache_pressure.
+ */
+int nfs_access_cache_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ LIST_HEAD(head);
+ struct nfs_inode *nfsi, *next;
+ struct nfs_access_entry *cache;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
+ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return (nr_to_scan == 0) ? 0 : -1;
+
+ /* Lock order: global LRU lock first, then the per-inode i_lock */
+ spin_lock(&nfs_access_lru_lock);
+ list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
+ struct inode *inode;
+
+ if (nr_to_scan-- == 0)
+ break;
+ inode = &nfsi->vfs_inode;
+ spin_lock(&inode->i_lock);
+ if (list_empty(&nfsi->access_cache_entry_lru))
+ goto remove_lru_entry;
+ /* Detach this inode's first entry from both its LRU and its rbtree */
+ cache = list_entry(nfsi->access_cache_entry_lru.next,
+ struct nfs_access_entry, lru);
+ list_move(&cache->lru, &head);
+ rb_erase(&cache->rb_node, &nfsi->access_cache);
+ /* Inode still has entries: rotate it to the tail of the global LRU */
+ if (!list_empty(&nfsi->access_cache_entry_lru))
+ list_move_tail(&nfsi->access_cache_inode_lru,
+ &nfs_access_lru_list);
+ else {
+ /* No entries left: take the inode off the global LRU and clear its flag */
+remove_lru_entry:
+ list_del_init(&nfsi->access_cache_inode_lru);
+ smp_mb__before_clear_bit();
+ clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
+ smp_mb__after_clear_bit();
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
+ return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
+}
+
+/*
+ * Empty @nfsi's access rbtree, moving every entry onto @head so the
+ * caller can free them later, then clear NFS_INO_INVALID_ACCESS in
+ * cache_validity.  Takes no locks itself.
+ */
+static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
+{
+	struct rb_root *tree = &nfsi->access_cache;
+	struct rb_node *node;
+
+	/* Keep pulling the leftmost node until the tree is empty */
+	for (node = rb_first(tree); node != NULL; node = rb_first(tree)) {
+		struct nfs_access_entry *victim;
+
+		victim = rb_entry(node, struct nfs_access_entry, rb_node);
+		rb_erase(node, tree);
+		list_move(&victim->lru, head);
+	}
+	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+}
+
+/*
+ * Discard all cached access entries for @inode.
+ *
+ * Returns immediately if the inode is not flagged as being on the global
+ * access LRU (NFS_INO_ACL_LRU_SET clear).  Otherwise removes the inode
+ * from the global LRU, empties its rbtree under inode->i_lock, and frees
+ * the entries once both locks have been released.
+ */
+void nfs_access_zap_cache(struct inode *inode)
+{
+ LIST_HEAD(head);
+
+ /* Unlocked fast-path check; re-tested under the lock below */
+ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
+ return;
+ /* Remove from global LRU init */
+ spin_lock(&nfs_access_lru_lock);
+ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_del_init(&NFS_I(inode)->access_cache_inode_lru);
+
+ /* i_lock nests inside nfs_access_lru_lock, as in the shrinker */
+ spin_lock(&inode->i_lock);
+ __nfs_access_zap_cache(NFS_I(inode), &head);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
+}
+
+/*
+ * Look up the access cache entry for @cred in @inode's rbtree.  The tree
+ * is keyed on the credential pointer value.  Returns the matching entry,
+ * or NULL if there is none.  Takes no locks itself.
+ */
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
+{
+	struct rb_node *node;
+
+	for (node = NFS_I(inode)->access_cache.rb_node; node != NULL; ) {
+		struct nfs_access_entry *candidate =
+			rb_entry(node, struct nfs_access_entry, rb_node);
+
+		if (cred == candidate->cred)
+			return candidate;
+		node = (cred < candidate->cred) ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Try to satisfy an access check from the per-inode cache.
+ *
+ * On a hit, copies the cached jiffies/cred/mask into @res, moves the
+ * entry to the tail of the inode's entry LRU and returns 0.
+ *
+ * Returns -ENOENT when:
+ *  - the whole cache is marked NFS_INO_INVALID_ACCESS (the cache is then
+ *    zapped),
+ *  - there is no entry for @cred, or
+ *  - the entry is older than nfsi->attrtimeo and the inode has no
+ *    delegated attributes (the stale entry is then removed and freed).
+ */
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_access_entry *cache;
+ int err = -ENOENT;
+
+ spin_lock(&inode->i_lock);
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out_zap;
+ cache = nfs_access_search_rbtree(inode, cred);
+ if (cache == NULL)
+ goto out;
+ if (!nfs_have_delegated_attributes(inode) &&
+ !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ goto out_stale;
+ res->jiffies = cache->jiffies;
+ res->cred = cache->cred;
+ res->mask = cache->mask;
+ /* Mark as most recently used */
+ list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
+ err = 0;
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+out_stale:
+ rb_erase(&cache->rb_node, &nfsi->access_cache);
+ list_del(&cache->lru);
+ spin_unlock(&inode->i_lock);
+ nfs_access_free_entry(cache);
+ return -ENOENT;
+out_zap:
+ /* i_lock must be dropped first: nfs_access_zap_cache() takes it itself */
+ spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ return -ENOENT;
+}
+
+/*
+ * Insert @set into @inode's access rbtree (keyed on the credential
+ * pointer) and append it to the inode's entry LRU, all under
+ * inode->i_lock.  If an entry for the same credential already exists it
+ * is replaced in the tree, unlinked from the LRU, and freed after the
+ * lock is dropped.
+ */
+static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct rb_root *root_node = &nfsi->access_cache;
+ struct rb_node **p = &root_node->rb_node;
+ struct rb_node *parent = NULL;
+ struct nfs_access_entry *entry;
+
+ spin_lock(&inode->i_lock);
+ /* Descend to the insertion point, or stop at an existing match */
+ while (*p != NULL) {
+ parent = *p;
+ entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+
+ if (set->cred < entry->cred)
+ p = &parent->rb_left;
+ else if (set->cred > entry->cred)
+ p = &parent->rb_right;
+ else
+ goto found;
+ }
+ rb_link_node(&set->rb_node, parent, p);
+ rb_insert_color(&set->rb_node, root_node);
+ list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+ spin_unlock(&inode->i_lock);
+ return;
+found:
+ /* Here parent is the node of the existing entry with the same cred */
+ rb_replace_node(parent, &set->rb_node, root_node);
+ list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+ list_del(&entry->lru);
+ spin_unlock(&inode->i_lock);
+ nfs_access_free_entry(entry);
+}
+
+/*
+ * Cache the access result in @set for @inode.
+ *
+ * A heap copy of @set is made (taking its own reference on the
+ * credential), inserted into the inode's rbtree, and counted in
+ * nfs_access_nr_entries.  The inode is put on the global access LRU if it
+ * is not already there.  Caching is best effort: if the allocation fails
+ * the function returns silently without caching anything.
+ */
+static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+{
+ struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+ if (cache == NULL)
+ return;
+ RB_CLEAR_NODE(&cache->rb_node);
+ cache->jiffies = set->jiffies;
+ cache->cred = get_rpccred(set->cred);
+ cache->mask = set->mask;
+
+ nfs_access_add_rbtree(inode, cache);
+
+ /* Update accounting */
+ smp_mb__before_atomic_inc();
+ atomic_long_inc(&nfs_access_nr_entries);
+ smp_mb__after_atomic_inc();
+
+ /* Add inode to global LRU list */
+ /* Unlocked test first; test_and_set_bit under the lock settles races */
+ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
+ spin_lock(&nfs_access_lru_lock);
+ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
+ &nfs_access_lru_list);
+ spin_unlock(&nfs_access_lru_lock);
+ }
+}
+
+/*
+ * Decide whether @cred is allowed the rights in @mask on @inode.
+ *
+ * The access cache is consulted first.  On a miss the server is asked, via
+ * the protocol's ->access() op, about read, write and execute together,
+ * and the answer is cached.
+ *
+ * Returns 0 if every MAY_READ/MAY_WRITE/MAY_EXEC bit requested in @mask is
+ * granted, -EACCES if not, or the error returned by ->access().  On
+ * -ESTALE the inode's caches are zapped and, for non-directories, the
+ * inode is flagged NFS_INO_STALE.
+ */
+static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
+{
+ struct nfs_access_entry cache;
+ int status;
+
+ status = nfs_access_get_cached(inode, cred, &cache);
+ if (status == 0)
+ goto out;
+
+ /* Be clever: ask server to check for all possible rights */
+ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
+ cache.cred = cred;
+ cache.jiffies = jiffies;
+ status = NFS_PROTO(inode)->access(inode, &cache);
+ if (status != 0) {
+ if (status == -ESTALE) {
+ nfs_zap_caches(inode);
+ if (!S_ISDIR(inode->i_mode))
+ set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ }
+ return status;
+ }
+ nfs_access_add_cache(inode, &cache);
+out:
+ /* Any requested R/W/X bit missing from cache.mask means denial */
+ if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+ return 0;
+ return -EACCES;
+}
+
+/*
+ * Translate open(2) flags into the MAY_* permission bits an access check
+ * needs: MAY_READ unless the access mode is O_WRONLY, MAY_WRITE unless it
+ * is O_RDONLY, and MAY_EXEC when __FMODE_EXEC is set.
+ */
+static int nfs_open_permission_mask(int openflags)
+{
+	const int accmode = openflags & O_ACCMODE;
+	int wanted;
+
+	wanted = (accmode == O_WRONLY) ? 0 : MAY_READ;
+	wanted |= (accmode == O_RDONLY) ? 0 : MAY_WRITE;
+	if (openflags & __FMODE_EXEC)
+		wanted |= MAY_EXEC;
+	return wanted;
+}
+
+/*
+ * Check whether @cred may open @inode with the given open flags, by
+ * converting the flags to a MAY_* mask and running the regular access
+ * check.  Returns whatever nfs_do_access() returns.
+ */
+int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
+{
+	int wanted = nfs_open_permission_mask(openflags);
+
+	return nfs_do_access(inode, cred, wanted);
+}
+
+/*
+ * Permission check for NFS inodes.
+ *
+ * Returns -ECHILD when MAY_NOT_BLOCK is set in @mask.  Otherwise the
+ * check is skipped (result 0, subject to the execute_ok() test at the
+ * end) when:
+ *  - none of MAY_READ/MAY_WRITE/MAY_EXEC is requested,
+ *  - the inode is a symlink,
+ *  - it is a regular file being opened without MAY_EXEC on a server with
+ *    NFS_CAP_ATOMIC_OPEN, or
+ *  - it is a directory and the request is for write without read.
+ * Requests carrying MAY_ACCESS or MAY_CHDIR always go to the full check.
+ *
+ * The full check uses nfs_do_access() with the caller's RPC credential.
+ * If the protocol has no ->access() op, the inode is revalidated and
+ * generic_permission() decides.
+ *
+ * Returns 0 if access is allowed, otherwise a negative errno.
+ */
+int nfs_permission(struct inode *inode, int mask)
+{
+ struct rpc_cred *cred;
+ int res = 0;
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ nfs_inc_stats(inode, NFSIOS_VFSACCESS);
+
+ if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+ goto out;
+ /* Is this sys_access() ? */
+ if (mask & (MAY_ACCESS | MAY_CHDIR))
+ goto force_lookup;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFLNK:
+ goto out;
+ case S_IFREG:
+ /* NFSv4 has atomic_open... */
+ if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
+ && (mask & MAY_OPEN)
+ && !(mask & MAY_EXEC))
+ goto out;
+ break;
+ case S_IFDIR:
+ /*
+ * Optimize away all write operations, since the server
+ * will check permissions when we perform the op.
+ */
+ if ((mask & MAY_WRITE) && !(mask & MAY_READ))
+ goto out;
+ }
+
+force_lookup:
+ if (!NFS_PROTO(inode)->access)
+ goto out_notsup;
+
+ cred = rpc_lookup_cred();
+ if (!IS_ERR(cred)) {
+ res = nfs_do_access(inode, cred, mask);
+ put_rpccred(cred);
+ } else
+ res = PTR_ERR(cred);
+out:
+ /* Even when the check above passed, exec needs execute_ok() to agree */
+ if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
+ res = -EACCES;
+
+ dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
+ inode->i_sb->s_id, inode->i_ino, mask, res);
+ return res;
+out_notsup:
+ /* No ->access() op: fall back to a local check on revalidated attributes */
+ res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (res == 0)
+ res = generic_permission(inode, mask);
+ goto out;
+}
+
+/*
+ * Local variables:
+ * version-control: t
+ * kept-new-versions: 5
+ * End:
+ */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
new file mode 100644
index 00000000000..1940f1a56a5
--- /dev/null
+++ b/fs/nfs/direct.c
@@ -0,0 +1,1039 @@
+/*
+ * linux/fs/nfs/direct.c
+ *
+ * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
+ *
+ * High-performance uncached I/O for the Linux NFS client
+ *
+ * There are important applications whose performance or correctness
+ * depends on uncached access to file data. Database clusters
+ * (multiple copies of the same instance running on separate hosts)
+ * implement their own cache coherency protocol that subsumes file
+ * system cache protocols. Applications that process datasets
+ * considerably larger than the client's memory do not always benefit
+ * from a local cache. A streaming video server, for instance, has no
+ * need to cache the contents of a file.
+ *
+ * When an application requests uncached I/O, all read and write requests
+ * are made directly to the server; data stored or fetched via these
+ * requests is not cached in the Linux page cache. The client does not
+ * correct unaligned requests from applications. All requested bytes are
+ * held on permanent storage before a direct write system call returns to
+ * an application.
+ *
+ * Solaris implements an uncached I/O facility called directio() that
+ * is used for backups and sequential I/O to very large files. Solaris
+ * also supports uncaching whole NFS partitions with "-o forcedirectio,"
+ * an undocumented mount option.
+ *
+ * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
+ * help from Andrew Morton.
+ *
+ * 18 Dec 2001 Initial implementation for 2.4 --cel
+ * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
+ * 08 Jun 2003 Port to 2.5 APIs --cel
+ * 31 Mar 2004 Handle direct I/O without VFS support --cel
+ * 15 Sep 2004 Parallel async reads --cel
+ * 04 May 2005 support O_DIRECT with aio --cel
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/clnt.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/atomic.h>
+
+#include "internal.h"
+#include "iostat.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+/* Slab cache from which struct nfs_direct_req objects are allocated and freed. */
+static struct kmem_cache *nfs_direct_cachep;
+
+/*
+ * This represents a set of asynchronous requests that we're waiting on.
+ *
+ * Two counters are involved: kref decides when the structure itself is
+ * freed, while io_count tracks outstanding I/O and triggers completion
+ * processing when put_dreq() brings it to zero.
+ */
+struct nfs_direct_req {
+ struct kref kref; /* release manager */
+
+ /* I/O parameters */
+ struct nfs_open_context *ctx; /* file open context info */
+ struct nfs_lock_context *l_ctx; /* Lock context info */
+ struct kiocb * iocb; /* controlling i/o request */
+ struct inode * inode; /* target file of i/o */
+
+ /* completion state */
+ atomic_t io_count; /* i/os we're waiting for */
+ spinlock_t lock; /* protect completion state */
+ ssize_t count, /* bytes actually processed */
+ error; /* any reported error */
+ struct completion completion; /* wait for i/o completion */
+
+ /* commit state */
+ struct list_head rewrite_list; /* saved nfs_write_data structs */
+ struct nfs_write_data * commit_data; /* special write_data for commits */
+ int flags; /* 0, or one of the NFS_ODIRECT_* values below */
+#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+ struct nfs_writeverf verf; /* unstable write verifier */
+};
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+static const struct rpc_call_ops nfs_write_direct_ops;
+
+/* Account for one more outstanding I/O on @dreq. */
+static inline void get_dreq(struct nfs_direct_req *dreq)
+{
+ atomic_inc(&dreq->io_count);
+}
+
+/*
+ * Drop one outstanding I/O from @dreq.  Returns non-zero when this was the
+ * last one, in which case the caller runs the completion processing.
+ */
+static inline int put_dreq(struct nfs_direct_req *dreq)
+{
+ return atomic_dec_and_test(&dreq->io_count);
+}
+
+/**
+ * nfs_direct_IO - NFS address space operation for direct I/O
+ * @rw: direction (read or write)
+ * @iocb: target I/O control block
+ * @iov: array of vectors that define I/O buffer
+ * @pos: offset in file to begin the operation
+ * @nr_segs: size of iovec array
+ *
+ * The presence of this routine in the address space ops vector means
+ * the NFS client supports direct I/O. However, we shunt off direct
+ * read and write requests before the VFS gets them, so this method
+ * should never be called.
+ *
+ * Logs the unexpected call via dprintk() and always returns -EINVAL.
+ */
+ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
+{
+ dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
+ iocb->ki_filp->f_path.dentry->d_name.name,
+ (long long) pos, nr_segs);
+
+ return -EINVAL;
+}
+
+/*
+ * Mark dirty the pages of @pages that cover @count bytes starting at byte
+ * offset @pgbase into the vector.  Compound pages are skipped.  Does
+ * nothing when @count is zero.
+ */
+static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count)
+{
+	struct page **cursor;
+	unsigned int remaining;
+
+	if (!count)
+		return;
+
+	/* Skip whole pages in front of pgbase, then round the span up to pages */
+	cursor = pages + (pgbase >> PAGE_SHIFT);
+	remaining = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (; remaining != 0; remaining--, cursor++) {
+		if (PageCompound(*cursor))
+			continue;
+		set_page_dirty(*cursor);
+	}
+}
+
+/* Drop the reference held on each of the first @npages pages in @pages. */
+static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
+{
+	struct page **cursor = pages;
+	unsigned int remaining = npages;
+
+	while (remaining != 0) {
+		page_cache_release(*cursor);
+		cursor++;
+		remaining--;
+	}
+}
+
+/*
+ * Allocate and initialise a direct I/O request.
+ *
+ * The request is returned holding TWO kref references (kref_init() plus
+ * kref_get()).  In this file one is dropped by the submitting function
+ * (nfs_direct_read()/nfs_direct_write()) and the other by
+ * nfs_direct_complete(), or by the schedule_iovec helpers when no I/O
+ * could be started.
+ *
+ * Note that inode, commit_data and verf are not initialised here.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
+{
+ struct nfs_direct_req *dreq;
+
+ dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
+ if (!dreq)
+ return NULL;
+
+ kref_init(&dreq->kref);
+ kref_get(&dreq->kref);
+ init_completion(&dreq->completion);
+ INIT_LIST_HEAD(&dreq->rewrite_list);
+ dreq->iocb = NULL;
+ dreq->ctx = NULL;
+ dreq->l_ctx = NULL;
+ spin_lock_init(&dreq->lock);
+ atomic_set(&dreq->io_count, 0);
+ dreq->count = 0;
+ dreq->error = 0;
+ dreq->flags = 0;
+
+ return dreq;
+}
+
+/*
+ * kref release callback: runs when the last reference to the request is
+ * dropped.  Releases the lock and open contexts, if set, and returns the
+ * request to its slab cache.  commit_data is not touched here.
+ */
+static void nfs_direct_req_free(struct kref *kref)
+{
+ struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+
+ if (dreq->l_ctx != NULL)
+ nfs_put_lock_context(dreq->l_ctx);
+ if (dreq->ctx != NULL)
+ put_nfs_open_context(dreq->ctx);
+ kmem_cache_free(nfs_direct_cachep, dreq);
+}
+
+/* Drop one kref reference on @dreq; the last drop frees it via nfs_direct_req_free(). */
+static void nfs_direct_req_release(struct nfs_direct_req *dreq)
+{
+ kref_put(&dreq->kref, nfs_direct_req_free);
+}
+
+/*
+ * Wait for a direct I/O request and report its outcome.
+ *
+ * Requests with an iocb attached (async) are not waited for: -EIOCBQUEUED
+ * is returned straight away.  Otherwise sleeps killably until the request
+ * completes and returns, in order of precedence, the wait error, the
+ * request's recorded error, or the number of bytes processed.
+ */
+static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
+{
+	ssize_t status;
+
+	/* Async requests don't wait here */
+	if (dreq->iocb)
+		return -EIOCBQUEUED;
+
+	status = wait_for_completion_killable(&dreq->completion);
+	if (status)
+		return status;
+
+	status = dreq->error;
+	if (status == 0)
+		status = dreq->count;
+	return status;
+}
+
+/*
+ * Finish a direct I/O request once all of its I/O is done.
+ *
+ * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
+ * dreq->iocb is therefore only set for async requests, and only those
+ * are completed through aio_complete(), with the error if there is one
+ * and the byte count otherwise.
+ *
+ * Wakes any waiter in nfs_direct_wait() and drops one reference on the
+ * request.
+ */
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
+{
+ if (dreq->iocb) {
+ long res = (long) dreq->error;
+ if (!res)
+ res = (long) dreq->count;
+ aio_complete(dreq->iocb, res, 0);
+ }
+ complete_all(&dreq->completion);
+
+ nfs_direct_req_release(dreq);
+}
+
+/*
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete. This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ *
+ * This is the rpc_call_done callback for direct reads; it only forwards
+ * to nfs_readpage_result().  The pages are released in
+ * nfs_direct_read_release().
+ */
+static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
+{
+ struct nfs_read_data *data = calldata;
+
+ nfs_readpage_result(task, data);
+}
+
+/*
+ * rpc_release callback for one direct READ.
+ *
+ * Folds the RPC's outcome into the owning request under dreq->lock: a
+ * negative task status is stored in dreq->error, otherwise the bytes read
+ * are added to dreq->count and the pages that received data are marked
+ * dirty.  The page references are then dropped, the request is completed
+ * if this was the last outstanding I/O, and the read data is freed.
+ */
+static void nfs_direct_read_release(void *calldata)
+{
+
+ struct nfs_read_data *data = calldata;
+ /* data->req carries the nfs_direct_req pointer, stored with a cast at submission */
+ struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+ int status = data->task.tk_status;
+
+ spin_lock(&dreq->lock);
+ if (unlikely(status < 0)) {
+ dreq->error = status;
+ spin_unlock(&dreq->lock);
+ } else {
+ dreq->count += data->res.count;
+ spin_unlock(&dreq->lock);
+ nfs_direct_dirty_pages(data->pagevec,
+ data->args.pgbase,
+ data->res.count);
+ }
+ nfs_direct_release_pages(data->pagevec, data->npages);
+
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+ nfs_readdata_free(data);
+}
+
+/* RPC callbacks for direct READs; the prepare hook is only built with NFSv4.1. */
+static const struct rpc_call_ops nfs_read_direct_ops = {
+#if defined(CONFIG_NFS_V4_1)
+ .rpc_call_prepare = nfs_read_prepare,
+#endif /* CONFIG_NFS_V4_1 */
+ .rpc_call_done = nfs_direct_read_result,
+ .rpc_release = nfs_direct_read_release,
+};
+
+/*
+ * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+ * operation. If nfs_readdata_alloc() or get_user_pages() fails,
+ * bail and stop sending more reads. Read length accounting is
+ * done by nfs_direct_read_release() as each RPC is released.
+ *
+ * Returns the number of bytes for which reads were started.  If none
+ * were started, returns the negative error that stopped the loop, or
+ * -EFAULT when no negative error is available.
+ */
+static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+ const struct iovec *iov,
+ loff_t pos)
+{
+ struct nfs_open_context *ctx = dreq->ctx;
+ struct inode *inode = ctx->dentry->d_inode;
+ unsigned long user_addr = (unsigned long)iov->iov_base;
+ size_t count = iov->iov_len;
+ size_t rsize = NFS_SERVER(inode)->rsize;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_cred = ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs_read_direct_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+ unsigned int pgbase;
+ int result;
+ ssize_t started = 0;
+
+ do {
+ struct nfs_read_data *data;
+ size_t bytes;
+
+ /* Offset of the user buffer within its first page */
+ pgbase = user_addr & ~PAGE_MASK;
+ bytes = min(rsize,count);
+
+ result = -ENOMEM;
+ data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
+ if (unlikely(!data))
+ break;
+
+ /* Pin the user pages; presumably the '1' asks for write access, since a read stores into them -- confirm against get_user_pages() */
+ down_read(&current->mm->mmap_sem);
+ result = get_user_pages(current, current->mm, user_addr,
+ data->npages, 1, 0, data->pagevec, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (result < 0) {
+ nfs_readdata_free(data);
+ break;
+ }
+ /* Fewer pages pinned than asked for: shrink this READ to fit them */
+ if ((unsigned)result < data->npages) {
+ bytes = result * PAGE_SIZE;
+ if (bytes <= pgbase) {
+ nfs_direct_release_pages(data->pagevec, result);
+ nfs_readdata_free(data);
+ break;
+ }
+ bytes -= pgbase;
+ data->npages = result;
+ }
+
+ /* One io_count reference per RPC; dropped in nfs_direct_read_release() */
+ get_dreq(dreq);
+
+ /* The dreq pointer is smuggled through data->req with a cast */
+ data->req = (struct nfs_page *) dreq;
+ data->inode = inode;
+ data->cred = msg.rpc_cred;
+ data->args.fh = NFS_FH(inode);
+ data->args.context = ctx;
+ data->args.lock_context = dreq->l_ctx;
+ data->args.offset = pos;
+ data->args.pgbase = pgbase;
+ data->args.pages = data->pagevec;
+ data->args.count = bytes;
+ data->res.fattr = &data->fattr;
+ data->res.eof = 0;
+ data->res.count = bytes;
+ nfs_fattr_init(&data->fattr);
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+
+ task_setup_data.task = &data->task;
+ task_setup_data.callback_data = data;
+ NFS_PROTO(inode)->read_setup(data, &msg);
+
+ /*
+ * NOTE(review): on failure 'result' still holds the non-negative
+ * page count, so -EFAULT is reported if nothing was started.
+ * Cleanup of 'data' presumably happens via the release callback
+ * inside rpc_run_task() -- confirm.
+ */
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ break;
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u initiated direct read call "
+ "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ bytes,
+ (unsigned long long)data->args.offset);
+
+ started += bytes;
+ user_addr += bytes;
+ pos += bytes;
+ /* FIXME: Remove this unnecessary math from final patch */
+ pgbase += bytes;
+ pgbase &= ~PAGE_MASK;
+ BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
+
+ count -= bytes;
+ } while (count != 0);
+
+ if (started)
+ return started;
+ return result < 0 ? (ssize_t) result : -EFAULT;
+}
+
+/*
+ * Start READs for every segment of the iovec, stopping at the first
+ * segment that fails or is only partly scheduled.
+ *
+ * An extra io_count reference is held across the loop so the request
+ * cannot complete while segments are still being submitted.
+ *
+ * Returns 0 if at least some I/O was started.  If nothing was started,
+ * drops one kref reference on @dreq (completion will never run) and
+ * returns the error from the segment helper, or -EIO/-EINVAL if it did
+ * not supply a negative one.
+ */
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+ const struct iovec *iov,
+ unsigned long nr_segs,
+ loff_t pos)
+{
+ ssize_t result = -EINVAL;
+ size_t requested_bytes = 0;
+ unsigned long seg;
+
+ get_dreq(dreq);
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *vec = &iov[seg];
+ result = nfs_direct_read_schedule_segment(dreq, vec, pos);
+ if (result < 0)
+ break;
+ requested_bytes += result;
+ /* Short segment: don't continue past a gap in the file range */
+ if ((size_t)result < vec->iov_len)
+ break;
+ pos += vec->iov_len;
+ }
+
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0) {
+ nfs_direct_req_release(dreq);
+ return result < 0 ? result : -EIO;
+ }
+
+ /* Drop the submission reference; complete now if all RPCs already finished */
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+ return 0;
+}
+
+/*
+ * Set up and run one direct read.
+ *
+ * Allocates the request, attaches the file's open and lock contexts,
+ * schedules READs for the whole iovec and, for synchronous callers, waits
+ * for them to finish.
+ *
+ * Returns the number of bytes read for a synchronous request,
+ * -EIOCBQUEUED for an async one that was queued, or a negative errno
+ * (-ENOMEM if the request or its lock context cannot be allocated).
+ */
+static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	ssize_t result = -ENOMEM;
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct nfs_direct_req *dreq;
+
+	dreq = nfs_direct_req_alloc();
+	if (dreq == NULL)
+		goto out;
+
+	dreq->inode = inode;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL) {
+		/*
+		 * nfs_direct_req_alloc() returns the request with two
+		 * references.  No I/O will be scheduled, so the reference
+		 * normally dropped at completion has to be dropped here;
+		 * otherwise the request and its open context leak.
+		 */
+		nfs_direct_req_release(dreq);
+		goto out_release;
+	}
+	/* Only async requests keep the iocb: a sync iocb lives on the caller's stack */
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
+	if (!result)
+		result = nfs_direct_wait(dreq);
+out_release:
+	nfs_direct_req_release(dreq);
+out:
+	return result;
+}
+
+/*
+ * Tear down every write queued on the request's rewrite_list: unlink it,
+ * drop its page references and free the write data.
+ */
+static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
+{
+	struct list_head *queue = &dreq->rewrite_list;
+
+	for (;;) {
+		struct nfs_write_data *wdata;
+
+		if (list_empty(queue))
+			break;
+		wdata = list_entry(queue->next, struct nfs_write_data, pages);
+		list_del(&wdata->pages);
+		nfs_direct_release_pages(wdata->pagevec, wdata->npages);
+		nfs_writedata_free(wdata);
+	}
+}
+
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+/*
+ * Resend every write saved on the request's rewrite_list, this time as
+ * stable (NFS_FILE_SYNC) writes.  Reached from nfs_direct_write_complete()
+ * when flags was NFS_ODIRECT_RESCHED_WRITES.
+ *
+ * The byte count is reset to zero so that it is rebuilt from the resent
+ * writes.  An extra io_count reference is held across the loop and
+ * dropped at the end, completing the request if all resent writes have
+ * already finished.
+ */
+static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+{
+ struct inode *inode = dreq->inode;
+ struct list_head *p;
+ struct nfs_write_data *data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_cred = dreq->ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs_write_direct_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+
+ dreq->count = 0;
+ get_dreq(dreq);
+
+ list_for_each(p, &dreq->rewrite_list) {
+ data = list_entry(p, struct nfs_write_data, pages);
+
+ get_dreq(dreq);
+
+ /* Use stable writes */
+ data->args.stable = NFS_FILE_SYNC;
+
+ /*
+ * Reset data->res.
+ */
+ nfs_fattr_init(&data->fattr);
+ data->res.count = data->args.count;
+ memset(&data->verf, 0, sizeof(data->verf));
+
+ /*
+ * Reuse data->task; data->args should not have changed
+ * since the original request was sent.
+ */
+ task_setup_data.task = &data->task;
+ task_setup_data.callback_data = data;
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ NFS_PROTO(inode)->write_setup(data, &msg);
+
+ /*
+ * We're called via an RPC callback, so BKL is already held.
+ * NOTE(review): this remark looks stale -- nothing in this
+ * file takes or relies on the BKL; confirm and drop it.
+ */
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task))
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+ }
+
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq, inode);
+}
+
+/*
+ * rpc_call_done callback for the COMMIT of a direct write; defers to the
+ * NFS version's commit_done handler.
+ */
+static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
+{
+ struct nfs_write_data *data = calldata;
+
+ /* Call the NFS version-specific code */
+ NFS_PROTO(data->inode)->commit_done(task, data);
+}
+
+/*
+ * rpc_release callback for the COMMIT of a direct write.
+ *
+ * If the COMMIT failed, or its verifier differs from the one saved from
+ * the unstable writes, the request is flagged
+ * NFS_ODIRECT_RESCHED_WRITES so the writes are sent again.  Write
+ * completion processing then runs and the commit data is freed.
+ */
+static void nfs_direct_commit_release(void *calldata)
+{
+ struct nfs_write_data *data = calldata;
+ struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+ int status = data->task.tk_status;
+
+ if (status < 0) {
+ dprintk("NFS: %5u commit failed with error %d.\n",
+ data->task.tk_pid, status);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
+ dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ }
+
+ dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
+ nfs_direct_write_complete(dreq, data->inode);
+ nfs_commit_free(data);
+}
+
+/* RPC callbacks for the COMMIT of a direct write; the prepare hook is only built with NFSv4.1. */
+static const struct rpc_call_ops nfs_commit_direct_ops = {
+#if defined(CONFIG_NFS_V4_1)
+ .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
+ .rpc_call_done = nfs_direct_commit_result,
+ .rpc_release = nfs_direct_commit_release,
+};
+
+/*
+ * Send an asynchronous COMMIT for the request, using the commit data that
+ * was pre-allocated in dreq->commit_data.  Offset and count are both set
+ * to zero.  Ownership of the commit data passes to the RPC task, whose
+ * release callback frees it, so dreq->commit_data is cleared before the
+ * task is started.
+ */
+static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+{
+ struct nfs_write_data *data = dreq->commit_data;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = dreq->ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .task = &data->task,
+ .rpc_client = NFS_CLIENT(dreq->inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs_commit_direct_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+
+ data->inode = dreq->inode;
+ data->cred = msg.rpc_cred;
+
+ data->args.fh = NFS_FH(data->inode);
+ data->args.offset = 0;
+ data->args.count = 0;
+ data->args.context = dreq->ctx;
+ data->args.lock_context = dreq->l_ctx;
+ data->res.count = 0;
+ data->res.fattr = &data->fattr;
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+
+ NFS_PROTO(data->inode)->commit_setup(data, &msg);
+
+ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+ dreq->commit_data = NULL;
+
+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task))
+ rpc_put_task(task);
+}
+
+/*
+ * Decide what happens once all outstanding writes (or the commit) of a
+ * direct write request have finished.  The pending flags are consumed and
+ * reset, then:
+ *  - NFS_ODIRECT_DO_COMMIT: send a COMMIT;
+ *  - NFS_ODIRECT_RESCHED_WRITES: resend the saved writes;
+ *  - anything else: free unused commit data and the saved writes, zap the
+ *    inode's mapping and complete the request.
+ */
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+	const int pending = dreq->flags;
+
+	dreq->flags = 0;
+
+	if (pending == NFS_ODIRECT_DO_COMMIT) {
+		nfs_direct_commit_schedule(dreq);
+		return;
+	}
+	if (pending == NFS_ODIRECT_RESCHED_WRITES) {
+		nfs_direct_write_reschedule(dreq);
+		return;
+	}
+
+	/* Nothing further to send: tear down and report completion */
+	if (dreq->commit_data != NULL)
+		nfs_commit_free(dreq->commit_data);
+	nfs_direct_free_writedata(dreq);
+	nfs_zap_mapping(inode, inode->i_mapping);
+	nfs_direct_complete(dreq);
+}
+
+/*
+ * Pre-allocate the commit data for a direct write and point it back at
+ * the request.  On allocation failure dreq->commit_data is left NULL; the
+ * caller in this file then falls back to stable writes.
+ */
+static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+{
+ dreq->commit_data = nfs_commitdata_alloc();
+ if (dreq->commit_data != NULL)
+ dreq->commit_data->req = (struct nfs_page *) dreq;
+}
+#else
+/* Variant built without NFSv3/v4 support: no commit data is ever allocated. */
+static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+{
+ dreq->commit_data = NULL;
+}
+
+/*
+ * Variant built without NFSv3/v4 support: there is no commit or resend
+ * step, so just free the saved writes, zap the inode's mapping and
+ * complete the request.
+ */
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+ nfs_direct_free_writedata(dreq);
+ nfs_zap_mapping(inode, inode->i_mapping);
+ nfs_direct_complete(dreq);
+}
+#endif
+
+/* rpc_call_done callback for direct WRITEs; only forwards to nfs_writeback_done(). */
+static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
+{
+ struct nfs_write_data *data = calldata;
+
+ nfs_writeback_done(task, data);
+}
+
+/*
+ * rpc_release callback for one direct WRITE.
+ *
+ * NB: Return the value of the first error return code. Subsequent
+ * errors after the first one are ignored.
+ *
+ * NOTE(review): as written, every failing RPC overwrites dreq->error, so
+ * the most recent failure is the one kept, not the first -- confirm
+ * which is intended.
+ *
+ * Under dreq->lock: a failure clears the pending flags (no commit) and
+ * records the error.  Once an error is recorded, no further bytes are
+ * counted.  Otherwise the bytes written are added to dreq->count and, if
+ * the reply was not NFS_FILE_SYNC, the write verifier is either saved
+ * (first unstable reply, a COMMIT is then requested) or compared with the
+ * saved one (a mismatch requests a resend of all writes).
+ *
+ * The write data and its pages are not freed here; they stay on
+ * rewrite_list until nfs_direct_free_writedata() runs.
+ */
+static void nfs_direct_write_release(void *calldata)
+{
+ struct nfs_write_data *data = calldata;
+ struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+ int status = data->task.tk_status;
+
+ spin_lock(&dreq->lock);
+
+ if (unlikely(status < 0)) {
+ /* An error has occurred, so we should not commit */
+ dreq->flags = 0;
+ dreq->error = status;
+ }
+ if (unlikely(dreq->error != 0))
+ goto out_unlock;
+
+ dreq->count += data->res.count;
+
+ if (data->res.verf->committed != NFS_FILE_SYNC) {
+ switch (dreq->flags) {
+ case 0:
+ memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+ break;
+ case NFS_ODIRECT_DO_COMMIT:
+ if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
+ dprintk("NFS: %5u write verify failed\n", data->task.tk_pid);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ }
+ }
+ }
+out_unlock:
+ spin_unlock(&dreq->lock);
+
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq, data->inode);
+}
+
+/* RPC callbacks for direct WRITEs; the prepare hook is only built with NFSv4.1. */
+static const struct rpc_call_ops nfs_write_direct_ops = {
+#if defined(CONFIG_NFS_V4_1)
+ .rpc_call_prepare = nfs_write_prepare,
+#endif /* CONFIG_NFS_V4_1 */
+ .rpc_call_done = nfs_direct_write_result,
+ .rpc_release = nfs_direct_write_release,
+};
+
+/*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation. If nfs_writedata_alloc() or get_user_pages() fails,
+ * bail and stop sending more writes. Write length accounting is
+ * done by nfs_direct_write_release() as each RPC is released.
+ *
+ * @sync is the stability level placed in each WRITE's arguments.
+ *
+ * Returns the number of bytes for which writes were started.  If none
+ * were started, returns the negative error that stopped the loop, or
+ * -EFAULT when no negative error is available.
+ */
+static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+ const struct iovec *iov,
+ loff_t pos, int sync)
+{
+ struct nfs_open_context *ctx = dreq->ctx;
+ struct inode *inode = ctx->dentry->d_inode;
+ unsigned long user_addr = (unsigned long)iov->iov_base;
+ size_t count = iov->iov_len;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_cred = ctx->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = NFS_CLIENT(inode),
+ .rpc_message = &msg,
+ .callback_ops = &nfs_write_direct_ops,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ };
+ size_t wsize = NFS_SERVER(inode)->wsize;
+ unsigned int pgbase;
+ int result;
+ ssize_t started = 0;
+
+ do {
+ struct nfs_write_data *data;
+ size_t bytes;
+
+ /* Offset of the user buffer within its first page */
+ pgbase = user_addr & ~PAGE_MASK;
+ bytes = min(wsize,count);
+
+ result = -ENOMEM;
+ data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
+ if (unlikely(!data))
+ break;
+
+ /* Pin the user pages; presumably the first '0' means read-only access suffices, since a write only reads them -- confirm against get_user_pages() */
+ down_read(&current->mm->mmap_sem);
+ result = get_user_pages(current, current->mm, user_addr,
+ data->npages, 0, 0, data->pagevec, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (result < 0) {
+ nfs_writedata_free(data);
+ break;
+ }
+ /* Fewer pages pinned than asked for: shrink this WRITE to fit them */
+ if ((unsigned)result < data->npages) {
+ bytes = result * PAGE_SIZE;
+ if (bytes <= pgbase) {
+ nfs_direct_release_pages(data->pagevec, result);
+ nfs_writedata_free(data);
+ break;
+ }
+ bytes -= pgbase;
+ data->npages = result;
+ }
+
+ /* One io_count reference per RPC; dropped in nfs_direct_write_release() */
+ get_dreq(dreq);
+
+ /* Keep the write on rewrite_list so it can be resent if verification fails */
+ list_move_tail(&data->pages, &dreq->rewrite_list);
+
+ /* The dreq pointer is smuggled through data->req with a cast */
+ data->req = (struct nfs_page *) dreq;
+ data->inode = inode;
+ data->cred = msg.rpc_cred;
+ data->args.fh = NFS_FH(inode);
+ data->args.context = ctx;
+ data->args.lock_context = dreq->l_ctx;
+ data->args.offset = pos;
+ data->args.pgbase = pgbase;
+ data->args.pages = data->pagevec;
+ data->args.count = bytes;
+ data->args.stable = sync;
+ data->res.fattr = &data->fattr;
+ data->res.count = bytes;
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+
+ task_setup_data.task = &data->task;
+ task_setup_data.callback_data = data;
+ msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ NFS_PROTO(inode)->write_setup(data, &msg);
+
+ /*
+ * NOTE(review): on failure 'result' still holds the non-negative
+ * page count, so -EFAULT is reported if nothing was started.
+ */
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ break;
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u initiated direct write call "
+ "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ bytes,
+ (unsigned long long)data->args.offset);
+
+ started += bytes;
+ user_addr += bytes;
+ pos += bytes;
+
+ /* FIXME: Remove this useless math from the final patch */
+ pgbase += bytes;
+ pgbase &= ~PAGE_MASK;
+ BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
+
+ count -= bytes;
+ } while (count != 0);
+
+ if (started)
+ return started;
+ return result < 0 ? (ssize_t) result : -EFAULT;
+}
+
+/*
+ * Start WRITEs for every segment of the iovec, stopping at the first
+ * segment that fails or is only partly scheduled.
+ *
+ * An extra io_count reference is held across the loop so the request
+ * cannot complete while segments are still being submitted.
+ *
+ * Returns 0 if at least some I/O was started.  If nothing was started,
+ * drops one kref reference on @dreq (completion will never run) and
+ * returns the segment helper's error, or -EIO if there is none.
+ *
+ * NOTE(review): dreq->commit_data is not freed on the nothing-started
+ * path; check whether the caller's pre-allocated commit data leaks here.
+ */
+static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
+ const struct iovec *iov,
+ unsigned long nr_segs,
+ loff_t pos, int sync)
+{
+ ssize_t result = 0;
+ size_t requested_bytes = 0;
+ unsigned long seg;
+
+ get_dreq(dreq);
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *vec = &iov[seg];
+ result = nfs_direct_write_schedule_segment(dreq, vec,
+ pos, sync);
+ if (result < 0)
+ break;
+ requested_bytes += result;
+ /* Short segment: don't continue past a gap in the file range */
+ if ((size_t)result < vec->iov_len)
+ break;
+ pos += vec->iov_len;
+ }
+
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0) {
+ nfs_direct_req_release(dreq);
+ return result < 0 ? result : -EIO;
+ }
+
+ /* Drop the submission reference; run completion now if all RPCs already finished */
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq, dreq->inode);
+ return 0;
+}
+
+/*
+ * Set up a direct write request, schedule it and wait for the result.
+ *
+ * Writes are sent NFS_UNSTABLE unless no commit data could be allocated
+ * or count does not exceed the server's wsize, in which case
+ * NFS_FILE_SYNC is used.
+ *
+ * Returns the result of nfs_direct_wait() when scheduling succeeded,
+ * -ENOMEM if the request or its lock context could not be obtained, or
+ * the error from nfs_direct_write_schedule_iovec().
+ */
+static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos,
+ size_t count)
+{
+ ssize_t result = -ENOMEM;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct nfs_direct_req *dreq;
+ size_t wsize = NFS_SERVER(inode)->wsize;
+ int sync = NFS_UNSTABLE;
+
+ dreq = nfs_direct_req_alloc();
+ if (!dreq)
+ goto out;
+ nfs_alloc_commit_data(dreq);
+
+ if (dreq->commit_data == NULL || count <= wsize)
+ sync = NFS_FILE_SYNC;
+
+ dreq->inode = inode;
+ dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+ dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+ if (dreq->l_ctx == NULL)
+ goto out_release;
+ /* The iocb is recorded only for non-synchronous kiocbs. */
+ if (!is_sync_kiocb(iocb))
+ dreq->iocb = iocb;
+
+ result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
+ if (!result)
+ result = nfs_direct_wait(dreq);
+out_release:
+ nfs_direct_req_release(dreq);
+out:
+ return result;
+}
+
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iov: vector of user buffers into which to read data
+ * @nr_segs: size of iov vector
+ * @pos: byte offset in file where reading starts
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file. For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change. Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ *
+ * Returns 0 for a zero-length request, the error from nfs_sync_mapping()
+ * if that fails, and otherwise the result of nfs_direct_read().  A
+ * positive result also advances iocb->ki_pos.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ ssize_t retval = -EINVAL;
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ size_t count;
+
+ count = iov_length(iov, nr_segs);
+ /* The requested byte count is added to the stats before any checks. */
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+ dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ count, (long long) pos);
+
+ retval = 0;
+ if (!count)
+ goto out;
+
+ /* nfs_sync_mapping() must succeed before the direct read is issued. */
+ retval = nfs_sync_mapping(mapping);
+ if (retval)
+ goto out;
+
+ task_io_account_read(count);
+
+ retval = nfs_direct_read(iocb, iov, nr_segs, pos);
+ if (retval > 0)
+ iocb->ki_pos = pos + retval;
+
+out:
+ return retval;
+}
+
+/**
+ * nfs_file_direct_write - file direct write operation for NFS files
+ * @iocb: target I/O control block
+ * @iov: vector of user buffers from which to write data
+ * @nr_segs: size of iov vector
+ * @pos: byte offset in file where writing starts
+ *
+ * We use this function for direct writes instead of calling
+ * generic_file_aio_write() in order to avoid taking the inode
+ * semaphore and updating the i_size. The NFS server will set
+ * the new i_size and this client must read the updated size
+ * back into its cache. We let the server do generic write
+ * parameter checking and report problems.
+ *
+ * We eliminate local atime updates, see direct read above.
+ *
+ * We avoid unnecessary page cache invalidations for normal cached
+ * readers of this file.
+ *
+ * Note that O_APPEND is not supported for NFS direct writes, as there
+ * is no atomic O_APPEND write facility in the NFS protocol.
+ *
+ * Returns the error from generic_write_checks() or nfs_sync_mapping()
+ * if either fails, -EINVAL if the checked count is negative when viewed
+ * as ssize_t, 0 for a zero-length request, and otherwise the result of
+ * nfs_direct_write().  A positive result also advances iocb->ki_pos.
+ */
+ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ ssize_t retval = -EINVAL;
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ size_t count;
+
+ count = iov_length(iov, nr_segs);
+ /* The requested byte count is added to the stats before any checks. */
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
+
+ dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ count, (long long) pos);
+
+ /* generic_write_checks() is given pos and count by address and may adjust them. */
+ retval = generic_write_checks(file, &pos, &count, 0);
+ if (retval)
+ goto out;
+
+ retval = -EINVAL;
+ if ((ssize_t) count < 0)
+ goto out;
+ retval = 0;
+ if (!count)
+ goto out;
+
+ /* nfs_sync_mapping() must succeed before the direct write is issued. */
+ retval = nfs_sync_mapping(mapping);
+ if (retval)
+ goto out;
+
+ task_io_account_write(count);
+
+ retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
+
+ if (retval > 0)
+ iocb->ki_pos = pos + retval;
+
+out:
+ return retval;
+}
+
+/**
+ * nfs_init_directcache - create a slab cache for nfs_direct_req structures
+ *
+ * The cache is stored in nfs_direct_cachep.
+ *
+ * Returns 0 on success, or -ENOMEM if the cache could not be created.
+ */
+int __init nfs_init_directcache(void)
+{
+ nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
+ sizeof(struct nfs_direct_req),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ NULL);
+ if (nfs_direct_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
+ *
+ * Destroys the cache referenced by nfs_direct_cachep.
+ */
+void nfs_destroy_directcache(void)
+{
+ kmem_cache_destroy(nfs_direct_cachep);
+}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 00000000000..a6e711ad130
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,372 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/dns_resolver.h>
+
+/*
+ * Resolve a hostname through dns_query() (CONFIG_NFS_USE_KERNEL_DNS build).
+ *
+ * When dns_query() yields a positive length, the returned address string
+ * is converted into @sa with rpc_pton() and that conversion's result is
+ * returned.  Otherwise -ESRCH is returned.  The string handed back by
+ * dns_query() is freed in either case.
+ */
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen)
+{
+ ssize_t ret;
+ char *ip_addr = NULL;
+ int ip_len;
+
+ ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+ if (ip_len > 0)
+ ret = rpc_pton(ip_addr, ip_len, sa, salen);
+ else
+ ret = -ESRCH;
+ /* ip_addr is still NULL if dns_query() never set it. */
+ kfree(ip_addr);
+ return ret;
+}
+
+#else
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+
+#include "dns_resolve.h"
+#include "cache_lib.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
+
+/*
+ * One entry of the hostname -> address cache.
+ */
+struct nfs_dns_ent {
+ struct cache_head h; /* embedded cache header used with container_of() */
+
+ char *hostname; /* lookup key; kstrndup()'d copy in cached entries */
+ size_t namelen; /* number of bytes of hostname that are compared */
+
+ struct sockaddr_storage addr; /* resolved address */
+ size_t addrlen; /* bytes of addr in use; 0 when no address is set */
+};
+
+
+/*
+ * Copy the address (key->addrlen bytes) and its length from the entry
+ * containing ckey into the entry containing cnew.  Installed as the
+ * .update callback of nfs_dns_resolve.
+ */
+static void nfs_dns_ent_update(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+}
+
+/*
+ * Initialise the entry containing cnew from the entry containing ckey.
+ * Installed as the .init callback of nfs_dns_resolve.
+ *
+ * Any previous hostname is freed and replaced by a private copy of the
+ * key's hostname.  If that copy cannot be allocated the entry is left
+ * with zero name and address lengths.
+ */
+static void nfs_dns_ent_init(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ kfree(new->hostname);
+ new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+ if (new->hostname) {
+ new->namelen = key->namelen;
+ /* Also copy the address fields from the key. */
+ nfs_dns_ent_update(cnew, ckey);
+ } else {
+ new->namelen = 0;
+ new->addrlen = 0;
+ }
+}
+
+/*
+ * kref release function (the .cache_put callback of nfs_dns_resolve):
+ * frees the hostname copy and then the entry that embeds @ref.
+ */
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ kfree(item->hostname);
+ kfree(item);
+}
+
+/*
+ * Allocate an empty cache entry (the .alloc callback of nfs_dns_resolve).
+ *
+ * Only hostname, namelen and addrlen are initialised here; the embedded
+ * cache_head and the address storage are left as returned by kmalloc().
+ *
+ * Returns a pointer to the embedded cache_head, or NULL if the
+ * allocation fails.
+ */
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+	struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+	if (item == NULL)
+		return NULL;
+
+	item->hostname = NULL;
+	item->namelen = 0;
+	item->addrlen = 0;
+	return &item->h;
+}
+
+/*
+ * Hash an entry's hostname into NFS_DNS_HASHBITS bits using hash_str().
+ * key->hostname must be NUL-terminated since no length is passed.
+ */
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+ return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+/*
+ * Format the upcall request for an entry: the hostname followed by a
+ * newline.  Passed to sunrpc_cache_pipe_upcall() by nfs_dns_upcall().
+ */
+static void nfs_dns_request(struct cache_detail *cd,
+ struct cache_head *ch,
+ char **bpp, int *blen)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ qword_add(bpp, blen, key->hostname);
+ /*
+ * Overwrite the last byte emitted by qword_add() with a newline.
+ * NOTE(review): presumably that byte is the trailing field
+ * separator -- confirm against qword_add().
+ */
+ (*bpp)[-1] = '\n';
+}
+
+/*
+ * Request resolution of an entry's hostname (the .cache_upcall callback).
+ *
+ * nfs_cache_upcall() is tried first; if it returns non-zero the request
+ * falls back to sunrpc_cache_pipe_upcall().  Returns the result of
+ * whichever call was made last.
+ */
+static int nfs_dns_upcall(struct cache_detail *cd,
+ struct cache_head *ch)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+ int ret;
+
+ ret = nfs_cache_upcall(cd, key->hostname);
+ if (ret)
+ ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+ return ret;
+}
+
+/*
+ * Compare two cache entries by hostname (the .match callback).
+ *
+ * Returns non-zero only when both names have the same non-zero length
+ * and identical bytes; an entry with an empty name never matches.
+ */
+static int nfs_dns_match(struct cache_head *ca,
+		struct cache_head *cb)
+{
+	struct nfs_dns_ent *lhs = container_of(ca, struct nfs_dns_ent, h);
+	struct nfs_dns_ent *rhs = container_of(cb, struct nfs_dns_ent, h);
+
+	if (lhs->namelen != 0 && lhs->namelen == rhs->namelen)
+		return memcmp(lhs->hostname, rhs->hostname, lhs->namelen) == 0;
+	return 0;
+}
+
+/*
+ * seq_file output for the cache (the .cache_show callback).
+ *
+ * With h == NULL a column header line is printed.  Otherwise one line is
+ * printed holding the address (or "<none>" for CACHE_NEGATIVE entries),
+ * the hostname and the remaining time to live, clamped at 0.
+ * Always returns 0.
+ */
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct nfs_dns_ent *item;
+ long ttl;
+
+ if (h == NULL) {
+ seq_puts(m, "# ip address hostname ttl\n");
+ return 0;
+ }
+ item = container_of(h, struct nfs_dns_ent, h);
+ ttl = item->h.expiry_time - seconds_since_boot();
+ if (ttl < 0)
+ ttl = 0;
+
+ if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+ /* Sized for an IPv6 address string plus a scope id and NUL. */
+ char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+ rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+ seq_printf(m, "%15s ", buf);
+ } else
+ seq_puts(m, "<none> ");
+ seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+ return 0;
+}
+
+/*
+ * Typed wrapper around sunrpc_cache_lookup(): looks up key in cd using
+ * the hostname hash and returns the enclosing nfs_dns_ent, or NULL if
+ * the lookup returns NULL.
+ */
+static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_lookup(cd,
+ &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+/*
+ * Typed wrapper around sunrpc_cache_update(): passes new and key to the
+ * cache with the hash of key's hostname and returns the enclosing
+ * nfs_dns_ent of the result, or NULL if the update returns NULL.
+ */
+static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+ struct nfs_dns_ent *new,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_update(cd,
+ &new->h, &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+/*
+ * Parse one downcall line and store the result in the cache (the
+ * .cache_parse callback).
+ *
+ * The line must end in a newline and holds, in order: an address, a
+ * hostname and an expiry value read by get_expiry().  An address that
+ * rpc_pton() cannot convert (addrlen 0) is stored as a CACHE_NEGATIVE
+ * entry.
+ *
+ * Returns 0 on success, -EINVAL for a malformed line (including an
+ * empty buffer or a zero expiry), or -ENOMEM if the cache lookup or
+ * update fails.
+ */
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+	struct nfs_dns_ent key, *item;
+	unsigned long ttl;
+	ssize_t len;
+	int ret = -EINVAL;
+
+	/* An empty buffer has no final byte to inspect; reject it first. */
+	if (buflen <= 0 || buf[buflen-1] != '\n')
+		goto out;
+	buf[buflen-1] = '\0';
+
+	/* First word: the address. */
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+	key.addrlen = rpc_pton(buf1, len,
+			(struct sockaddr *)&key.addr,
+			sizeof(key.addr));
+
+	/* Second word: the hostname; buf1 is reused as its storage. */
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+
+	key.hostname = buf1;
+	key.namelen = len;
+	memset(&key.h, 0, sizeof(key.h));
+
+	ttl = get_expiry(&buf);
+	if (ttl == 0)
+		goto out;
+	key.h.expiry_time = ttl + seconds_since_boot();
+
+	ret = -ENOMEM;
+	item = nfs_dns_lookup(cd, &key);
+	if (item == NULL)
+		goto out;
+
+	if (key.addrlen == 0)
+		set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+	item = nfs_dns_update(cd, &key, item);
+	if (item == NULL)
+		goto out;
+
+	ret = 0;
+	cache_put(&item->h, cd);
+out:
+	return ret;
+}
+
+/*
+ * Cache descriptor for the "dns_resolve" cache: binds the hash table and
+ * the nfs_dns_* callbacks defined above.
+ */
+static struct cache_detail nfs_dns_resolve = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .hash_table = nfs_dns_table,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_update,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+/*
+ * Look up key and validate the entry with cache_check(), passing
+ * dreq->req as the deferral request.
+ *
+ * Returns -ENOMEM if the lookup yields no entry, otherwise the result of
+ * cache_check().  *item is NULL on any non-zero return.
+ * NOTE(review): no cache_put() is done here when cache_check() fails;
+ * presumably cache_check() drops the reference itself -- confirm.
+ */
+static int do_cache_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item,
+ struct nfs_cache_defer_req *dreq)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (*item) {
+ ret = cache_check(cd, &(*item)->h, &dreq->req);
+ if (ret)
+ *item = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Look up key and inspect the entry's state directly, without calling
+ * cache_check().
+ *
+ * Returns 0 with *item set (reference still held) for a usable entry.
+ * Otherwise *item is NULL and the return value is:
+ *   -ENOMEM     the lookup yielded no entry
+ *   -ETIMEDOUT  the entry is not CACHE_VALID, has expired, or was last
+ *               refreshed before the cache's flush_time
+ *   -ENOENT     the entry is CACHE_NEGATIVE
+ */
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (!*item)
+ goto out_err;
+ ret = -ETIMEDOUT;
+ if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+ || (*item)->h.expiry_time < seconds_since_boot()
+ || cd->flush_time > (*item)->h.last_refresh)
+ goto out_put;
+ ret = -ENOENT;
+ if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+ goto out_put;
+ return 0;
+out_put:
+ /* Drop the reference taken by the lookup. */
+ cache_put(&(*item)->h, cd);
+out_err:
+ *item = NULL;
+ return ret;
+}
+
+/*
+ * Look up key, waiting for an upcall to complete if necessary.
+ *
+ * If do_cache_lookup() returns -EAGAIN, wait with
+ * nfs_cache_wait_for_upcall() and, if that returns 0, retry once with
+ * do_cache_lookup_nowait().
+ *
+ * Returns -ENOMEM if the deferral request cannot be allocated, otherwise
+ * the result of the last lookup or wait step performed.
+ */
+static int do_cache_lookup_wait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ struct nfs_cache_defer_req *dreq;
+ int ret = -ENOMEM;
+
+ dreq = nfs_cache_defer_req_alloc();
+ if (!dreq)
+ goto out;
+ ret = do_cache_lookup(cd, key, item, dreq);
+ if (ret == -EAGAIN) {
+ ret = nfs_cache_wait_for_upcall(dreq);
+ if (!ret)
+ ret = do_cache_lookup_nowait(cd, key, item);
+ }
+ nfs_cache_defer_req_put(dreq);
+out:
+ return ret;
+}
+
+/*
+ * Resolve a hostname through the dns_resolve cache (build without
+ * CONFIG_NFS_USE_KERNEL_DNS).
+ *
+ * On success the cached address is copied into @sa and its length is
+ * returned.  Returns -EOVERFLOW if @salen is smaller than the cached
+ * address, -ESRCH if the lookup reports -ENOENT, and otherwise the
+ * error from do_cache_lookup_wait().
+ */
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen)
+{
+ struct nfs_dns_ent key = {
+ .hostname = name,
+ .namelen = namelen,
+ };
+ struct nfs_dns_ent *item = NULL;
+ ssize_t ret;
+
+ ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+ if (ret == 0) {
+ if (salen >= item->addrlen) {
+ memcpy(sa, &item->addr, item->addrlen);
+ ret = item->addrlen;
+ } else
+ ret = -EOVERFLOW;
+ /* Drop the reference held on the entry after a successful lookup. */
+ cache_put(&item->h, &nfs_dns_resolve);
+ } else if (ret == -ENOENT)
+ ret = -ESRCH;
+ return ret;
+}
+
+/* Register the dns_resolve cache; returns the result of nfs_cache_register(). */
+int nfs_dns_resolver_init(void)
+{
+ return nfs_cache_register(&nfs_dns_resolve);
+}
+
+/* Unregister the dns_resolve cache registered by nfs_dns_resolver_init(). */
+void nfs_dns_resolver_destroy(void)
+{
+ nfs_cache_unregister(&nfs_dns_resolve);
+}
+
+#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 00000000000..199bb5543a9
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,26 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+/* With CONFIG_NFS_USE_KERNEL_DNS the init/destroy hooks are no-op stubs. */
+static inline int nfs_dns_resolver_init(void)
+{
+ return 0;
+}
+
+static inline void nfs_dns_resolver_destroy(void)
+{}
+#else
+/* Otherwise they are implemented out of line (see dns_resolve.c). */
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+#endif
+
+/*
+ * Resolve the hostname in name/namelen into sa (at most salen bytes).
+ * Declared for both configurations.
+ */
+extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
new file mode 100644
index 00000000000..c43a452f7da
--- /dev/null
+++ b/fs/nfs/file.c
@@ -0,0 +1,899 @@
+/*
+ * linux/fs/nfs/file.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * Changes Copyright (C) 1994 by Florian La Roche
+ * - Do not copy data too often around in the kernel.
+ * - In nfs_file_read the return value of kmalloc wasn't checked.
+ * - Put in a better version of read look-ahead buffering. Original idea
+ * and implementation by Wai S Kok elekokws@ee.nus.sg.
+ *
+ * Expire cache on write to a file by Wai S Kok (Oct 1994).
+ *
+ * Total rewrite of read side for new NFS buffer cache.. Linus.
+ *
+ * nfs regular file handling functions
+ */
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/aio.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include "delegation.h"
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_FILE
+
+static const struct vm_operations_struct nfs_file_vm_ops;
+
+/* Inode operations for regular NFS files: permission, getattr and setattr. */
+const struct inode_operations nfs_file_inode_operations = {
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+};
+
+#ifdef CONFIG_NFS_V3
+/*
+ * Inode operations for regular NFSv3 files: the same entries as
+ * nfs_file_inode_operations plus the nfs3 xattr handlers.
+ */
+const struct inode_operations nfs3_file_inode_operations = {
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
+ .listxattr = nfs3_listxattr,
+ .getxattr = nfs3_getxattr,
+ .setxattr = nfs3_setxattr,
+ .removexattr = nfs3_removexattr,
+};
+#endif /* CONFIG_NFS_V3 */
+
+/* Hack for future NFS swap support */
+#ifndef IS_SWAPFILE
+# define IS_SWAPFILE(inode) (0)
+#endif
+
+/*
+ * Validate open flags: O_APPEND together with O_DIRECT is rejected.
+ *
+ * Returns -EINVAL when both bits are set in flags, 0 otherwise.
+ */
+static int nfs_check_flags(int flags)
+{
+	const int forbidden = O_APPEND | O_DIRECT;
+
+	return ((flags & forbidden) == forbidden) ? -EINVAL : 0;
+}
+
+/*
+ * Open a regular file: count the open in the VFS statistics, reject
+ * unsupported flag combinations, then hand over to nfs_open().
+ */
+static int
+nfs_file_open(struct inode *inode, struct file *filp)
+{
+	int err;
+
+	dprintk("NFS: open file(%s/%s)\n",
+			filp->f_path.dentry->d_parent->d_name.name,
+			filp->f_path.dentry->d_name.name);
+
+	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
+
+	err = nfs_check_flags(filp->f_flags);
+	if (err != 0)
+		return err;
+
+	return nfs_open(inode, filp);
+}
+
+/*
+ * Release a regular file: counts the release in the VFS statistics and
+ * returns the result of nfs_release().
+ */
+static int
+nfs_file_release(struct inode *inode, struct file *filp)
+{
+ dprintk("NFS: release(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
+ nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
+ return nfs_release(inode, filp);
+}
+
+/**
+ * nfs_revalidate_file_size - Revalidate the file size
+ * @inode: pointer to inode struct
+ * @filp: pointer to struct file
+ *
+ * Revalidates the file length. This is basically a wrapper around
+ * nfs_revalidate_inode() that takes into account the fact that we may
+ * have cached writes (in which case we don't care about the server's
+ * idea of what the file length is), or O_DIRECT (in which case we
+ * shouldn't trust the cache).
+ *
+ * Returns 0 when no revalidation is needed, otherwise the result of
+ * __nfs_revalidate_inode().
+ */
+static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ /* Delegated attributes skip revalidation, even for O_DIRECT files. */
+ if (nfs_have_delegated_attributes(inode))
+ goto out_noreval;
+
+ if (filp->f_flags & O_DIRECT)
+ goto force_reval;
+ if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+ goto force_reval;
+ if (nfs_attribute_timeout(inode))
+ goto force_reval;
+out_noreval:
+ return 0;
+force_reval:
+ return __nfs_revalidate_inode(server, inode);
+}
+
+/*
+ * llseek for NFS files.  For any origin other than SEEK_SET and SEEK_CUR
+ * the cached file size is revalidated first; a negative revalidation
+ * result is returned as the seek result.  The seek itself is done by
+ * generic_file_llseek().
+ */
+static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
+{
+ dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
+ offset, origin);
+
+ /*
+ * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
+ * the cached file length
+ */
+ if (origin != SEEK_SET && origin != SEEK_CUR) {
+ struct inode *inode = filp->f_mapping->host;
+
+ int retval = nfs_revalidate_file_size(inode, filp);
+ if (retval < 0)
+ return (loff_t)retval;
+ }
+
+ return generic_file_llseek(filp, offset, origin);
+}
+
+/*
+ * Flush all dirty pages, and check for write errors.
+ *
+ * Files not opened for writing return 0 without syncing; otherwise the
+ * result of vfs_fsync() is returned.  The @id argument is unused.
+ */
+static int
+nfs_file_flush(struct file *file, fl_owner_t id)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ dprintk("NFS: flush(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+ if ((file->f_mode & FMODE_WRITE) == 0)
+ return 0;
+
+ /* Flush writes to the server and return any errors */
+ return vfs_fsync(file, 0);
+}
+
+/*
+ * aio read entry point for regular NFS files.
+ *
+ * O_DIRECT files are passed straight to nfs_file_direct_read().
+ * Otherwise the mapping is revalidated and, if that succeeds, the read
+ * is served by generic_file_aio_read(); bytes actually read are added
+ * to the NFSIOS_NORMALREADBYTES statistic.
+ *
+ * Returns the number of bytes read, or the error from revalidation or
+ * the read itself.
+ */
+static ssize_t
+nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct dentry * dentry = iocb->ki_filp->f_path.dentry;
+	struct inode * inode = dentry->d_inode;
+	ssize_t result;
+
+	if (iocb->ki_filp->f_flags & O_DIRECT)
+		return nfs_file_direct_read(iocb, iov, nr_segs, pos);
+
+	/* Print pos as long long: loff_t is wider than long on 32-bit. */
+	dprintk("NFS: read(%s/%s, %lu@%lld)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		(unsigned long) iov_length(iov, nr_segs), (long long) pos);
+
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+	if (!result) {
+		result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+		if (result > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+	}
+	return result;
+}
+
+/*
+ * splice read entry point: revalidates the mapping and, if that
+ * succeeds, delegates to generic_file_splice_read().  Bytes actually
+ * read are added to the NFSIOS_NORMALREADBYTES statistic.
+ *
+ * Returns the result of the splice, or the revalidation error.
+ */
+static ssize_t
+nfs_file_splice_read(struct file *filp, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t count,
+ unsigned int flags)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ ssize_t res;
+
+ dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ (unsigned long) count, (unsigned long long) *ppos);
+
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
+ if (!res) {
+ res = generic_file_splice_read(filp, ppos, pipe, count, flags);
+ if (res > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
+ }
+ return res;
+}
+
+/*
+ * mmap entry point: sets up the mapping with generic_file_mmap(), then
+ * installs nfs_file_vm_ops and revalidates the page cache mapping.
+ *
+ * Returns the error from generic_file_mmap() if it fails, otherwise the
+ * result of nfs_revalidate_mapping().
+ */
+static int
+nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ int status;
+
+ dprintk("NFS: mmap(%s/%s)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name);
+
+ /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+ * so we call that before revalidating the mapping
+ */
+ status = generic_file_mmap(file, vma);
+ if (!status) {
+ vma->vm_ops = &nfs_file_vm_ops;
+ status = nfs_revalidate_mapping(inode, file->f_mapping);
+ }
+ return status;
+}
+
+/*
+ * Flush any dirty pages for this process, and check for write errors.
+ * The return status from this call provides a reliable indication of
+ * whether any write errors occurred for this process.
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occurred, and hence cause it to
+ * fall back to doing a synchronous write.
+ *
+ * The inode's i_mutex is held from after the initial range writeback
+ * until the function returns.
+ */
+static int
+nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct inode *inode = dentry->d_inode;
+ int have_error, status;
+ int ret = 0;
+
+ dprintk("NFS: fsync file(%s/%s) datasync %d\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ datasync);
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ mutex_lock(&inode->i_mutex);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
+ have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+ status = nfs_commit_inode(inode, FLUSH_SYNC);
+ /* A successful commit must not hide an earlier writeback error. */
+ if (status >= 0 && ret < 0)
+ status = ret;
+ /* Re-test the flag: an error may have been recorded during the commit. */
+ have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+ if (have_error)
+ ret = xchg(&ctx->error, 0);
+ if (!ret && status < 0)
+ ret = status;
+ if (!ret && !datasync)
+ /* application has asked for meta-data sync */
+ ret = pnfs_layoutcommit_inode(inode, true);
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * than a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer. In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ *
+ * Returns 1 if the page should be read in first, 0 otherwise.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned len)
+{
+ unsigned int pglen = nfs_page_length(page);
+ /* Byte offset of the write within the page, and one past its end. */
+ unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int end = offset + len;
+
+ if ((file->f_mode & FMODE_READ) && /* open for read? */
+ !PageUptodate(page) && /* Uptodate? */
+ !PagePrivate(page) && /* i/o request already? */
+ pglen && /* valid bytes of file? */
+ (end < pglen || offset)) /* replace all valid bytes? */
+ return 1;
+ return 0;
+}
+
+/*
+ * This does the "real" work of the write. We must allocate and lock the
+ * page to be sent back to the generic routine, which then copies the
+ * data from user space.
+ *
+ * If the writer ends up delaying the write, the writer needs to
+ * increment the page use counts until he is done with the page.
+ *
+ * Returns 0 with *pagep set on success.  On failure returns the error
+ * from the wait, -ENOMEM if no page could be grabbed, or the error from
+ * nfs_flush_incompatible() / nfs_readpage().
+ */
+static int nfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ /* Set once the pre-read has been tried, so it happens at most once. */
+ int once_thru = 0;
+
+ dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
+start:
+ /*
+ * Prevent starvation issues if someone is doing a consistency
+ * sync-to-disk
+ */
+ ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ return ret;
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ ret = nfs_flush_incompatible(file, page);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, page, pos, len)) {
+ once_thru = 1;
+ /*
+ * NOTE(review): the page is released but not unlocked here;
+ * presumably nfs_readpage() unlocks it -- confirm.
+ */
+ ret = nfs_readpage(file, page);
+ page_cache_release(page);
+ /* Start over to grab and lock the page again after the read. */
+ if (!ret)
+ goto start;
+ }
+ return ret;
+}
+
+/*
+ * Finish a write started by nfs_write_begin(): zero the parts of a
+ * not-up-to-date page that the write did not cover, pass the copied
+ * range to nfs_updatepage(), then unlock and release the page.
+ *
+ * Returns the number of bytes copied, or the negative status from
+ * nfs_updatepage().
+ */
+static int nfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ int status;
+
+ dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
+ /*
+ * Zero any uninitialised parts of the page, and then mark the page
+ * as up to date if it turns out that we're extending the file.
+ */
+ if (!PageUptodate(page)) {
+ unsigned pglen = nfs_page_length(page);
+ unsigned end = offset + len;
+
+ if (pglen == 0) {
+ /* No valid file data in this page: zero both sides of the write. */
+ zero_user_segments(page, 0, offset,
+ end, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ } else if (end >= pglen) {
+ /* The write reaches or passes the valid data: zero the tail. */
+ zero_user_segment(page, end, PAGE_CACHE_SIZE);
+ if (offset == 0)
+ SetPageUptodate(page);
+ } else
+ zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ }
+
+ status = nfs_updatepage(file, page, offset, copied);
+
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (status < 0)
+ return status;
+ return copied;
+}
+
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ * page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Does nothing for a partial invalidation (offset != 0)
+ */
+static void nfs_invalidate_page(struct page *page, unsigned long offset)
+{
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+
+ if (offset != 0)
+ return;
+ /* Cancel any unstarted writes on this page */
+ nfs_wb_page_cancel(page->mapping->host, page);
+
+ nfs_fscache_invalidate_page(page, page->mapping->host);
+}
+
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ * - The result of the commit attempt is ignored; the decision rests on
+ * PagePrivate() and nfs_fscache_release_page()
+ */
+static int nfs_release_page(struct page *page, gfp_t gfp)
+{
+ struct address_space *mapping = page->mapping;
+
+ dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+
+ /* Only do I/O if gfp is a superset of GFP_KERNEL */
+ if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
+ int how = FLUSH_SYNC;
+
+ /* Don't let kswapd deadlock waiting for OOM RPC calls */
+ if (current_is_kswapd())
+ how = 0;
+ nfs_commit_inode(mapping->host, how);
+ }
+ /* If PagePrivate() is set, then the page is not freeable */
+ if (PagePrivate(page))
+ return 0;
+ return nfs_fscache_release_page(page, gfp);
+}
+
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ * - Waits for any fscache write of the page, then returns the result of
+ * nfs_wb_page()
+ */
+static int nfs_launder_page(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+ inode->i_ino, (long long)page_offset(page));
+
+ nfs_fscache_wait_on_page_write(nfsi, page);
+ return nfs_wb_page(inode, page);
+}
+
+/*
+ * Address space operations for regular NFS files.  write_begin,
+ * write_end, invalidatepage, releasepage and launder_page are the
+ * functions defined above; the rest are defined elsewhere.
+ */
+const struct address_space_operations nfs_file_aops = {
+ .readpage = nfs_readpage,
+ .readpages = nfs_readpages,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+ .writepage = nfs_writepage,
+ .writepages = nfs_writepages,
+ .write_begin = nfs_write_begin,
+ .write_end = nfs_write_end,
+ .invalidatepage = nfs_invalidate_page,
+ .releasepage = nfs_release_page,
+ .direct_IO = nfs_direct_IO,
+ .migratepage = nfs_migrate_page,
+ .launder_page = nfs_launder_page,
+ .error_remove_page = generic_error_remove_page,
+};
+
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ *
+ * Returns VM_FAULT_LOCKED with the page still locked on success,
+ * VM_FAULT_NOPAGE if the page no longer belongs to this file's mapping
+ * or has zero valid length, and VM_FAULT_SIGBUS if the page could not
+ * be flushed or updated.
+ */
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct file *filp = vma->vm_file;
+ struct dentry *dentry = filp->f_path.dentry;
+ unsigned pagelen;
+ int ret = VM_FAULT_NOPAGE;
+ struct address_space *mapping;
+
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ filp->f_mapping->host->i_ino,
+ (long long)page_offset(page));
+
+ /* make sure the cache has finished storing the page */
+ nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
+ lock_page(page);
+ /* Check the mapping under the page lock; the page may have moved on. */
+ mapping = page->mapping;
+ if (mapping != dentry->d_inode->i_mapping)
+ goto out_unlock;
+
+ pagelen = nfs_page_length(page);
+ if (pagelen == 0)
+ goto out_unlock;
+
+ ret = VM_FAULT_LOCKED;
+ /* On success jump past the unlock so the page is returned locked. */
+ if (nfs_flush_incompatible(filp, page) == 0 &&
+ nfs_updatepage(filp, page, 0, pagelen) == 0)
+ goto out;
+
+ ret = VM_FAULT_SIGBUS;
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
+/* VM operations installed on NFS file mappings by nfs_file_mmap(). */
+static const struct vm_operations_struct nfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = nfs_vm_page_mkwrite,
+};
+
+/*
+ * Decide whether a write must be followed by a sync.
+ *
+ * Returns 1 if the inode is IS_SYNC(), the file was opened with O_DSYNC,
+ * or the open context has NFS_CONTEXT_ERROR_WRITE set; 0 otherwise.
+ */
+static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+{
+	struct nfs_open_context *open_ctx;
+
+	if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
+		return 1;
+
+	open_ctx = nfs_file_open_context(filp);
+	return test_bit(NFS_CONTEXT_ERROR_WRITE, &open_ctx->flags) ? 1 : 0;
+}
+
+/*
+ * aio write entry point for regular NFS files.
+ *
+ * O_DIRECT files are passed straight to nfs_file_direct_write().
+ * Otherwise the write goes through generic_file_aio_write() and is
+ * followed by vfs_fsync() when nfs_need_sync_write() says so.
+ *
+ * Returns the number of bytes written (0 for an empty request), -EBUSY
+ * for an active swap file, or the error from size revalidation, the
+ * write, or the sync.
+ */
+static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct dentry * dentry = iocb->ki_filp->f_path.dentry;
+ struct inode * inode = dentry->d_inode;
+ unsigned long written = 0;
+ ssize_t result;
+ size_t count = iov_length(iov, nr_segs);
+
+ if (iocb->ki_filp->f_flags & O_DIRECT)
+ return nfs_file_direct_write(iocb, iov, nr_segs, pos);
+
+ dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ (unsigned long) count, (long long) pos);
+
+ result = -EBUSY;
+ if (IS_SWAPFILE(inode))
+ goto out_swapfile;
+ /*
+ * O_APPEND implies that we must revalidate the file length.
+ */
+ if (iocb->ki_filp->f_flags & O_APPEND) {
+ result = nfs_revalidate_file_size(inode, iocb->ki_filp);
+ if (result)
+ goto out;
+ }
+
+ result = count;
+ if (!count)
+ goto out;
+
+ result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ if (result > 0)
+ written = result;
+
+ /* Return error values for O_DSYNC and IS_SYNC() */
+ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
+ int err = vfs_fsync(iocb->ki_filp, 0);
+ if (err < 0)
+ result = err;
+ }
+ /* Bytes are only counted when the final result is still a success. */
+ if (result > 0)
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+out:
+ return result;
+
+out_swapfile:
+ printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
+ goto out;
+}
+
+/*
+ * Splice data from a pipe into an NFS file via generic_file_splice_write(),
+ * then sync if nfs_need_sync_write() says so.
+ *
+ * Returns the number of bytes spliced or a negative errno from the splice or
+ * the follow-up fsync.
+ */
+static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
+				     struct file *filp, loff_t *ppos,
+				     size_t count, unsigned int flags)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	unsigned long nbytes;
+	ssize_t ret;
+
+	dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		(unsigned long) count, (unsigned long long) *ppos);
+
+	/*
+	 * The combination of splice and an O_APPEND destination is disallowed.
+	 */
+	ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
+	if (ret < 0)
+		return ret;
+	nbytes = ret;
+
+	if (nfs_need_sync_write(filp, inode)) {
+		int err = vfs_fsync(filp, 0);
+
+		if (err < 0)
+			return err;
+	}
+
+	/* only account bytes for splices that fully succeeded */
+	if (ret > 0)
+		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, nbytes);
+	return ret;
+}
+
+/*
+ * Test for a conflicting lock (F_GETLK style).
+ *
+ * The local lock table is consulted first.  If no local conflict exists and
+ * we either hold a read delegation or are using local locking, report
+ * F_UNLCK without asking the server; otherwise forward the request to the
+ * protocol's lock operation and return its status.
+ */
+static int
+do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	unsigned int requested_type = fl->fl_type;
+
+	/* Try local locking first */
+	posix_test_lock(filp, fl);
+	if (fl->fl_type != F_UNLCK)
+		return 0;	/* found a conflict */
+
+	/* posix_test_lock() overwrote the type; restore the caller's request */
+	fl->fl_type = requested_type;
+
+	if (nfs_have_delegation(inode, FMODE_READ) || is_local) {
+		fl->fl_type = F_UNLCK;
+		return 0;
+	}
+
+	return NFS_PROTO(inode)->lock(filp, cmd, fl);
+}
+
+/*
+ * Apply a lock through the local VFS lock manager, choosing the POSIX or
+ * flock flavour from fl->fl_flags.  Any other combination of the two flags
+ * is a BUG().
+ */
+static int do_vfs_lock(struct file *file, struct file_lock *fl)
+{
+	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+	case FL_POSIX:
+		return posix_lock_file_wait(file, fl);
+	case FL_FLOCK:
+		return flock_lock_file_wait(file, fl);
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+/*
+ * Release a lock.  Pending writes are flushed first (the flush result is
+ * deliberately not checked: the unlock must go ahead regardless), then the
+ * unlock is sent either to the server or to the local lock manager.
+ */
+static int
+do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+
+	/*
+	 * Flush all pending writes before doing anything
+	 * with locks..
+	 */
+	nfs_sync_mapping(filp->f_mapping);
+
+	/* NOTE: special case
+	 * 	If we're signalled while cleaning up locks on process exit, we
+	 * 	still need to complete the unlock.
+	 */
+
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (is_local)
+		return do_vfs_lock(filp, fl);
+	return NFS_PROTO(inode)->lock(filp, cmd, fl);
+}
+
+/*
+ * Return non-zero if the given time delta is no coarser than 1000ns
+ * (zero seconds and at most 1000 nanoseconds).
+ */
+static int
+is_time_granular(struct timespec *ts)
+{
+	if (ts->tv_sec != 0)
+		return 0;
+	return ts->tv_nsec <= 1000;
+}
+
+/*
+ * Acquire a lock.  Dirty data is flushed before the lock request is issued
+ * (to the server, or locally when @is_local is set).  Once the lock is held
+ * the cached state is brought up to date so that locking acts as a cache
+ * coherency point.
+ *
+ * Returns the flush error, the lock error, or the lock operation's status.
+ */
+static int
+do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int status;
+
+	/*
+	 * Flush all pending writes before doing anything
+	 * with locks..
+	 */
+	status = nfs_sync_mapping(filp->f_mapping);
+	if (status != 0)
+		return status;
+
+	/*
+	 * Use local locking if mounted with "-onolock" or with appropriate
+	 * "-olocal_lock="
+	 */
+	if (is_local)
+		status = do_vfs_lock(filp, fl);
+	else
+		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+	if (status < 0)
+		return status;
+
+	/*
+	 * Revalidate the cache if the server has time stamps granular
+	 * enough to detect subsecond changes.  Otherwise, clear the
+	 * cache to prevent missing any changes.
+	 *
+	 * This makes locking act as a cache coherency point.
+	 */
+	nfs_sync_mapping(filp->f_mapping);
+	if (nfs_have_delegation(inode, FMODE_READ))
+		return status;
+
+	if (is_time_granular(&NFS_SERVER(inode)->time_delta))
+		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	else
+		nfs_zap_caches(inode);
+	return status;
+}
+
+/*
+ * Lock a (portion of) a file
+ *
+ * Dispatches fcntl()-style requests to do_getlk(), do_unlk() or do_setlk().
+ * Returns -ENOLCK for a mandatory-lock inode (other than for unlock), a
+ * negative error from the protocol's bounds check, or the helper's status.
+ */
+static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int is_local;
+
+	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
+			filp->f_path.dentry->d_parent->d_name.name,
+			filp->f_path.dentry->d_name.name,
+			fl->fl_type, fl->fl_flags,
+			(long long)fl->fl_start, (long long)fl->fl_end);
+
+	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
+
+	/* No mandatory locks over NFS */
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	is_local = (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) ? 1 : 0;
+
+	if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
+		int err = NFS_PROTO(inode)->lock_check_bounds(fl);
+
+		if (err < 0)
+			return err;
+	}
+
+	if (IS_GETLK(cmd))
+		return do_getlk(filp, cmd, fl, is_local);
+	if (fl->fl_type == F_UNLCK)
+		return do_unlk(filp, cmd, fl, is_local);
+	return do_setlk(filp, cmd, fl, is_local);
+}
+
+/*
+ * Apply or remove a flock()-style lock on a file
+ *
+ * The request is rewritten to cover the whole file (0..OFFSET_MAX) and to be
+ * owned by the struct file, then handed to do_unlk() or do_setlk().
+ * Returns -ENOLCK if the request is not an FL_FLOCK lock.
+ */
+static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = filp->f_mapping->host;
+	int is_local;
+
+	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
+			filp->f_path.dentry->d_parent->d_name.name,
+			filp->f_path.dentry->d_name.name,
+			fl->fl_type, fl->fl_flags);
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+
+	is_local = (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) ? 1 : 0;
+
+	/* We're simulating flock() locks using posix locks on the server */
+	fl->fl_owner = (fl_owner_t)filp;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+
+	return fl->fl_type == F_UNLCK ?
+		do_unlk(filp, cmd, fl, is_local) :
+		do_setlk(filp, cmd, fl, is_local);
+}
+
+/*
+ * There is no protocol support for leases, so we have no way to implement
+ * them correctly in the face of opens by other clients.
+ *
+ * Every request is therefore logged and rejected with -EINVAL; @fl is not
+ * touched.
+ */
+static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
+{
+ dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name, arg);
+ return -EINVAL;
+}
+
+/*
+ * File operations table for NFS files.  The synchronous read/write slots use
+ * the generic do_sync_read()/do_sync_write() helpers; the remaining slots
+ * point at NFS-specific handlers.
+ */
+const struct file_operations nfs_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = nfs_file_read,
+ .aio_write = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = nfs_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs_setlease,
+};
+
+#ifdef CONFIG_NFS_V4
+/*
+ * ->open() handler for the NFSv4 file operations table.  It is not expected
+ * to be reached (see the comment in the body); it logs the call and always
+ * fails with -ENOTDIR.
+ */
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+ /*
+ * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
+ * this point, then something is very wrong
+ */
+ dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
+ return -ENOTDIR;
+}
+
+/*
+ * File operations table for NFSv4 files.  Identical to nfs_file_operations
+ * except that ->open is nfs4_file_open().
+ */
+const struct file_operations nfs4_file_operations = {
+ .llseek = nfs_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = nfs_file_read,
+ .aio_write = nfs_file_write,
+ .mmap = nfs_file_mmap,
+ .open = nfs4_file_open,
+ .flush = nfs_file_flush,
+ .release = nfs_file_release,
+ .fsync = nfs_file_fsync,
+ .lock = nfs_lock,
+ .flock = nfs_flock,
+ .splice_read = nfs_file_splice_read,
+ .splice_write = nfs_file_splice_write,
+ .check_flags = nfs_check_flags,
+ .setlease = nfs_setlease,
+};
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644
index 00000000000..7cf2c4699b0
--- /dev/null
+++ b/fs/nfs/fscache-index.c
@@ -0,0 +1,337 @@
+/* NFS FS-Cache index structure definition
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+
+#include "internal.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_FSCACHE
+
+/*
+ * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks
+ * the cookie for the top-level index object for NFS into here. The top-level
+ * index can then have other cache objects inserted into it.
+ */
+struct fscache_netfs nfs_fscache_netfs = {
+ .name = "nfs",
+ .version = 0,
+};
+
+/*
+ * Register NFS for caching
+ * - Returns whatever fscache_register_netfs() returns for nfs_fscache_netfs
+ */
+int nfs_fscache_register(void)
+{
+ return fscache_register_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Unregister NFS for caching
+ * - Counterpart of nfs_fscache_register(); hands nfs_fscache_netfs back to
+ *   fscache_unregister_netfs()
+ */
+void nfs_fscache_unregister(void)
+{
+ fscache_unregister_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Layout of the key for an NFS server cache object.
+ *
+ * The fixed fields are followed by a single address.  Which union member is
+ * filled in, and therefore how long the key is, depends on the address family
+ * (see nfs_server_get_key()).
+ */
+struct nfs_server_key {
+ uint16_t nfsversion; /* NFS protocol version */
+ uint16_t family; /* address family */
+ uint16_t port; /* IP port */
+ union {
+ struct in_addr ipv4_addr; /* IPv4 address */
+ struct in6_addr ipv6_addr; /* IPv6 address */
+ } addr[0];
+};
+
+/*
+ * Generate a key to describe a server in the main NFS index
+ * - The key is the NFS version, address family and port, followed by the
+ *   IPv4 or IPv6 address of the server
+ * - We return the length of the key, or 0 if we can't generate one (unknown
+ *   address family, or the key does not fit in bufmax bytes)
+ */
+static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
+				   void *buffer, uint16_t bufmax)
+{
+	const struct nfs_client *clp = cookie_netfs_data;
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+	const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+	struct nfs_server_key *key = buffer;
+	uint16_t len = sizeof(struct nfs_server_key);
+
+	if (len > bufmax)
+		return 0;
+
+	/*
+	 * Zero the fixed part BEFORE filling it in: clearing afterwards would
+	 * wipe the version and family and make distinct servers share a key.
+	 * The memset also gives any structure padding a defined value.
+	 */
+	memset(key, 0, len);
+	key->nfsversion = clp->rpc_ops->version;
+	key->family = clp->cl_addr.ss_family;
+
+	switch (clp->cl_addr.ss_family) {
+	case AF_INET:
+		if (bufmax - len < sizeof(key->addr[0].ipv4_addr))
+			return 0;
+		key->port = sin->sin_port;
+		key->addr[0].ipv4_addr = sin->sin_addr;
+		len += sizeof(key->addr[0].ipv4_addr);
+		break;
+
+	case AF_INET6:
+		if (bufmax - len < sizeof(key->addr[0].ipv6_addr))
+			return 0;
+		key->port = sin6->sin6_port;
+		key->addr[0].ipv6_addr = sin6->sin6_addr;
+		len += sizeof(key->addr[0].ipv6_addr);
+		break;
+
+	default:
+		printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+		       clp->cl_addr.ss_family);
+		len = 0;
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * Define the server object for FS-Cache. This is used to describe a server
+ * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
+ * server address parameters.
+ *
+ * This is an index-type cookie; only a get_key operation is provided.
+ */
+const struct fscache_cookie_def nfs_fscache_server_index_def = {
+ .name = "NFS.server",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = nfs_server_get_key,
+};
+
+/*
+ * Generate a key to describe a superblock key in the main NFS index
+ * - The key is the fixed part of the nfs_fscache_key followed by its
+ *   uniquifier bytes
+ * - We return the length of the key, or 0 if it does not fit in bufmax
+ */
+static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
+				  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_server *nfss = cookie_netfs_data;
+	const struct nfs_fscache_key *key = nfss->fscache_key;
+	uint16_t len = sizeof(key->key) + key->key.uniq_len;
+
+	if (len > bufmax)
+		return 0;
+
+	memcpy(buffer, &key->key, sizeof(key->key));
+	memcpy(buffer + sizeof(key->key),
+	       key->key.uniquifier, key->key.uniq_len);
+	return len;
+}
+
+/*
+ * Define the superblock object for FS-Cache. This is used to describe a
+ * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS
+ * parameters that might cause a separate superblock.
+ *
+ * This is an index-type cookie; only a get_key operation is provided.
+ */
+const struct fscache_cookie_def nfs_fscache_super_index_def = {
+ .name = "NFS.super",
+ .type = FSCACHE_COOKIE_TYPE_INDEX,
+ .get_key = nfs_super_get_key,
+};
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode. This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ *
+ * The get_aux and check_aux operations zero the whole struct before filling
+ * it in and compare it with memcmp(), so every byte takes part in the
+ * comparison.
+ */
+struct nfs_fscache_inode_auxdata {
+ struct timespec mtime; /* copy of the inode's i_mtime */
+ struct timespec ctime; /* copy of the inode's i_ctime */
+ loff_t size; /* copy of the inode's i_size */
+ u64 change_attr; /* i_version for NFS version 4, else 0 */
+};
+
+/*
+ * Generate a key to describe an NFS inode in an NFS server's index
+ * - The key is the inode's NFS filehandle
+ * - We return the length of the key, or 0 if the filehandle does not fit in
+ *   bufmax bytes
+ */
+static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
+					  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+	uint16_t nsize = nfsi->fh.size;
+
+	/* never write past the end of the caller's buffer */
+	if (nsize > bufmax)
+		return 0;
+
+	/* use the inode's NFS filehandle as the key */
+	memcpy(buffer, nfsi->fh.data, nsize);
+	return nsize;
+}
+
+/*
+ * Get certain file attributes from the netfs data
+ * - This function can be absent for an index
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ * - The only attribute reported is the file size, taken from the inode's
+ *   i_size
+ */
+static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
+ uint64_t *size)
+{
+ const struct nfs_inode *nfsi = cookie_netfs_data;
+
+ *size = nfsi->vfs_inode.i_size;
+}
+
+/*
+ * Get the auxiliary data from netfs data
+ * - This function can be absent if the index carries no state data
+ * - Should store the auxiliary data in the buffer
+ * - Should return the amount of data stored
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ * - At most bufmax bytes are copied; a short buffer gets a truncated copy
+ */
+static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
+					  void *buffer, uint16_t bufmax)
+{
+	const struct nfs_inode *nfsi = cookie_netfs_data;
+	const struct inode *inode = &nfsi->vfs_inode;
+	struct nfs_fscache_inode_auxdata snapshot;
+	uint16_t copied;
+
+	/* start from all-zeroes so unused bytes compare equal later on */
+	memset(&snapshot, 0, sizeof(snapshot));
+	snapshot.size = inode->i_size;
+	snapshot.mtime = inode->i_mtime;
+	snapshot.ctime = inode->i_ctime;
+
+	if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+		snapshot.change_attr = inode->i_version;
+
+	copied = bufmax > sizeof(snapshot) ? sizeof(snapshot) : bufmax;
+	memcpy(buffer, &snapshot, copied);
+	return copied;
+}
+
+/*
+ * Consult the netfs about the state of an object
+ * - This function can be absent if the index carries no state data
+ * - The netfs data from the cookie being used as the target is
+ *   presented, as is the auxiliary data
+ * - The stored data is obsolete if its length is not that of the auxdata
+ *   struct, or if it differs from a snapshot built from the inode now
+ */
+static
+enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
+						  const void *data,
+						  uint16_t datalen)
+{
+	struct nfs_inode *nfsi = cookie_netfs_data;
+	struct inode *inode = &nfsi->vfs_inode;
+	struct nfs_fscache_inode_auxdata expected;
+
+	if (datalen != sizeof(expected))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	/* build the snapshot the same way the get_aux operation does */
+	memset(&expected, 0, sizeof(expected));
+	expected.size = inode->i_size;
+	expected.mtime = inode->i_mtime;
+	expected.ctime = inode->i_ctime;
+
+	if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+		expected.change_attr = inode->i_version;
+
+	return memcmp(data, &expected, datalen) == 0 ?
+		FSCACHE_CHECKAUX_OKAY : FSCACHE_CHECKAUX_OBSOLETE;
+}
+
+/*
+ * Indication from FS-Cache that the cookie is no longer cached
+ * - This function is called when the backing store currently caching a cookie
+ * is removed
+ * - The netfs should use this to clean up any markers indicating cached pages
+ * - This is mandatory for any object that may have data
+ *
+ * Walks the inode's page cache one pagevec at a time and clears the FsCache
+ * flag on every page found.
+ */
+static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+ struct nfs_inode *nfsi = cookie_netfs_data;
+ struct pagevec pvec;
+ pgoff_t first;
+ int loop, nr_pages;
+
+ pagevec_init(&pvec, 0);
+ first = 0;
+
+ dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
+
+ for (;;) {
+ /* grab a bunch of pages to unmark */
+ nr_pages = pagevec_lookup(&pvec,
+ nfsi->vfs_inode.i_mapping,
+ first,
+ PAGEVEC_SIZE - pagevec_count(&pvec));
+ if (!nr_pages)
+ break;
+
+ for (loop = 0; loop < nr_pages; loop++)
+ ClearPageFsCache(pvec.pages[loop]);
+
+ /* resume the next lookup just past the last page handled */
+ first = pvec.pages[nr_pages - 1]->index + 1;
+
+ /* release the pages before possibly rescheduling */
+ pvec.nr = nr_pages;
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+/*
+ * Get an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ * context.
+ * - The read context is passed back to NFS in the event that a data read on the
+ * cache fails with EIO - in which case the server must be contacted to
+ * retrieve the data, which requires the read context for security.
+ * - The context is passed straight to get_nfs_open_context();
+ *   cookie_netfs_data is unused.
+ */
+static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
+{
+ get_nfs_open_context(context);
+}
+
+/*
+ * Release an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ * context.
+ * - A NULL context is ignored; cookie_netfs_data is unused.
+ */
+static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
+{
+ if (context)
+ put_nfs_open_context(context);
+}
+
+/*
+ * Define the inode object for FS-Cache. This is used to describe an inode
+ * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for
+ * an inode.
+ *
+ * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
+ * held in the cache auxiliary data for the data storage object with those in
+ * the inode struct in memory.  For NFS version 4 the inode's i_version is
+ * compared as well.
+ */
+const struct fscache_cookie_def nfs_fscache_inode_object_def = {
+ .name = "NFS.fh",
+ .type = FSCACHE_COOKIE_TYPE_DATAFILE,
+ .get_key = nfs_fscache_inode_get_key,
+ .get_attr = nfs_fscache_inode_get_attr,
+ .get_aux = nfs_fscache_inode_get_aux,
+ .check_aux = nfs_fscache_inode_check_aux,
+ .now_uncached = nfs_fscache_inode_now_uncached,
+ .get_context = nfs_fh_get_context,
+ .put_context = nfs_fh_put_context,
+};
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644
index 00000000000..419119c371b
--- /dev/null
+++ b/fs/nfs/fscache.c
@@ -0,0 +1,535 @@
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY NFSDBG_FSCACHE
+
+/*
+ * Tree of the superblock keys currently in use, used to reject duplicate
+ * keys.  Insertions and removals are done under nfs_fscache_keys_lock.
+ */
+static struct rb_root nfs_fscache_keys = RB_ROOT;
+static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
+
+/*
+ * Get the per-client index cookie for an NFS client if the appropriate mount
+ * flag was set
+ * - We always try and get an index cookie for the client, but get filehandle
+ * cookies on a per-superblock basis, depending on the mount flags
+ * - The cookie is acquired under the primary index of nfs_fscache_netfs and
+ *   stored in clp->fscache
+ */
+void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+{
+ /* create a cache index for looking up filehandles */
+ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
+ &nfs_fscache_server_index_def,
+ clp);
+ dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
+ clp, clp->fscache);
+}
+
+/*
+ * Dispose of a per-client cookie
+ * - The cookie is relinquished without retiring it (second argument 0) and
+ *   clp->fscache is reset to NULL
+ */
+void nfs_fscache_release_client_cookie(struct nfs_client *clp)
+{
+ dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
+ clp, clp->fscache);
+
+ fscache_relinquish_cookie(clp->fscache, 0);
+ clp->fscache = NULL;
+}
+
+/*
+ * Get the cache cookie for an NFS superblock. We have to handle
+ * uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
+ *
+ * On success the key is linked into nfs_fscache_keys and stored in
+ * nfss->fscache_key, and the superblock cookie is stored in nfss->fscache.
+ * If the key allocation fails nothing is changed.  If an identical key is
+ * already registered, both fields are set to NULL and a warning is printed.
+ */
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
+ struct nfs_clone_mount *mntdata)
+{
+ struct nfs_fscache_key *key, *xkey;
+ struct nfs_server *nfss = NFS_SB(sb);
+ struct rb_node **p, *parent;
+ int diff, ulen;
+
+ /* pick the uniquifier: explicit, inherited from the parent, or default */
+ if (uniq) {
+ ulen = strlen(uniq);
+ } else if (mntdata) {
+ struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
+ if (mnt_s->fscache_key) {
+ uniq = mnt_s->fscache_key->key.uniquifier;
+ ulen = mnt_s->fscache_key->key.uniq_len;
+ }
+ }
+
+ /* default: empty string, stored with its terminating NUL (length 1) */
+ if (!uniq) {
+ uniq = "";
+ ulen = 1;
+ }
+
+ key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
+ if (!key)
+ return;
+
+ key->nfs_client = nfss->nfs_client;
+ key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
+ key->key.nfs_server.flags = nfss->flags;
+ key->key.nfs_server.rsize = nfss->rsize;
+ key->key.nfs_server.wsize = nfss->wsize;
+ key->key.nfs_server.acregmin = nfss->acregmin;
+ key->key.nfs_server.acregmax = nfss->acregmax;
+ key->key.nfs_server.acdirmin = nfss->acdirmin;
+ key->key.nfs_server.acdirmax = nfss->acdirmax;
+ key->key.nfs_server.fsid = nfss->fsid;
+ key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
+
+ /* NOTE(review): uniq_len is a u8 while ulen is an int; a uniquifier longer
+ * than 255 bytes would be truncated here - confirm callers bound it */
+ key->key.uniq_len = ulen;
+ memcpy(key->key.uniquifier, uniq, ulen);
+
+ /*
+ * Find the insertion point in the tree.  Keys are ordered by client
+ * pointer, then by the fixed key bytes, then by the uniquifier bytes.
+ */
+ spin_lock(&nfs_fscache_keys_lock);
+ p = &nfs_fscache_keys.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ xkey = rb_entry(parent, struct nfs_fscache_key, node);
+
+ if (key->nfs_client < xkey->nfs_client)
+ goto go_left;
+ if (key->nfs_client > xkey->nfs_client)
+ goto go_right;
+
+ diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
+ if (diff < 0)
+ goto go_left;
+ if (diff > 0)
+ goto go_right;
+
+ /* the fixed parts (including uniq_len) match: compare uniquifiers */
+ if (key->key.uniq_len == 0)
+ goto non_unique;
+ diff = memcmp(key->key.uniquifier,
+ xkey->key.uniquifier,
+ key->key.uniq_len);
+ if (diff < 0)
+ goto go_left;
+ if (diff > 0)
+ goto go_right;
+ goto non_unique;
+
+ go_left:
+ p = &(*p)->rb_left;
+ continue;
+ go_right:
+ p = &(*p)->rb_right;
+ }
+
+ /* no equal key found: link the new one in at the leaf position */
+ rb_link_node(&key->node, parent, p);
+ rb_insert_color(&key->node, &nfs_fscache_keys);
+ spin_unlock(&nfs_fscache_keys_lock);
+ nfss->fscache_key = key;
+
+ /* create a cache index for looking up filehandles */
+ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
+ &nfs_fscache_super_index_def,
+ nfss);
+ dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
+ nfss, nfss->fscache);
+ return;
+
+non_unique:
+ /* an identical key is already registered: drop ours and disable caching */
+ spin_unlock(&nfs_fscache_keys_lock);
+ kfree(key);
+ nfss->fscache_key = NULL;
+ nfss->fscache = NULL;
+ printk(KERN_WARNING "NFS:"
+ " Cache request denied due to non-unique superblock keys\n");
+}
+
+/*
+ * release a per-superblock cookie
+ * - The cookie is relinquished without being retired
+ * - The superblock's key, if it has one, is unlinked from the key tree and
+ *   freed
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+	struct nfs_server *nfss = NFS_SB(sb);
+	struct nfs_fscache_key *key;
+
+	dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
+		 nfss, nfss->fscache);
+
+	fscache_relinquish_cookie(nfss->fscache, 0);
+	nfss->fscache = NULL;
+
+	key = nfss->fscache_key;
+	if (!key)
+		return;
+
+	spin_lock(&nfs_fscache_keys_lock);
+	rb_erase(&key->node, &nfs_fscache_keys);
+	spin_unlock(&nfs_fscache_keys_lock);
+	kfree(key);
+	nfss->fscache_key = NULL;
+}
+
+/*
+ * Initialise the per-inode cache cookie pointer for an NFS inode.
+ * - No cookie is acquired here; the pointer is just cleared
+ * - Regular files additionally get NFS_INO_FSCACHE set in the inode flags
+ */
+void nfs_fscache_init_inode_cookie(struct inode *inode)
+{
+ NFS_I(inode)->fscache = NULL;
+ if (S_ISREG(inode->i_mode))
+ set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+}
+
+/*
+ * Get the per-inode cache cookie for an NFS inode.
+ * - Does nothing if the inode already has a cookie, if NFS_FSCACHE() is
+ *   false for it, or if the superblock lacks NFS_OPTION_FSCACHE
+ * - The cookie is acquired under the superblock's index cookie
+ */
+static void nfs_fscache_enable_inode_cookie(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (nfsi->fscache || !NFS_FSCACHE(inode))
+		return;
+
+	if (!(NFS_SB(sb)->options & NFS_OPTION_FSCACHE))
+		return;
+
+	nfsi->fscache = fscache_acquire_cookie(NFS_SB(sb)->fscache,
+					       &nfs_fscache_inode_object_def,
+					       nfsi);
+
+	dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
+		 sb, nfsi, nfsi->fscache);
+}
+
+/*
+ * Release a per-inode cookie.
+ * - The cookie is relinquished without being retired (second argument 0);
+ *   compare nfs_fscache_zap_inode_cookie()
+ */
+void nfs_fscache_release_inode_cookie(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
+ nfsi, nfsi->fscache);
+
+ fscache_relinquish_cookie(nfsi->fscache, 0);
+ nfsi->fscache = NULL;
+}
+
+/*
+ * Retire a per-inode cookie, destroying the data attached to it.
+ * - Same as nfs_fscache_release_inode_cookie() except that the cookie is
+ *   relinquished with the retire argument set to 1
+ */
+void nfs_fscache_zap_inode_cookie(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
+ nfsi, nfsi->fscache);
+
+ fscache_relinquish_cookie(nfsi->fscache, 1);
+ nfsi->fscache = NULL;
+}
+
+/*
+ * Turn off the cache with regard to a per-inode cookie if opened for writing,
+ * invalidating all the pages in the page cache relating to the associated
+ * inode to clear the per-page caching.
+ * - NFS_INO_FSCACHE is always cleared; the rest only happens if the inode
+ *   currently has a cookie
+ */
+static void nfs_fscache_disable_inode_cookie(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+
+	if (!nfsi->fscache)
+		return;
+
+	dfprintk(FSCACHE,
+		 "NFS: nfsi 0x%p turning cache off\n", nfsi);
+
+	/* Need to uncache any pages attached to this inode that
+	 * fscache knows about before turning off the cache.
+	 */
+	fscache_uncache_all_inode_pages(nfsi->fscache, inode);
+	nfs_fscache_zap_inode_cookie(inode);
+}
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ * - Just yields the CPU and returns 0; the flags argument is unused
+ */
+static int nfs_fscache_wait_bit(void *flags)
+{
+ schedule();
+ return 0;
+}
+
+/*
+ * Lock against someone else trying to also acquire or relinquish a cookie
+ * - The lock is the NFS_INO_FSCACHE_LOCK bit in the inode flags
+ * - If the bit is already set we sleep uninterruptibly until it is cleared,
+ *   then try to take it again
+ */
+static inline void nfs_fscache_inode_lock(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
+ wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
+ nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Unlock cookie management lock
+ * - Clears NFS_INO_FSCACHE_LOCK, with memory barriers on both sides of the
+ *   clear, and then wakes anyone sleeping in nfs_fscache_inode_lock()
+ */
+static inline void nfs_fscache_inode_unlock(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ smp_mb__before_clear_bit();
+ clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
+}
+
+/*
+ * Decide if we should enable or disable local caching for this inode.
+ * - For now, with NFS, only regular files that are open read-only will be able
+ *   to use the cache.
+ * - May be invoked multiple times in parallel by parallel nfs_open() functions.
+ * - Nothing is done unless NFS_FSCACHE() is true for the inode.
+ */
+void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+	if (!NFS_FSCACHE(inode))
+		return;
+
+	nfs_fscache_inode_lock(inode);
+	if ((filp->f_flags & O_ACCMODE) == O_RDONLY)
+		nfs_fscache_enable_inode_cookie(inode);
+	else
+		nfs_fscache_disable_inode_cookie(inode);
+	nfs_fscache_inode_unlock(inode);
+}
+
+/*
+ * Replace a per-inode cookie due to revalidation detecting a file having
+ * changed on the server.
+ * - Nothing is done if the inode has no cookie
+ * - The old cookie is retired and a fresh one is acquired under the
+ *   superblock's index cookie, the same parent that
+ *   nfs_fscache_enable_inode_cookie() uses
+ */
+void nfs_fscache_reset_inode_cookie(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	struct fscache_cookie *old;
+
+	nfs_fscache_inode_lock(inode);
+	/* sample the cookie under the lock so the debug output is accurate */
+	old = nfsi->fscache;
+	if (old) {
+		/* retire the current fscache cache and get a new one */
+		fscache_relinquish_cookie(old, 1);
+
+		/*
+		 * Inode cookies live in the per-superblock index, not directly
+		 * in the per-client index.
+		 */
+		nfsi->fscache = fscache_acquire_cookie(
+			nfss->fscache,
+			&nfs_fscache_inode_object_def,
+			nfsi);
+
+		dfprintk(FSCACHE,
+			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
+			 nfss, nfsi, old, nfsi->fscache);
+	}
+	nfs_fscache_inode_unlock(inode);
+}
+
+/*
+ * Release the caching state associated with a page, if the page isn't busy
+ * interacting with the cache.
+ * - Returns true (can release page) or false (page busy).
+ * - Pages without the FsCache flag can always be released.
+ */
+int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct nfs_inode *nfsi;
+	struct fscache_cookie *cookie;
+
+	if (!PageFsCache(page))
+		return 1;
+
+	nfsi = NFS_I(page->mapping->host);
+	cookie = nfsi->fscache;
+
+	BUG_ON(!cookie);
+	dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
+		 cookie, page, nfsi);
+
+	if (!fscache_maybe_release_page(cookie, page, gfp))
+		return 0;
+
+	nfs_add_fscache_stats(page->mapping->host,
+			      NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+	return 1;
+}
+
+/*
+ * Release the caching state associated with a page if undergoing complete page
+ * invalidation.
+ * - The inode must have a cookie and the page must be locked (both are
+ *   enforced with BUG_ON)
+ * - Waits for any write of the page to the cache to finish before uncaching
+ */
+void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct fscache_cookie *cookie = nfsi->fscache;
+
+ BUG_ON(!cookie);
+
+ dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
+ cookie, page, nfsi);
+
+ fscache_wait_on_page_write(cookie, page);
+
+ BUG_ON(!PageLocked(page));
+ fscache_uncache_page(cookie, page);
+ nfs_add_fscache_stats(page->mapping->host,
+ NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+}
+
+/*
+ * Handle completion of a page being read from the cache.
+ * - Called in process (keventd) context.
+ * - On success the page is marked up to date and unlocked.
+ * - On failure the read is reissued through nfs_readpage_async() using the
+ *   supplied context; the page is only unlocked here if that reissue fails
+ *   as well.
+ */
+static void nfs_readpage_from_fscache_complete(struct page *page,
+					       void *context,
+					       int error)
+{
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
+		 page, context, error);
+
+	if (error) {
+		/* the cache read failed: fall back to reading from the server */
+		if (nfs_readpage_async(context, page->mapping->host, page))
+			unlock_page(page);
+		return;
+	}
+
+	SetPageUptodate(page);
+	unlock_page(page);
+}
+
+/*
+ * Retrieve a page from fscache
+ * - Returns 0 if a read was submitted to the cache, 1 if the inode or page
+ *   is not in the cache (-ENOBUFS / -ENODATA), or the error returned by
+ *   fscache_read_or_alloc_page() otherwise
+ */
+int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+				struct inode *inode, struct page *page)
+{
+	int ret;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
+		 NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+
+	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+					 page,
+					 nfs_readpage_from_fscache_complete,
+					 ctx,
+					 GFP_KERNEL);
+
+	if (ret == 0) {
+		/* read BIO submitted (page in fscache) */
+		dfprintk(FSCACHE,
+			 "NFS: readpage_from_fscache: BIO submitted\n");
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
+		return ret;
+	}
+
+	if (ret == -ENOBUFS || ret == -ENODATA) {
+		/* inode not in cache / page not in cache */
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+		dfprintk(FSCACHE,
+			 "NFS: readpage_from_fscache %d\n", ret);
+		return 1;
+	}
+
+	dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret);
+	nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+	return ret;
+}
+
+/*
+ * Retrieve a set of pages from fscache
+ * - On return *nr_pages holds the number of pages that were NOT handed to
+ *   the cache for reading; the difference from the initial count is
+ *   accounted as READ_OK and the remainder as READ_FAIL
+ * - Returns 0 if reads were submitted for all pages, 1 if some pages are not
+ *   (or cannot be) cached, or the error returned by
+ *   fscache_read_or_alloc_pages() otherwise
+ */
+int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+				 struct inode *inode,
+				 struct address_space *mapping,
+				 struct list_head *pages,
+				 unsigned *nr_pages)
+{
+	unsigned npages = *nr_pages;
+	int ret;
+
+	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
+		 NFS_I(inode)->fscache, npages, inode);
+
+	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+					  mapping, pages, nr_pages,
+					  nfs_readpage_from_fscache_complete,
+					  ctx,
+					  mapping_gfp_mask(mapping));
+
+	/*
+	 * Only the pages actually taken by the cache count as read OK; the
+	 * ones left over are counted as failures below and must not be
+	 * counted twice.
+	 */
+	if (*nr_pages < npages)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
+				      npages - *nr_pages);
+	if (*nr_pages > 0)
+		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
+				      *nr_pages);
+
+	switch (ret) {
+	case 0: /* read submitted to the cache for all pages */
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: submitted\n");
+
+		return ret;
+
+	case -ENOBUFS: /* some pages aren't cached and can't be */
+	case -ENODATA: /* some pages aren't cached */
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
+		return 1;
+
+	default:
+		dfprintk(FSCACHE,
+			 "NFS: nfs_getpages_from_fscache: ret %d\n", ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Store a newly fetched page in fscache
+ * - PG_fscache must be set on the page
+ * - If the write cannot be started the page is uncached again and the
+ *   failure is recorded in the statistics; no error is returned
+ */
+void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+{
+	int ret;
+
+	dfprintk(FSCACHE,
+		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
+		 NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+
+	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+	dfprintk(FSCACHE,
+		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+		 page, page->index, page->flags, ret);
+
+	if (ret == 0) {
+		nfs_add_fscache_stats(inode,
+				      NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
+		return;
+	}
+
+	fscache_uncache_page(NFS_I(inode)->fscache, page);
+	nfs_add_fscache_stats(inode,
+			      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
+	nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644
index 00000000000..b9c572d0679
--- /dev/null
+++ b/fs/nfs/fscache.h
@@ -0,0 +1,222 @@
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * set of NFS FS-Cache objects that form a superblock key
+ */
+struct nfs_fscache_key {
+ struct rb_node node;
+ struct nfs_client *nfs_client; /* the server */
+
+ /* the elements of the unique key - as used by nfs_compare_super() and
+ * nfs_compare_mount_options() to distinguish superblocks */
+ struct {
+ struct {
+ unsigned long s_flags; /* various flags
+ * (& NFS_MS_MASK) */
+ } super;
+
+ struct {
+ struct nfs_fsid fsid;
+ int flags;
+ unsigned int rsize; /* read size */
+ unsigned int wsize; /* write size */
+ unsigned int acregmin; /* attr cache timeouts */
+ unsigned int acregmax;
+ unsigned int acdirmin;
+ unsigned int acdirmax;
+ } nfs_server;
+
+ struct {
+ rpc_authflavor_t au_flavor;
+ } rpc_auth;
+
+ /* uniquifier - can be used if nfs_server.flags includes
+ * NFS_MOUNT_UNSHARED */
+ u8 uniq_len;
+ char uniquifier[0];
+ } key;
+};
+
+/*
+ * fscache-index.c
+ */
+extern struct fscache_netfs nfs_fscache_netfs;
+extern const struct fscache_cookie_def nfs_fscache_server_index_def;
+extern const struct fscache_cookie_def nfs_fscache_super_index_def;
+extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
+
+extern int nfs_fscache_register(void);
+extern void nfs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void nfs_fscache_get_client_cookie(struct nfs_client *);
+extern void nfs_fscache_release_client_cookie(struct nfs_client *);
+
+extern void nfs_fscache_get_super_cookie(struct super_block *,
+ const char *,
+ struct nfs_clone_mount *);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode_cookie(struct inode *);
+extern void nfs_fscache_release_inode_cookie(struct inode *);
+extern void nfs_fscache_zap_inode_cookie(struct inode *);
+extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void nfs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
+extern int nfs_fscache_release_page(struct page *, gfp_t);
+
+extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
+ struct inode *, struct page *);
+extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
+ struct inode *, struct address_space *,
+ struct list_head *, unsigned *);
+extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+ struct page *page)
+{
+ if (PageFsCache(page))
+ fscache_wait_on_page_write(nfsi->fscache, page);
+}
+
+/*
+ * release the caching state associated with a page if undergoing complete page
+ * invalidation
+ */
+static inline void nfs_fscache_invalidate_page(struct page *page,
+ struct inode *inode)
+{
+ if (PageFsCache(page))
+ __nfs_fscache_invalidate_page(page, inode);
+}
+
+/*
+ * Retrieve a page from an inode data storage object.
+ */
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct page *page)
+{
+ if (NFS_I(inode)->fscache)
+ return __nfs_readpage_from_fscache(ctx, inode, page);
+ return -ENOBUFS;
+}
+
+/*
+ * Retrieve a set of pages from an inode data storage object.
+ */
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ if (NFS_I(inode)->fscache)
+ return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
+ nr_pages);
+ return -ENOBUFS;
+}
+
+/*
+ * Store a page newly fetched from the server in an inode data storage object
+ * in the cache.
+ */
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+ struct page *page,
+ int sync)
+{
+ if (PageFsCache(page))
+ __nfs_readpage_to_fscache(inode, page, sync);
+}
+
+/*
+ * indicate the client caching state as readable text
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+ return "yes";
+ return "no ";
+}
+
+
+#else /* CONFIG_NFS_FSCACHE */
+static inline int nfs_fscache_register(void) { return 0; }
+static inline void nfs_fscache_unregister(void) {}
+
+static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
+static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
+
+static inline void nfs_fscache_get_super_cookie(
+ struct super_block *sb,
+ const char *uniq,
+ struct nfs_clone_mount *mntdata)
+{
+}
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
+ struct file *filp) {}
+static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ return 1; /* True: may release page */
+}
+static inline void nfs_fscache_invalidate_page(struct page *page,
+ struct inode *inode) {}
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+ struct page *page) {}
+
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct page *page)
+{
+ return -ENOBUFS;
+}
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned *nr_pages)
+{
+ return -ENOBUFS;
+}
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+ struct page *page, int sync) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+ return "no ";
+}
+
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
new file mode 100644
index 00000000000..dcb61548887
--- /dev/null
+++ b/fs/nfs/getroot.c
@@ -0,0 +1,267 @@
+/* getroot.c: get the root dentry for an NFS mount
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_CLIENT
+
+/*
+ * Set the superblock root dentry.
+ * Note that this function frees the inode in case of error.
+ */
+static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode)
+{
+ /* The mntroot acts as the dummy root dentry for this superblock */
+ if (sb->s_root == NULL) {
+ sb->s_root = d_alloc_root(inode);
+ if (sb->s_root == NULL) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ ihold(inode);
+ /*
+ * Ensure that this dentry is invisible to d_find_alias().
+ * Otherwise, it may be spliced into the tree by
+ * d_materialise_unique if a parent directory from the same
+ * filesystem gets mounted at a later time.
+ * This again causes shrink_dcache_for_umount_subtree() to
+ * Oops, since the test for IS_ROOT() will fail.
+ */
+ spin_lock(&sb->s_root->d_inode->i_lock);
+ spin_lock(&sb->s_root->d_lock);
+ list_del_init(&sb->s_root->d_alias);
+ spin_unlock(&sb->s_root->d_lock);
+ spin_unlock(&sb->s_root->d_inode->i_lock);
+ }
+ return 0;
+}
+
+/*
+ * get an NFS2/NFS3 root dentry from the root filehandle
+ */
+struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
+ const char *devname)
+{
+ struct nfs_server *server = NFS_SB(sb);
+ struct nfs_fsinfo fsinfo;
+ struct dentry *ret;
+ struct inode *inode;
+ void *name = kstrdup(devname, GFP_KERNEL);
+ int error;
+
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+
+ /* get the actual root for this mount */
+ fsinfo.fattr = nfs_alloc_fattr();
+ if (fsinfo.fattr == NULL) {
+ kfree(name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+ if (error < 0) {
+ dprintk("nfs_get_root: getattr error = %d\n", -error);
+ ret = ERR_PTR(error);
+ goto out;
+ }
+
+ inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+ if (IS_ERR(inode)) {
+ dprintk("nfs_get_root: get root inode failed\n");
+ ret = ERR_CAST(inode);
+ goto out;
+ }
+
+ error = nfs_superblock_set_dummy_root(sb, inode);
+ if (error != 0) {
+ ret = ERR_PTR(error);
+ goto out;
+ }
+
+ /* root dentries normally start off anonymous and get spliced in later
+ * if the dentry tree reaches them; however if the dentry already
+ * exists, we'll pick it up at this point and use it as the root
+ */
+ ret = d_obtain_alias(inode);
+ if (IS_ERR(ret)) {
+ dprintk("nfs_get_root: get root dentry failed\n");
+ goto out;
+ }
+
+ security_d_instantiate(ret, inode);
+ spin_lock(&ret->d_lock);
+ if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+ ret->d_fsdata = name;
+ name = NULL;
+ }
+ spin_unlock(&ret->d_lock);
+out:
+ if (name)
+ kfree(name);
+ nfs_free_fattr(fsinfo.fattr);
+ return ret;
+}
+
+#ifdef CONFIG_NFS_V4
+
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
+{
+ struct nfs_fsinfo fsinfo;
+ int ret = -ENOMEM;
+
+ dprintk("--> nfs4_get_rootfh()\n");
+
+ fsinfo.fattr = nfs_alloc_fattr();
+ if (fsinfo.fattr == NULL)
+ goto out;
+
+ /* Start by getting the root filehandle from the server */
+ ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+ if (ret < 0) {
+ dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
+ goto out;
+ }
+
+ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
+ || !S_ISDIR(fsinfo.fattr->mode)) {
+ printk(KERN_ERR "nfs4_get_rootfh:"
+ " getroot encountered non-directory\n");
+ ret = -ENOTDIR;
+ goto out;
+ }
+
+ if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+ printk(KERN_ERR "nfs4_get_rootfh:"
+ " getroot obtained referral\n");
+ ret = -EREMOTE;
+ goto out;
+ }
+
+ memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
+out:
+ nfs_free_fattr(fsinfo.fattr);
+ dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
+ return ret;
+}
+
+/*
+ * get an NFS4 root dentry from the root filehandle
+ */
+struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
+ const char *devname)
+{
+ struct nfs_server *server = NFS_SB(sb);
+ struct nfs_fattr *fattr = NULL;
+ struct dentry *ret;
+ struct inode *inode;
+ void *name = kstrdup(devname, GFP_KERNEL);
+ int error;
+
+ dprintk("--> nfs4_get_root()\n");
+
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+
+ /* get the info about the server and filesystem */
+ error = nfs4_server_capabilities(server, mntfh);
+ if (error < 0) {
+ dprintk("nfs_get_root: getcaps error = %d\n",
+ -error);
+ kfree(name);
+ return ERR_PTR(error);
+ }
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL) {
+ kfree(name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* get the actual root for this mount */
+ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
+ if (error < 0) {
+ dprintk("nfs_get_root: getattr error = %d\n", -error);
+ ret = ERR_PTR(error);
+ goto out;
+ }
+
+ if (fattr->valid & NFS_ATTR_FATTR_FSID &&
+ !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
+
+ inode = nfs_fhget(sb, mntfh, fattr);
+ if (IS_ERR(inode)) {
+ dprintk("nfs_get_root: get root inode failed\n");
+ ret = ERR_CAST(inode);
+ goto out;
+ }
+
+ error = nfs_superblock_set_dummy_root(sb, inode);
+ if (error != 0) {
+ ret = ERR_PTR(error);
+ goto out;
+ }
+
+ /* root dentries normally start off anonymous and get spliced in later
+ * if the dentry tree reaches them; however if the dentry already
+ * exists, we'll pick it up at this point and use it as the root
+ */
+ ret = d_obtain_alias(inode);
+ if (IS_ERR(ret)) {
+ dprintk("nfs_get_root: get root dentry failed\n");
+ goto out;
+ }
+
+ security_d_instantiate(ret, inode);
+ spin_lock(&ret->d_lock);
+ if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+ ret->d_fsdata = name;
+ name = NULL;
+ }
+ spin_unlock(&ret->d_lock);
+out:
+ if (name)
+ kfree(name);
+ nfs_free_fattr(fattr);
+ dprintk("<-- nfs4_get_root()\n");
+ return ret;
+}
+
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
new file mode 100644
index 00000000000..24228135c1a
--- /dev/null
+++ b/fs/nfs/idmap.c
@@ -0,0 +1,847 @@
+/*
+ * fs/nfs/idmap.c
+ *
+ * UID and GID to name mapping for clients.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
+#include <linux/cred.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/rcupdate.h>
+#include <linux/err.h>
+#include <keys/user-type.h>
+
+/* include files needed by legacy idmapper */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
+
+#define NFS_UINT_MAXLEN 11
+#define IDMAP_HASH_SZ 128
+
+/* Default cache timeout is 10 minutes */
+unsigned int nfs_idmap_cache_timeout = 600 * HZ;
+const struct cred *id_resolver_cache;
+
+
+/**
+ * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: owner name string cache
+ * @group_name: group name string cache
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name)
+{
+ fattr->owner_name = owner_name;
+ fattr->group_name = group_name;
+}
+
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+ kfree(fattr->owner_name->data);
+}
+
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+ fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+ kfree(fattr->group_name->data);
+}
+
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *owner = fattr->owner_name;
+ __u32 uid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
+ return false;
+ if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
+ fattr->uid = uid;
+ fattr->valid |= NFS_ATTR_FATTR_OWNER;
+ }
+ return true;
+}
+
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ struct nfs4_string *group = fattr->group_name;
+ __u32 gid;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
+ return false;
+ if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
+ fattr->gid = gid;
+ fattr->valid |= NFS_ATTR_FATTR_GROUP;
+ }
+ return true;
+}
+
+/**
+ * nfs_fattr_free_names - free up the NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
+ nfs_fattr_free_owner_name(fattr);
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
+ nfs_fattr_free_group_name(fattr);
+}
+
+/**
+ * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * This helper maps the cached NFSv4 owner/group strings in fattr into
+ * their numeric uid/gid equivalents, and then frees the cached strings.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+ if (nfs_fattr_map_owner_name(server, fattr))
+ nfs_fattr_free_owner_name(fattr);
+ if (nfs_fattr_map_group_name(server, fattr))
+ nfs_fattr_free_group_name(fattr);
+}
+
+/*
+ * Try to interpret an owner/group string as a bare numeric id.
+ *
+ * Strings containing '@' (the name@domain form) or too long for the local
+ * buffer are not treated as numeric.  Returns 1 and stores the id in *res
+ * on success, or 0 if the string is not a usable numeric id.
+ */
+static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+	unsigned long val;
+	char buf[16];
+
+	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+		return 0;
+	/* name is length-delimited; make a NUL-terminated copy for parsing */
+	memcpy(buf, name, namelen);
+	buf[namelen] = '\0';
+	if (strict_strtoul(buf, 0, &val) != 0)
+		return 0;
+	/*
+	 * unsigned long can be wider than __u32: refuse values that would be
+	 * truncated instead of silently mapping them onto a different id.
+	 */
+	if (val != (__u32)val)
+		return 0;
+	*res = val;
+	return 1;
+}
+
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+ return snprintf(buf, buflen, "%u", id);
+}
+
+struct key_type key_type_id_resolver = {
+ .name = "id_resolver",
+ .instantiate = user_instantiate,
+ .match = user_match,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = user_describe,
+ .read = user_read,
+};
+
+int nfs_idmap_init(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret = 0;
+
+ printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name);
+
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+ if (ret < 0)
+ goto failed_put_key;
+
+ ret = register_key_type(&key_type_id_resolver);
+ if (ret < 0)
+ goto failed_put_key;
+
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ id_resolver_cache = cred;
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+void nfs_idmap_quit(void)
+{
+ key_revoke(id_resolver_cache->thread_keyring);
+ unregister_key_type(&key_type_id_resolver);
+ put_cred(id_resolver_cache);
+}
+
+/*
+ * Assemble the "<type>:<name>" description to pass to request_key().
+ * This function allocates a new NUL-terminated string and updates *desc
+ * to point at it. The caller is responsible for freeing *desc.
+ *
+ * On allocation failure -ENOMEM is returned. Otherwise, the size of the
+ * allocated buffer (string length plus the terminating NUL) is returned.
+ */
+static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
+ const char *type, size_t typelen, char **desc)
+{
+ char *cp;
+ size_t desclen = typelen + namelen + 2;
+
+ *desc = kmalloc(desclen, GFP_KERNEL);
+ if (!*desc)
+ return -ENOMEM;
+
+ cp = *desc;
+ memcpy(cp, type, typelen);
+ cp += typelen;
+ *cp++ = ':';
+
+ memcpy(cp, name, namelen);
+ cp += namelen;
+ *cp = '\0';
+ return desclen;
+}
+
+/*
+ * Resolve "<type>:<name>" through the id_resolver key type and copy the
+ * resulting key payload into @data (at most @data_size bytes).
+ *
+ * Returns the payload length on success or a negative errno on failure;
+ * a missing, empty or oversized payload is reported as -EINVAL.
+ */
+static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
+		const char *type, void *data, size_t data_size)
+{
+	const struct cred *saved_cred;
+	struct key *rkey;
+	char *desc;
+	struct user_key_payload *payload;
+	ssize_t ret;
+
+	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
+	if (ret <= 0)
+		goto out;
+
+	/* perform the lookup with the id_resolver_cache credentials */
+	saved_cred = override_creds(id_resolver_cache);
+	rkey = request_key(&key_type_id_resolver, desc, "");
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	rcu_read_lock();
+	rkey->perm |= KEY_USR_VIEW|KEY_USR_WRITE;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto out_up;
+
+	payload = rcu_dereference(rkey->payload.data);
+	if (IS_ERR_OR_NULL(payload)) {
+		/* PTR_ERR(NULL) is 0, which callers would take for success */
+		ret = payload ? PTR_ERR(payload) : -EINVAL;
+		goto out_up;
+	}
+
+	ret = payload->datalen;
+	if (ret > 0 && ret <= data_size)
+		memcpy(data, payload->data, ret);
+	else
+		ret = -EINVAL;
+
+out_up:
+	rcu_read_unlock();
+	key_put(rkey);
+out:
+	return ret;
+}
+
+
+/*
+ * ID -> Name
+ *
+ * Format @id as a decimal string and resolve it through the keyring.
+ * Returns the number of bytes stored in @buf, or -EINVAL on any failure.
+ */
+static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen)
+{
+	char id_str[NFS_UINT_MAXLEN];
+	ssize_t len;
+	int id_len = snprintf(id_str, sizeof(id_str), "%u", id);
+
+	len = nfs_idmap_request_key(id_str, id_len, type, buf, buflen);
+	/* every lookup error is collapsed into -EINVAL for the caller */
+	return len < 0 ? -EINVAL : len;
+}
+
+/*
+ * Name -> ID
+ *
+ * Resolve @name through the keyring and parse the returned payload as a
+ * decimal id.  On success 0 is returned and *id is set; on failure a
+ * negative errno is returned and *id is left untouched.
+ */
+static int nfs_idmap_lookup_id(const char *name, size_t namelen,
+		const char *type, __u32 *id)
+{
+	/* one spare byte so a full-length payload can still be terminated */
+	char id_str[NFS_UINT_MAXLEN + 1];
+	long id_long;
+	ssize_t data_size;
+	int ret;
+
+	data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN);
+	if (data_size <= 0)
+		return -EINVAL;
+
+	/* the key payload is raw bytes and need not carry its own NUL */
+	id_str[data_size] = '\0';
+	ret = strict_strtol(id_str, 10, &id_long);
+	/* id_long is only meaningful when parsing succeeded */
+	if (ret == 0)
+		*id = (__u32)id_long;
+	return ret;
+}
+
+/* idmap classic begins here */
+/*
+ * Module parameter setter: parse a timeout given in seconds and store it,
+ * converted to jiffies, in the int that kp->arg points at.
+ *
+ * Returns 0 on success, or -EINVAL for empty input, trailing characters,
+ * negative values, or values whose jiffies equivalent does not fit an int.
+ */
+static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	/* keep the full long so oversized input is rejected, not truncated */
+	long num = simple_strtol(val, &endp, 0);
+
+	if (endp == val || *endp || num < 0)
+		return -EINVAL;
+	/* range-check before multiplying: signed overflow is undefined */
+	if (num > INT_MAX / HZ)
+		return -EINVAL;
+	*((int *)kp->arg) = num * HZ;
+	return 0;
+}
+
+module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
+ &nfs_idmap_cache_timeout, 0644);
+
+struct idmap_hashent {
+ unsigned long ih_expires;
+ __u32 ih_id;
+ size_t ih_namelen;
+ const char *ih_name;
+};
+
+struct idmap_hashtable {
+ __u8 h_type;
+ struct idmap_hashent *h_entries;
+};
+
+struct idmap {
+ struct dentry *idmap_dentry;
+ wait_queue_head_t idmap_wq;
+ struct idmap_msg idmap_im;
+ struct mutex idmap_lock; /* Serializes upcalls */
+ struct mutex idmap_im_lock; /* Protects the hashtable */
+ struct idmap_hashtable idmap_user_hash;
+ struct idmap_hashtable idmap_group_hash;
+};
+
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+ size_t);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+
+static unsigned int fnvhash32(const void *, size_t);
+
+static const struct rpc_pipe_ops idmap_upcall_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = idmap_pipe_downcall,
+ .destroy_msg = idmap_pipe_destroy_msg,
+};
+
+int
+nfs_idmap_new(struct nfs_client *clp)
+{
+ struct idmap *idmap;
+ int error;
+
+ BUG_ON(clp->cl_idmap != NULL);
+
+ idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
+ if (idmap == NULL)
+ return -ENOMEM;
+
+ idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
+ "idmap", idmap, &idmap_upcall_ops, 0);
+ if (IS_ERR(idmap->idmap_dentry)) {
+ error = PTR_ERR(idmap->idmap_dentry);
+ kfree(idmap);
+ return error;
+ }
+
+ mutex_init(&idmap->idmap_lock);
+ mutex_init(&idmap->idmap_im_lock);
+ init_waitqueue_head(&idmap->idmap_wq);
+ idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
+ idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
+
+ clp->cl_idmap = idmap;
+ return 0;
+}
+
+static void
+idmap_alloc_hashtable(struct idmap_hashtable *h)
+{
+ if (h->h_entries != NULL)
+ return;
+ h->h_entries = kcalloc(IDMAP_HASH_SZ,
+ sizeof(*h->h_entries),
+ GFP_KERNEL);
+}
+
+/* Free every cached name held in @h, then the entry array itself. */
+static void
+idmap_free_hashtable(struct idmap_hashtable *h)
+{
+	struct idmap_hashent *he, *end;
+
+	/* the table is allocated lazily, so there may be nothing to free */
+	if (h->h_entries == NULL)
+		return;
+	end = h->h_entries + IDMAP_HASH_SZ;
+	for (he = h->h_entries; he < end; he++)
+		kfree(he->ih_name);
+	kfree(h->h_entries);
+}
+
+void
+nfs_idmap_delete(struct nfs_client *clp)
+{
+ struct idmap *idmap = clp->cl_idmap;
+
+ if (!idmap)
+ return;
+ rpc_unlink(idmap->idmap_dentry);
+ clp->cl_idmap = NULL;
+ idmap_free_hashtable(&idmap->idmap_user_hash);
+ idmap_free_hashtable(&idmap->idmap_group_hash);
+ kfree(idmap);
+}
+
+/*
+ * Helper routines for manipulating the hashtable
+ */
+static inline struct idmap_hashent *
+idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len)
+{
+ if (h->h_entries == NULL)
+ return NULL;
+ return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ];
+}
+
+static struct idmap_hashent *
+idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
+{
+ struct idmap_hashent *he = idmap_name_hash(h, name, len);
+
+ if (he == NULL)
+ return NULL;
+ if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0)
+ return NULL;
+ if (time_after(jiffies, he->ih_expires))
+ return NULL;
+ return he;
+}
+
+static inline struct idmap_hashent *
+idmap_id_hash(struct idmap_hashtable* h, __u32 id)
+{
+ if (h->h_entries == NULL)
+ return NULL;
+ return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ];
+}
+
+static struct idmap_hashent *
+idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
+{
+ struct idmap_hashent *he = idmap_id_hash(h, id);
+
+ if (he == NULL)
+ return NULL;
+ if (he->ih_id != id || he->ih_namelen == 0)
+ return NULL;
+ if (time_after(jiffies, he->ih_expires))
+ return NULL;
+ return he;
+}
+
+/*
+ * Routines for allocating new entries in the hashtable.
+ * For now, we just have 1 entry per bucket, so it's all
+ * pretty trivial.
+ */
+static inline struct idmap_hashent *
+idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
+{
+ idmap_alloc_hashtable(h);
+ return idmap_name_hash(h, name, len);
+}
+
+static inline struct idmap_hashent *
+idmap_alloc_id(struct idmap_hashtable *h, __u32 id)
+{
+ idmap_alloc_hashtable(h);
+ return idmap_id_hash(h, id);
+}
+
+static void
+idmap_update_entry(struct idmap_hashent *he, const char *name,
+ size_t namelen, __u32 id)
+{
+ char *str = kmalloc(namelen + 1, GFP_KERNEL);
+ if (str == NULL)
+ return;
+ kfree(he->ih_name);
+ he->ih_id = id;
+ memcpy(str, name, namelen);
+ str[namelen] = '\0';
+ he->ih_name = str;
+ he->ih_namelen = namelen;
+ he->ih_expires = jiffies + nfs_idmap_cache_timeout;
+}
+
+/*
+ * Name -> ID
+ */
+static int
+nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
+ const char *name, size_t namelen, __u32 *id)
+{
+ struct rpc_pipe_msg msg;
+ struct idmap_msg *im;
+ struct idmap_hashent *he;
+ DECLARE_WAITQUEUE(wq, current);
+ int ret = -EIO;
+
+ im = &idmap->idmap_im;
+
+ /*
+ * String sanity checks
+ * Note that the userland daemon expects NUL terminated strings
+ */
+ for (;;) {
+ if (namelen == 0)
+ return -EINVAL;
+ if (name[namelen-1] != '\0')
+ break;
+ namelen--;
+ }
+ if (namelen >= IDMAP_NAMESZ)
+ return -EINVAL;
+
+ mutex_lock(&idmap->idmap_lock);
+ mutex_lock(&idmap->idmap_im_lock);
+
+ he = idmap_lookup_name(h, name, namelen);
+ if (he != NULL) {
+ *id = he->ih_id;
+ ret = 0;
+ goto out;
+ }
+
+ memset(im, 0, sizeof(*im));
+ memcpy(im->im_name, name, namelen);
+
+ im->im_type = h->h_type;
+ im->im_conv = IDMAP_CONV_NAMETOID;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.data = im;
+ msg.len = sizeof(*im);
+
+ add_wait_queue(&idmap->idmap_wq, &wq);
+ if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
+ remove_wait_queue(&idmap->idmap_wq, &wq);
+ goto out;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&idmap->idmap_im_lock);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&idmap->idmap_wq, &wq);
+ mutex_lock(&idmap->idmap_im_lock);
+
+ if (im->im_status & IDMAP_STATUS_SUCCESS) {
+ *id = im->im_id;
+ ret = 0;
+ }
+
+ out:
+ memset(im, 0, sizeof(*im));
+ mutex_unlock(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return ret;
+}
+
+/*
+ * ID -> Name
+ */
+static int
+nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
+ __u32 id, char *name)
+{
+ struct rpc_pipe_msg msg;
+ struct idmap_msg *im;
+ struct idmap_hashent *he;
+ DECLARE_WAITQUEUE(wq, current);
+ int ret = -EIO;
+ unsigned int len;
+
+ im = &idmap->idmap_im;
+
+ mutex_lock(&idmap->idmap_lock);
+ mutex_lock(&idmap->idmap_im_lock);
+
+ he = idmap_lookup_id(h, id);
+ if (he) {
+ memcpy(name, he->ih_name, he->ih_namelen);
+ ret = he->ih_namelen;
+ goto out;
+ }
+
+ memset(im, 0, sizeof(*im));
+ im->im_type = h->h_type;
+ im->im_conv = IDMAP_CONV_IDTONAME;
+ im->im_id = id;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.data = im;
+ msg.len = sizeof(*im);
+
+ add_wait_queue(&idmap->idmap_wq, &wq);
+
+ if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) {
+ remove_wait_queue(&idmap->idmap_wq, &wq);
+ goto out;
+ }
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&idmap->idmap_im_lock);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&idmap->idmap_wq, &wq);
+ mutex_lock(&idmap->idmap_im_lock);
+
+ if (im->im_status & IDMAP_STATUS_SUCCESS) {
+ if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0)
+ goto out;
+ memcpy(name, im->im_name, len);
+ ret = len;
+ }
+
+ out:
+ memset(im, 0, sizeof(*im));
+ mutex_unlock(&idmap->idmap_im_lock);
+ mutex_unlock(&idmap->idmap_lock);
+ return ret;
+}
+
+static ssize_t
+idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+ struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+ struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_msg im_in, *im = &idmap->idmap_im;
+ struct idmap_hashtable *h;
+ struct idmap_hashent *he = NULL;
+ size_t namelen_in;
+ int ret;
+
+ if (mlen != sizeof(im_in))
+ return -ENOSPC;
+
+ if (copy_from_user(&im_in, src, mlen) != 0)
+ return -EFAULT;
+
+ mutex_lock(&idmap->idmap_im_lock);
+
+ ret = mlen;
+ im->im_status = im_in.im_status;
+ /* If we got an error, terminate now, and wake up pending upcalls */
+ if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) {
+ wake_up(&idmap->idmap_wq);
+ goto out;
+ }
+
+ /* Sanity checking of strings */
+ ret = -EINVAL;
+ namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ);
+ if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ)
+ goto out;
+
+ switch (im_in.im_type) {
+ case IDMAP_TYPE_USER:
+ h = &idmap->idmap_user_hash;
+ break;
+ case IDMAP_TYPE_GROUP:
+ h = &idmap->idmap_group_hash;
+ break;
+ default:
+ goto out;
+ }
+
+ switch (im_in.im_conv) {
+ case IDMAP_CONV_IDTONAME:
+ /* Did we match the current upcall? */
+ if (im->im_conv == IDMAP_CONV_IDTONAME
+ && im->im_type == im_in.im_type
+ && im->im_id == im_in.im_id) {
+ /* Yes: copy string, including the terminating '\0' */
+ memcpy(im->im_name, im_in.im_name, namelen_in);
+ im->im_name[namelen_in] = '\0';
+ wake_up(&idmap->idmap_wq);
+ }
+ he = idmap_alloc_id(h, im_in.im_id);
+ break;
+ case IDMAP_CONV_NAMETOID:
+ /* Did we match the current upcall? */
+ if (im->im_conv == IDMAP_CONV_NAMETOID
+ && im->im_type == im_in.im_type
+ && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in
+ && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) {
+ im->im_id = im_in.im_id;
+ wake_up(&idmap->idmap_wq);
+ }
+ he = idmap_alloc_name(h, im_in.im_name, namelen_in);
+ break;
+ default:
+ goto out;
+ }
+
+ /* If the entry is valid, also copy it to the cache */
+ if (he != NULL)
+ idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id);
+ ret = mlen;
+out:
+ mutex_unlock(&idmap->idmap_im_lock);
+ return ret;
+}
+
+static void
+idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct idmap_msg *im = msg->data;
+ struct idmap *idmap = container_of(im, struct idmap, idmap_im);
+
+ if (msg->errno >= 0)
+ return;
+ mutex_lock(&idmap->idmap_im_lock);
+ im->im_status = IDMAP_STATUS_LOOKUPFAIL;
+ wake_up(&idmap->idmap_wq);
+ mutex_unlock(&idmap->idmap_im_lock);
+}
+
+/*
+ * Fowler/Noll/Vo hash
+ * http://www.isthe.com/chongo/tech/comp/fnv/
+ */
+
+#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */
+#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */
+
+/*
+ * Hash @buflen bytes starting at @buf: for each byte, multiply the running
+ * hash by FNV_P_32 and then XOR the byte in.  The seed is FNV_1_32.
+ */
+static unsigned int fnvhash32(const void *buf, size_t buflen)
+{
+	const unsigned char *bytes = buf;
+	unsigned int hash = FNV_1_32;
+	size_t i;
+
+	for (i = 0; i < buflen; i++)
+		hash = (hash * FNV_P_32) ^ (unsigned int)bytes[i];
+
+	return hash;
+}
+
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
+
+ if (nfs_map_string_to_numeric(name, namelen, uid))
+ return 0;
+ ret = nfs_idmap_lookup_id(name, namelen, "uid", uid);
+ if (ret < 0)
+ ret = nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
+ return ret;
+}
+
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
+
+ if (nfs_map_string_to_numeric(name, namelen, gid))
+ return 0;
+ ret = nfs_idmap_lookup_id(name, namelen, "gid", gid);
+ if (ret < 0)
+ ret = nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, gid);
+ return ret;
+}
+
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
+
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) {
+ ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
+ if (ret < 0)
+ ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+ }
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(uid, buf, buflen);
+ return ret;
+}
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
+{
+ struct idmap *idmap = server->nfs_client->cl_idmap;
+ int ret = -EINVAL;
+
+ if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) {
+ ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
+ if (ret < 0)
+ ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, gid, buf);
+ }
+ if (ret < 0)
+ ret = nfs_map_numeric_to_string(gid, buf, buflen);
+ return ret;
+}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
new file mode 100644
index 00000000000..f649fba8c38
--- /dev/null
+++ b/fs/nfs/inode.c
@@ -0,0 +1,1660 @@
+/*
+ * linux/fs/nfs/inode.c
+ *
+ * Copyright (C) 1992 Rick Sladkey
+ *
+ * nfs inode and superblock handling functions
+ *
+ * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
+ * experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ * J.S.Peatfield@damtp.cam.ac.uk
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/freezer.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+#include "fscache.h"
+#include "dns_resolve.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
+
+/*
+ * Default is to see 64-bit inode numbers. When this is false,
+ * nfs_compat_user_ino64() folds the 64-bit fileid down to the width of
+ * (compat) unsigned long before it is reported to user space.
+ */
+static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+
+static void nfs_invalidate_inode(struct inode *); /* forward declaration */
+static int nfs_update_inode(struct inode *, struct nfs_fattr *); /* forward declaration */
+
+static struct kmem_cache * nfs_inode_cachep; /* NOTE(review): presumably the slab cache for struct nfs_inode - confirm at its users */
+
+/*
+ * Derive the inode-hash value for a set of attributes by passing the
+ * fileid they carry through nfs_fileid_to_ino_t().
+ */
+static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
+{
+	return nfs_fileid_to_ino_t(fattr->fileid);
+}
+
+/**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock (not examined here)
+ *
+ * Returns -ERESTARTSYS without sleeping if a fatal signal is pending for
+ * the current task; otherwise calls freezable_schedule() and returns 0.
+ */
+int nfs_wait_bit_killable(void *word)
+{
+	if (!fatal_signal_pending(current)) {
+		freezable_schedule();
+		return 0;
+	}
+	return -ERESTARTSYS;
+}
+
+/**
+ * nfs_compat_user_ino64 - returns the user-visible inode number
+ * @fileid: 64-bit fileid
+ *
+ * When enable_ino64 is set the fileid is returned unchanged.  Otherwise it
+ * is truncated to the width of (compat) unsigned long and, if that is
+ * narrower than 64 bits, the discarded high bits are XORed back in.
+ */
+u64 nfs_compat_user_ino64(u64 fileid)
+{
+#ifdef CONFIG_COMPAT
+	compat_ulong_t ino;
+#else
+	unsigned long ino;
+#endif
+
+	if (enable_ino64)
+		return fileid;
+
+	ino = fileid;
+	if (sizeof(fileid) > sizeof(ino)) {
+		/* number of bits lost by the truncation above */
+		const unsigned int lost_bits = (sizeof(fileid) - sizeof(ino)) * 8;
+
+		ino ^= fileid >> lost_bits;
+	}
+	return ino;
+}
+
+/*
+ * Release the NFS-private caches attached to an inode: the ACL cache, the
+ * access cache and the fscache cookie.  Having writebacks outstanding or
+ * open contexts still listed at this point is treated as a bug.
+ */
+static void nfs_clear_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	/* Neither of these should ever trigger... */
+	BUG_ON(nfs_have_writebacks(inode));
+	BUG_ON(!list_empty(&nfsi->open_files));
+
+	nfs_zap_acl_cache(inode);
+	nfs_access_zap_cache(inode);
+	nfs_fscache_release_inode_cookie(inode);
+}
+
+/*
+ * Evict an inode: truncate every page in its mapping, call
+ * end_writeback(), then drop the NFS-private caches via nfs_clear_inode().
+ */
+void nfs_evict_inode(struct inode *inode)
+{
+	struct address_space *mapping = &inode->i_data;
+
+	truncate_inode_pages(mapping, 0);
+	end_writeback(inode);
+	nfs_clear_inode(inode);
+}
+
+/**
+ * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
+ * @mapping: address space to flush
+ *
+ * Does nothing and returns 0 when the mapping holds no pages.  Otherwise
+ * all user mappings are unmapped and the result of nfs_wb_all() on the
+ * host inode is returned.
+ */
+int nfs_sync_mapping(struct address_space *mapping)
+{
+	if (mapping->nrpages == 0)
+		return 0;
+
+	unmap_mapping_range(mapping, 0, 0, 0);
+	return nfs_wb_all(mapping->host);
+}
+
+/*
+ * Invalidate the local caches
+ *
+ * Sets attrtimeo to NFS_MINATTRTIMEO(inode), restarts its timestamp, clears
+ * the readdir cookie verifier and flags the attribute, access and ACL
+ * caches as invalid (plus NFS_INO_REVAL_PAGECACHE).  Regular files,
+ * directories and symlinks also get NFS_INO_INVALID_DATA.
+ * nfs_zap_caches() wraps this in inode->i_lock.
+ */
+static void nfs_zap_caches_locked(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int mode = inode->i_mode;
+	unsigned long invalid = NFS_INO_INVALID_ATTR
+			| NFS_INO_INVALID_ACCESS
+			| NFS_INO_INVALID_ACL
+			| NFS_INO_REVAL_PAGECACHE;
+
+	nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+
+	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+	nfsi->attrtimeo_timestamp = jiffies;
+
+	/*
+	 * Size the memset from the array member itself, exactly as
+	 * nfs_fhget() and nfs_invalidate_mapping() do.  sizeof() applied to
+	 * the result of NFS_COOKIEVERF() measures whatever that accessor
+	 * evaluates to; if that is a pointer, only pointer-size bytes of the
+	 * verifier would be cleared.
+	 */
+	memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
+	/* Only these file types have cached data worth invalidating */
+	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+		invalid |= NFS_INO_INVALID_DATA;
+	nfsi->cache_validity |= invalid;
+}
+
+/*
+ * Locked wrapper: take inode->i_lock around nfs_zap_caches_locked().
+ */
+void nfs_zap_caches(struct inode *inode)
+{
+	spinlock_t *lock = &inode->i_lock;
+
+	spin_lock(lock);
+	nfs_zap_caches_locked(inode);
+	spin_unlock(lock);
+}
+
+/*
+ * Flag the inode's cached data as invalid, but only if @mapping currently
+ * holds pages.  The flag is set under inode->i_lock.
+ */
+void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
+{
+	if (mapping->nrpages == 0)
+		return;
+
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Drop any cached ACLs through the protocol's optional clear_acl_cache
+ * hook, then clear NFS_INO_INVALID_ACL under inode->i_lock.
+ */
+void nfs_zap_acl_cache(struct inode *inode)
+{
+	void (*zap)(struct inode *) = NFS_PROTO(inode)->clear_acl_cache;
+
+	if (zap != NULL)
+		zap(inode);
+
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Flag the cached atime as invalid (NFS_INO_INVALID_ATIME), under
+ * inode->i_lock.
+ */
+void nfs_invalidate_atime(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Invalidate, but do not unhash, the inode: set NFS_INO_STALE and zap the
+ * local caches.
+ * NB: must be called with inode->i_lock held!
+ */
+static void nfs_invalidate_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	set_bit(NFS_INO_STALE, &nfsi->flags);
+	nfs_zap_caches_locked(inode);
+}
+
+struct nfs_find_desc { /* opaque argument handed to nfs_find_actor() and nfs_init_locked() via iget5_locked() */
+ struct nfs_fh *fh; /* file handle to match against, or to copy into a new inode */
+ struct nfs_fattr *fattr; /* attributes; only ->fileid is read by the two callbacks */
+};
+
+/*
+ * In NFSv3 we can have 64bit inode numbers. In order to support
+ * this, and re-exported directories (also seen in NFSv2)
+ * we are forced to allow 2 different inodes to have the same
+ * i_ino.
+ *
+ * iget5_locked() match callback: returns 1 only when the candidate inode
+ * has the wanted fileid and file handle and is neither bad nor stale,
+ * 0 otherwise.
+ */
+static int
+nfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct nfs_find_desc *wanted = opaque;
+
+	return NFS_FILEID(inode) == wanted->fattr->fileid &&
+	       !nfs_compare_fh(NFS_FH(inode), wanted->fh) &&
+	       !is_bad_inode(inode) &&
+	       !NFS_STALE(inode);
+}
+
+/*
+ * iget5_locked() set callback: stamp a freshly allocated inode with the
+ * fileid and file handle from the lookup descriptor.  Always returns 0.
+ */
+static int
+nfs_init_locked(struct inode *inode, void *opaque)
+{
+	struct nfs_find_desc *wanted = opaque;
+
+	set_nfs_fileid(inode, wanted->fattr->fileid);
+	nfs_copy_fh(NFS_FH(inode), wanted->fh);
+	return 0;
+}
+
+/*
+ * This is our front-end to iget that looks up inodes by file handle
+ * instead of inode number.
+ *
+ * The inode cache is searched with iget5_locked(), hashed on the value
+ * from nfs_fattr_to_ino_t() and matched by nfs_find_actor(). A newly
+ * created (I_NEW) inode is populated from fattr and then unlocked; an
+ * already cached inode is refreshed with nfs_refresh_inode().
+ *
+ * Returns the inode, ERR_PTR(-ENOENT) when fattr carries no usable fileid
+ * or no file type, or ERR_PTR(-ENOMEM) when iget5_locked() returns NULL.
+ */
+struct inode *
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+ struct nfs_find_desc desc = {
+ .fh = fh,
+ .fattr = fattr
+ };
+ struct inode *inode = ERR_PTR(-ENOENT);
+ unsigned long hash;
+
+ nfs_attr_check_mountpoint(sb, fattr);
+
+ if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
+ !nfs_attr_use_mounted_on_fileid(fattr))
+ goto out_no_inode;
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
+ goto out_no_inode;
+
+ hash = nfs_fattr_to_ino_t(fattr);
+
+ inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc);
+ if (inode == NULL) {
+ inode = ERR_PTR(-ENOMEM);
+ goto out_no_inode;
+ }
+
+ if (inode->i_state & I_NEW) {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long now = jiffies;
+
+ /* We set i_ino for the few things that still rely on it,
+ * such as stat(2) */
+ inode->i_ino = hash;
+
+ /* We can't support update_atime(), since the server will reset it */
+ inode->i_flags |= S_NOATIME|S_NOCMTIME;
+ inode->i_mode = fattr->mode;
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+ && nfs_server_capable(inode, NFS_CAP_MODE))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ /* Why so? Because we want revalidate for devices/FIFOs, and
+ * that's precisely what we have in nfs_file_inode_operations.
+ */
+ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
+ inode->i_data.a_ops = &nfs_file_aops;
+ inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
+ inode->i_fop = &nfs_dir_operations;
+ inode->i_data.a_ops = &nfs_dir_aops;
+ if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
+ set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+ /* Deal with crossing mountpoints */
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
+ fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+ inode->i_op = &nfs_referral_inode_operations;
+ else
+ inode->i_op = &nfs_mountpoint_inode_operations;
+ inode->i_fop = NULL;
+ inode->i_flags |= S_AUTOMOUNT;
+ }
+ } else if (S_ISLNK(inode->i_mode))
+ inode->i_op = &nfs_symlink_inode_operations;
+ else
+ init_special_inode(inode, inode->i_mode, fattr->rdev);
+
+ memset(&inode->i_atime, 0, sizeof(inode->i_atime)); /* defaults from here to the cookieverf memset; overwritten below by whatever fattr carries */
+ memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+ inode->i_version = 0;
+ inode->i_size = 0;
+ clear_nlink(inode);
+ inode->i_uid = -2;
+ inode->i_gid = -2;
+ inode->i_blocks = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
+ nfsi->read_cache_jiffies = fattr->time_start;
+ nfsi->attr_gencount = fattr->gencount;
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME) /* pattern below: copy the attribute if present, else mark the cache invalid when the server is capable of supplying it */
+ inode->i_atime = fattr->atime;
+ else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ inode->i_version = fattr->change_attr;
+ else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ inode->i_size = nfs_size_to_loff_t(fattr->size);
+ else /* unlike the others, a missing size invalidates unconditionally */
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE;
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+ set_nlink(inode, fattr->nlink);
+ else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+ inode->i_uid = fattr->uid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+ inode->i_gid = fattr->gid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+ /*
+ * report the blocks in 512byte units
+ */
+ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+ }
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = now;
+ nfsi->access_cache = RB_ROOT;
+
+ nfs_fscache_init_inode_cookie(inode);
+
+ unlock_new_inode(inode);
+ } else
+ nfs_refresh_inode(inode, fattr);
+ dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n",
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ atomic_read(&inode->i_count));
+
+out:
+ return inode;
+
+out_no_inode:
+ dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode));
+ goto out; /* inode holds the ERR_PTR value at this point */
+}
+
+/* The only iattr bits nfs_setattr() keeps; every other bit is masked off */
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
+
+/*
+ * Change attributes of the inode behind @dentry.
+ *
+ * @attr->ia_valid is trimmed in place: ATTR_MODE is dropped when the
+ * request carries ATTR_KILL_SUID/ATTR_KILL_SGID, ATTR_SIZE is dropped for
+ * non-regular files or when the size is unchanged, and everything outside
+ * NFS_VALID_ATTRS is masked.  If nothing but ATTR_FILE is left, 0 is
+ * returned without talking to the server.
+ *
+ * Returns -ENOMEM if no nfs_fattr can be allocated, else the result of
+ * the protocol's ->setattr().
+ */
+int
+nfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct nfs_fattr *fattr;
+	int error;
+
+	nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
+
+	/* skip mode change if it's just for clearing setuid/setgid */
+	if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+		attr->ia_valid &= ~ATTR_MODE;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)))
+		attr->ia_valid &= ~ATTR_SIZE;
+
+	/* Optimization: if the end result is no change, don't RPC */
+	attr->ia_valid &= NFS_VALID_ATTRS;
+	if (!(attr->ia_valid & ~ATTR_FILE))
+		return 0;
+
+	/* Write all dirty data */
+	if (S_ISREG(inode->i_mode))
+		nfs_wb_all(inode);
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Return any delegations if we're going to change ACLs
+	 */
+	if (attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+		nfs_inode_return_delegation(inode);
+
+	error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
+	if (!error)
+		nfs_refresh_inode(inode, fattr);
+	nfs_free_fattr(fattr);
+	return error;
+}
+
+/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ *
+ * Returns 0, or the error from inode_newsize_ok(), in which case nothing
+ * is changed.
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+	loff_t prev_size;
+	int err = inode_newsize_ok(inode, offset);
+
+	if (err)
+		return err;
+
+	spin_lock(&inode->i_lock);
+	prev_size = inode->i_size;
+	i_size_write(inode, offset);
+	spin_unlock(&inode->i_lock);
+
+	truncate_pagecache(inode, prev_size, offset);
+	return 0;
+}
+
+/**
+ * nfs_setattr_update_inode - Update inode metadata after a setattr call.
+ * @inode: pointer to struct inode
+ * @attr: pointer to struct iattr
+ *
+ * Note: we do this in the *proc.c in order to ensure that
+ * it works for things like exclusive creates too.
+ *
+ * Mode, uid and gid are copied under inode->i_lock and the access and ACL
+ * caches are flagged invalid; a size change is applied with
+ * nfs_vmtruncate(), whose return value is not checked.
+ */
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
+{
+	if (attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
+		spin_lock(&inode->i_lock);
+		/* Replace the permission bits only; keep the file type bits */
+		if (attr->ia_valid & ATTR_MODE)
+			inode->i_mode = (inode->i_mode & ~S_IALLUGO) |
+					(attr->ia_mode & S_IALLUGO);
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
+		nfs_vmtruncate(inode, attr->ia_size);
+	}
+}
+
+/*
+ * Fill @stat for the inode behind @dentry.
+ *
+ * Dirty pages of a regular file are written back first.  The attributes
+ * are then revalidated: unconditionally if the cached atime was flagged
+ * invalid on entry and the mount does not suppress atime, otherwise only
+ * when the attribute cache requires it.  Returns 0 or the first error hit.
+ */
+int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+	int err;
+
+	/* Flush out writes to the server in order to update c/mtime. */
+	if (S_ISREG(inode->i_mode)) {
+		err = filemap_write_and_wait(inode->i_mapping);
+		if (err)
+			return err;
+	}
+
+	/*
+	 * We may force a getattr if the user cares about atime.
+	 *
+	 * Note that we only have to check the vfsmount flags here:
+	 *  - NFS always sets S_NOATIME, so checking it would give a
+	 *    bogus result
+	 *  - NFS never sets MS_NOATIME or MS_NODIRATIME so there is
+	 *    no point in checking those.
+	 */
+	if (mnt->mnt_flags & MNT_NOATIME)
+		need_atime = 0;
+	else if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+		need_atime = 0;
+
+	if (need_atime)
+		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	else
+		err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (err)
+		return err;
+
+	generic_fillattr(inode, stat);
+	stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+	return 0;
+}
+
+/*
+ * Initialise a lock context for the current task: one reference, an empty
+ * list link, and the owner identified by current->files and current->tgid.
+ */
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+	INIT_LIST_HEAD(&l_ctx->list);
+	l_ctx->pid = current->tgid;
+	l_ctx->lockowner = current->files;
+	atomic_set(&l_ctx->count, 1);
+}
+
+/*
+ * Search the extra lock contexts chained off @ctx for one owned by the
+ * current task (same current->files and current->tgid).  On a hit the
+ * entry's reference count is raised and it is returned; NULL otherwise.
+ * nfs_get_lock_context() calls this with inode->i_lock held.
+ */
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+	struct nfs_lock_context *entry;
+
+	list_for_each_entry(entry, &ctx->lock_context.list, list) {
+		if (entry->lockowner == current->files &&
+		    entry->pid == current->tgid) {
+			atomic_inc(&entry->count);
+			return entry;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Find or create the lock context of the current task on @ctx.
+ *
+ * The GFP_KERNEL allocation is made with inode->i_lock dropped, so the
+ * list is searched a second time once the lock is retaken; if another
+ * context turned up meanwhile, the fresh allocation is freed again.
+ * Returns a referenced context, or NULL if the allocation failed.
+ */
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+	struct inode *inode = ctx->dentry->d_inode;
+	struct nfs_lock_context *found, *fresh = NULL;
+
+	spin_lock(&inode->i_lock);
+	found = __nfs_find_lock_context(ctx);
+	if (found != NULL)
+		goto out_unlock;
+	spin_unlock(&inode->i_lock);
+
+	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+	if (fresh == NULL)
+		return NULL;
+	nfs_init_lock_context(fresh);
+
+	spin_lock(&inode->i_lock);
+	found = __nfs_find_lock_context(ctx);
+	if (found == NULL) {
+		list_add_tail(&fresh->list, &ctx->lock_context.list);
+		fresh->open_context = ctx;
+		found = fresh;
+		fresh = NULL;	/* now owned by the list; must not be freed */
+	}
+out_unlock:
+	spin_unlock(&inode->i_lock);
+	kfree(fresh);
+	return found;
+}
+
+/*
+ * Drop one reference on a lock context.  When the count reaches zero the
+ * context is unlinked under inode->i_lock and freed.
+ */
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+	struct inode *inode = l_ctx->open_context->dentry->d_inode;
+
+	/* Returns true, with i_lock held, only on the final put */
+	if (atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) {
+		list_del(&l_ctx->list);
+		spin_unlock(&inode->i_lock);
+		kfree(l_ctx);
+	}
+}
+
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ *
+ * Nothing is done unless the context was opened for write, the close is
+ * synchronous, the inode has no other open contexts listed and the mount
+ * does not have NFS_MOUNT_NOCTO set.
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode;
+	struct nfs_server *server;
+
+	if (!(ctx->mode & FMODE_WRITE) || !is_sync)
+		return;
+
+	inode = ctx->dentry->d_inode;
+	if (!list_empty(&NFS_I(inode)->open_files))
+		return;
+
+	server = NFS_SERVER(inode);
+	if (!(server->flags & NFS_MOUNT_NOCTO))
+		nfs_revalidate_inode(server, inode);
+}
+
+/*
+ * Allocate an open context for @dentry opened with @f_mode.
+ *
+ * The context takes the credential from rpc_lookup_cred(), a dentry
+ * reference via dget() and marks the superblock active.  It starts with
+ * one reference (the count set by nfs_init_lock_context()).
+ * Returns the context, the credential lookup error, or ERR_PTR(-ENOMEM).
+ */
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode)
+{
+	struct nfs_open_context *ctx;
+	struct rpc_cred *cred;
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return ERR_CAST(cred);
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (ctx == NULL) {
+		put_rpccred(cred);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nfs_sb_active(dentry->d_sb);
+	ctx->dentry = dget(dentry);
+	ctx->cred = cred;
+	ctx->mode = f_mode;
+	ctx->state = NULL;
+	ctx->flags = 0;
+	ctx->error = 0;
+	INIT_LIST_HEAD(&ctx->list);
+	nfs_init_lock_context(&ctx->lock_context);
+	ctx->lock_context.open_context = ctx;
+	return ctx;
+}
+
+/*
+ * Take a reference on an open context; the count lives in the embedded
+ * lock_context.  A NULL @ctx is passed straight back.
+ */
+struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
+{
+	if (ctx == NULL)
+		return NULL;
+
+	atomic_inc(&ctx->lock_context.count);
+	return ctx;
+}
+
+/*
+ * Drop one reference on an open context.  On the final put the context
+ * is taken off the inode's list (if it is on one), the protocol's
+ * ->close_context() is invoked with @is_sync, and the credential, dentry,
+ * superblock reference and the context itself are released.
+ */
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+{
+	struct inode *inode = ctx->dentry->d_inode;
+	struct super_block *sb = ctx->dentry->d_sb;
+
+	if (list_empty(&ctx->list)) {
+		if (!atomic_dec_and_test(&ctx->lock_context.count))
+			return;
+	} else {
+		/* Final put of a listed context must unlink under i_lock */
+		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
+			return;
+		list_del(&ctx->list);
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (inode != NULL)
+		NFS_PROTO(inode)->close_context(ctx, is_sync);
+	if (ctx->cred != NULL)
+		put_rpccred(ctx->cred);
+	dput(ctx->dentry);
+	nfs_sb_deactive(sb);
+	kfree(ctx);
+}
+
+/*
+ * Drop one reference on an open context, requesting a non-synchronous
+ * close (is_sync == 0) should this be the final put.
+ */
+void put_nfs_open_context(struct nfs_open_context *ctx)
+{
+	const int is_sync = 0;
+
+	__put_nfs_open_context(ctx, is_sync);
+}
+
+/*
+ * Ensure that mmap has a recent RPC credential for use when writing out
+ * shared pages
+ *
+ * The file takes its own reference on @ctx through filp->private_data,
+ * and @ctx is linked at the head of the inode's open_files list under
+ * inode->i_lock.
+ */
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct list_head *open_files = &NFS_I(inode)->open_files;
+
+	filp->private_data = get_nfs_open_context(ctx);
+
+	spin_lock(&inode->i_lock);
+	list_add(&ctx->list, open_files);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Given an inode, search for an open context with the desired characteristics
+ *
+ * A context matches when its read/write mode bits equal @mode and, if
+ * @cred is non-NULL, its credential is @cred.  The first match is returned
+ * with an extra reference; NULL if there is none.
+ */
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *cur, *match = NULL;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(cur, &nfsi->open_files, list) {
+		if ((cred == NULL || cur->cred == cred) &&
+		    (cur->mode & (FMODE_READ|FMODE_WRITE)) == mode) {
+			match = get_nfs_open_context(cur);
+			break;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	return match;
+}
+
+/*
+ * Detach the open context from @filp and drop the file's reference.
+ * The context is first moved to the tail of the inode's open_files list;
+ * the put is synchronous unless the file was opened with O_DIRECT.
+ */
+static void nfs_file_clear_open_context(struct file *filp)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct nfs_open_context *ctx = nfs_file_open_context(filp);
+
+	if (!ctx)
+		return;
+
+	filp->private_data = NULL;
+
+	spin_lock(&inode->i_lock);
+	list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
+	spin_unlock(&inode->i_lock);
+
+	if (filp->f_flags & O_DIRECT)
+		__put_nfs_open_context(ctx, 0);
+	else
+		__put_nfs_open_context(ctx, 1);
+}
+
+/*
+ * These allocate and release file read/write context information.
+ *
+ * nfs_open() allocates an open context for the file, attaches it to
+ * @filp, drops the allocation's own reference and sets up the fscache
+ * cookie.  Returns 0, or the error from alloc_nfs_open_context().
+ */
+int nfs_open(struct inode *inode, struct file *filp)
+{
+	struct nfs_open_context *ctx =
+		alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	nfs_file_set_open_context(filp, ctx);
+	/* filp now holds its own reference; release ours */
+	put_nfs_open_context(ctx);
+	nfs_fscache_set_inode_cookie(inode, filp);
+	return 0;
+}
+
+/*
+ * Release the open context attached to @filp.  @inode is not used.
+ * Always returns 0.
+ */
+int nfs_release(struct inode *inode, struct file *filp)
+{
+	nfs_file_clear_open_context(filp);
+
+	return 0;
+}
+
+/*
+ * This function is called whenever some part of NFS notices that
+ * the cached attributes have to be refreshed.
+ *
+ * Returns -ESTALE for a bad or stale inode, -ENOMEM if no nfs_fattr can
+ * be allocated, otherwise the status of the protocol's ->getattr() or of
+ * nfs_refresh_inode().  An -ESTALE reply zaps the caches and, except for
+ * directories, marks the inode stale.
+ */
+int
+__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_fattr *fattr = NULL;
+	int status = -ESTALE;
+
+	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
+		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+
+	if (is_bad_inode(inode) || NFS_STALE(inode))
+		goto out;
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
+	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
+	if (status) {
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
+			 inode->i_sb->s_id,
+			 (long long)NFS_FILEID(inode), status);
+		if (status == -ESTALE) {
+			nfs_zap_caches(inode);
+			if (!S_ISDIR(inode->i_mode))
+				set_bit(NFS_INO_STALE, &nfsi->flags);
+		}
+		goto out;
+	}
+
+	status = nfs_refresh_inode(inode, fattr);
+	if (status) {
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
+			 inode->i_sb->s_id,
+			 (long long)NFS_FILEID(inode), status);
+		goto out;
+	}
+
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
+
+	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
+		inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode));
+
+out:
+	/* fattr may still be NULL here */
+	nfs_free_fattr(fattr);
+	return status;
+}
+
+/*
+ * Returns non-zero when jiffies lies outside the window that starts at
+ * read_cache_jiffies and lasts attrtimeo, i.e. the cached attributes have
+ * timed out.
+ */
+int nfs_attribute_timeout(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo))
+		return 0;
+	return 1;
+}
+
+/*
+ * Like nfs_attribute_timeout(), except that the cache is never reported
+ * as expired while nfs_have_delegated_attributes() holds for the inode.
+ */
+static int nfs_attribute_cache_expired(struct inode *inode)
+{
+	return !nfs_have_delegated_attributes(inode) &&
+	       nfs_attribute_timeout(inode);
+}
+
+/**
+ * nfs_revalidate_inode - Revalidate the inode attributes
+ * @server: pointer to nfs_server struct
+ * @inode: pointer to inode struct
+ *
+ * Updates inode attribute information by retrieving the data from the
+ * server, but only when the attributes are flagged invalid or the
+ * attribute cache has expired.  Otherwise returns -ESTALE for a stale
+ * inode and 0 for any other.
+ */
+int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+{
+	if ((NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) ||
+	    nfs_attribute_cache_expired(inode))
+		return __nfs_revalidate_inode(server, inode);
+
+	return NFS_STALE(inode) ? -ESTALE : 0;
+}
+
+/*
+ * Throw away the cached pages of @mapping and clear
+ * NFS_INO_INVALID_DATA.  For a directory the readdir cookie verifier is
+ * reset too.  Returns 0, or the negative error from
+ * invalidate_inode_pages2(), in which case no flag is cleared.
+ */
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int err;
+
+	if (mapping->nrpages != 0) {
+		err = invalidate_inode_pages2(mapping);
+		if (err < 0)
+			return err;
+	}
+
+	spin_lock(&inode->i_lock);
+	if (S_ISDIR(inode->i_mode))
+		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	spin_unlock(&inode->i_lock);
+
+	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+	nfs_fscache_reset_inode_cookie(inode);
+	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
+			inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+	return 0;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
+ *
+ * The attributes are revalidated first when the page cache is flagged for
+ * revalidation, the attribute cache has expired or the inode is stale; a
+ * negative result is returned at once.  If the data is then flagged
+ * invalid, the mapping is invalidated and that result returned.
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret = 0;
+
+	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) ||
+	    nfs_attribute_cache_expired(inode) ||
+	    NFS_STALE(inode)) {
+		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return nfs_invalidate_mapping(inode, mapping);
+	return ret;
+}
+
+/*
+ * Apply weak cache consistency data to the inode.
+ *
+ * For each of change attribute, ctime, mtime and size: if @fattr carries
+ * both the pre-operation and the post-operation value, and the cached
+ * value still equals the pre-operation one, the cached value is advanced
+ * to the post-operation one.  The size is only advanced while the inode
+ * has no pages counted in nfsi->npages.
+ *
+ * Returns NFS_INO_INVALID_ATTR if anything was updated, 0 otherwise.
+ */
+static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long updated = 0;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) &&
+	    (fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+	    inode->i_version == fattr->pre_change_attr) {
+		inode->i_version = fattr->change_attr;
+		if (S_ISDIR(inode->i_mode))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		updated |= NFS_INO_INVALID_ATTR;
+	}
+
+	/* If we have atomic WCC data, we may update some attributes */
+	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) &&
+	    (fattr->valid & NFS_ATTR_FATTR_CTIME) &&
+	    timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+		inode->i_ctime = fattr->ctime;
+		updated |= NFS_INO_INVALID_ATTR;
+	}
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) &&
+	    (fattr->valid & NFS_ATTR_FATTR_MTIME) &&
+	    timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+		inode->i_mtime = fattr->mtime;
+		if (S_ISDIR(inode->i_mode))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		updated |= NFS_INO_INVALID_ATTR;
+	}
+
+	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) &&
+	    (fattr->valid & NFS_ATTR_FATTR_SIZE) &&
+	    i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
+	    nfsi->npages == 0) {
+		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+		updated |= NFS_INO_INVALID_ATTR;
+	}
+
+	return updated;
+}
+
+/**
+ * nfs_check_inode_attributes - verify consistency of the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * Compares the attributes in @fattr with the cached ones and flags the
+ * matching parts of the cache as invalid wherever they differ; the cached
+ * values themselves are left alone.  read_cache_jiffies is reset to the
+ * start time of @fattr.
+ *
+ * Returns 0, or -EIO if the fileid or the file type no longer matches.
+ */
+static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr)
+{
+	const unsigned long reval_data = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	const unsigned long reval_perm = NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long invalid = 0;
+
+	/* Has the inode gone and changed behind our back? */
+	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+		return -EIO;
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) &&
+	    (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+		return -EIO;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+	    inode->i_version != fattr->change_attr)
+		invalid |= reval_data;
+
+	/* Verify a few of the more important attributes */
+	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) &&
+	    !timespec_equal(&inode->i_mtime, &fattr->mtime))
+		invalid |= reval_data;
+
+	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+		loff_t cur_size = i_size_read(inode);
+		loff_t new_isize = nfs_size_to_loff_t(fattr->size);
+
+		if (cur_size != new_isize && nfsi->npages == 0)
+			invalid |= reval_data;
+	}
+
+	/* Have any file permissions changed? */
+	if ((fattr->valid & NFS_ATTR_FATTR_MODE) &&
+	    (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+		invalid |= reval_perm;
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+		invalid |= reval_perm;
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
+		invalid |= reval_perm;
+
+	/* Has the link count changed? */
+	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
+		invalid |= NFS_INO_INVALID_ATTR;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) &&
+	    !timespec_equal(&inode->i_atime, &fattr->atime))
+		invalid |= NFS_INO_INVALID_ATIME;
+
+	if (invalid)
+		nfsi->cache_validity |= invalid;
+
+	nfsi->read_cache_jiffies = fattr->time_start;
+	return 0;
+}
+
+/*
+ * Returns 1 when @fattr carries a ctime that is strictly later than the
+ * cached one, 0 otherwise.
+ */
+static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	return (fattr->valid & NFS_ATTR_FATTR_CTIME) &&
+	       timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
+}
+
+/*
+ * Returns 1 when @fattr carries a size that is strictly larger than the
+ * cached i_size, 0 otherwise.
+ */
+static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	return (fattr->valid & NFS_ATTR_FATTR_SIZE) &&
+	       nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
+}
+
+/*
+ * Global generation counter; nfs_fattr_init() stamps every nfs_fattr with
+ * the next value so that attribute sets can be ordered.
+ */
+static atomic_long_t nfs_attr_generation_counter;
+
+/* Current value of the attribute generation counter */
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+	return atomic_long_read(&nfs_attr_generation_counter);
+}
+
+/* Advance the attribute generation counter and return the new value */
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+	return atomic_long_inc_return(&nfs_attr_generation_counter);
+}
+
+/*
+ * Reset an nfs_fattr: no valid attributes, no owner/group name strings,
+ * start time set to the current jiffies and a fresh generation count.
+ */
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+	fattr->valid = 0;
+	fattr->owner_name = NULL;
+	fattr->group_name = NULL;
+	fattr->time_start = jiffies;
+	fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+/*
+ * Allocate (GFP_NOFS) and initialise an nfs_fattr.
+ * Returns NULL if the allocation fails.
+ */
+struct nfs_fattr *nfs_alloc_fattr(void)
+{
+	struct nfs_fattr *fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+
+	if (fattr == NULL)
+		return NULL;
+
+	nfs_fattr_init(fattr);
+	return fattr;
+}
+
+/*
+ * Allocate (GFP_NOFS) a file handle with its size set to 0.
+ * Returns NULL if the allocation fails.
+ */
+struct nfs_fh *nfs_alloc_fhandle(void)
+{
+	struct nfs_fh *fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+
+	if (fh == NULL)
+		return NULL;
+
+	fh->size = 0;
+	return fh;
+}
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode: pointer to inode
+ * @fattr: attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * To do so, the function first assumes that a more recent ctime means
+ * that the attributes in fattr are newer, however it also attempts to
+ * catch the case where ctime either didn't change, or went backwards
+ * (if someone reset the clock on the server) by looking at whether
+ * or not this RPC call was started after the inode was last updated.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns 'true' if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ *
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	const struct nfs_inode *nfsi = NFS_I(inode);
+
+	/* fattr was generated after the inode was last updated */
+	if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+		return 1;
+	if (nfs_ctime_need_update(inode, fattr))
+		return 1;
+	if (nfs_size_need_update(inode, fattr))
+		return 1;
+	/* the inode's generation is ahead of the global counter: wraparound */
+	return (long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0;
+}
+
+/*
+ * Do a full nfs_update_inode() when @fattr looks newer than the cache,
+ * otherwise only a consistency check.  nfs_refresh_inode() calls this
+ * with inode->i_lock held.
+ */
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	return nfs_inode_attrs_need_update(inode, fattr)
+		? nfs_update_inode(inode, fattr)
+		: nfs_check_inode_attributes(inode, fattr);
+}
+
+/**
+ * nfs_refresh_inode - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * Check that an RPC call that returned attributes has not overlapped with
+ * other recent updates of the inode metadata, then decide whether it is
+ * safe to do a full update of the inode attributes, or whether just to
+ * call nfs_check_inode_attributes.
+ *
+ * Returns 0 without touching the inode when @fattr carries none of the
+ * NFS_ATTR_FATTR attributes.
+ */
+int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+	int status;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR))
+		return 0;
+
+	spin_lock(&inode->i_lock);
+	status = nfs_refresh_inode_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+	return status;
+}
+
+/*
+ * Flag the attribute cache (and, for a directory, the cached data) as
+ * invalid, then refresh from @fattr if it carries any NFS_ATTR_FATTR
+ * attributes.  nfs_post_op_update_inode() calls this with inode->i_lock
+ * held.
+ */
+static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+
+	if (S_ISDIR(inode->i_mode))
+		invalid |= NFS_INO_INVALID_DATA;
+	nfsi->cache_validity |= invalid;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR))
+		return 0;
+	return nfs_refresh_inode_locked(inode, fattr);
+}
+
+/**
+ * nfs_post_op_update_inode - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it.
+ *
+ * NB: if the server didn't return any post op attributes, this
+ * function will force the retrieval of attributes before the next
+ * NFS request. Thus it should be used only for operations that
+ * are expected to change one or more attributes, to avoid
+ * unnecessary NFS requests and trips through nfs_update_inode().
+ */
+int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ status = nfs_post_op_update_inode_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+ return status;
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode: pointer to inode
+ * @fattr: updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ /* Don't do a WCC update if these attributes are already stale */
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
+ !nfs_inode_attrs_need_update(inode, fattr)) {
+ fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+ | NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME);
+ goto out_noforce;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
+ fattr->pre_change_attr = inode->i_version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
+ memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+ fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
+ memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+ fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
+ fattr->pre_size = i_size_read(inode);
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
+ }
+out_noforce:
+ status = nfs_post_op_update_inode_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+ return status;
+}
+
+/*
+ * Many nfs protocol calls return the new file attributes after
+ * an operation. Here we update the inode to reflect the state
+ * of the server's inode.
+ *
+ * This is a bit tricky because we have to make sure all dirty pages
+ * have been sent off to the server before calling invalidate_inode_pages.
+ * To make sure no other process adds more write requests while we try
+ * our best to flush them, we make them sleep during the attribute refresh.
+ *
+ * A very similar scenario holds for the dir cache.
+ */
+static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_server *server;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ loff_t cur_isize, new_isize;
+ unsigned long invalid = 0;
+ unsigned long now = jiffies;
+ unsigned long save_cache_validity;
+
+ dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
+ __func__, inode->i_sb->s_id, inode->i_ino,
+ atomic_read(&inode->i_count), fattr->valid);
+
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+ goto out_fileid;
+
+ /*
+ * Make sure the inode's type hasn't changed.
+ */
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+ goto out_changed;
+
+ server = NFS_SERVER(inode);
+ /* Update the fsid? */
+ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+ !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
+ !IS_AUTOMOUNT(inode))
+ server->fsid = fattr->fsid;
+
+ /*
+ * Update the read time so we don't revalidate too often.
+ */
+ nfsi->read_cache_jiffies = fattr->time_start;
+
+ save_cache_validity = nfsi->cache_validity;
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED
+ | NFS_INO_REVAL_PAGECACHE);
+
+ /* Do atomic weak cache consistency updates */
+ invalid |= nfs_wcc_update_inode(inode, fattr);
+
+ /* More cache consistency checks */
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+ if (inode->i_version != fattr->change_attr) {
+ dprintk("NFS: change_attr change on server for file %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ inode->i_version = fattr->change_attr;
+ }
+ } else if (server->caps & NFS_CAP_CHANGE_ATTR)
+ invalid |= save_cache_validity;
+
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
+ /* NFSv2/v3: Check if the mtime agrees */
+ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
+ dprintk("NFS: mtime change on server for file %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+ }
+ } else if (server->caps & NFS_CAP_MTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
+
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
+ /* If ctime has changed we should definitely clear access+acl caches */
+ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ /* and probably clear data for a directory too as utimes can cause
+ * havoc with our cache.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ invalid |= NFS_INO_INVALID_DATA;
+ nfs_force_lookup_revalidate(inode);
+ }
+ memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+ }
+ } else if (server->caps & NFS_CAP_CTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
+ /* Check if our cached file size is stale */
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ cur_isize = i_size_read(inode);
+ if (new_isize != cur_isize) {
+ /* Do we perhaps have any outstanding writes, or has
+ * the file grown beyond our last write? */
+ if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) ||
+ new_isize > cur_isize) {
+ i_size_write(inode, new_isize);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ }
+ dprintk("NFS: isize change on server for file %s/%ld "
+ "(%Ld to %Ld)\n",
+ inode->i_sb->s_id,
+ inode->i_ino,
+ (long long)cur_isize,
+ (long long)new_isize);
+ }
+ } else
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
+
+
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+ else if (server->caps & NFS_CAP_ATIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED);
+
+ if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+ umode_t newmode = inode->i_mode & S_IFMT;
+ newmode |= fattr->mode & S_IALLUGO;
+ inode->i_mode = newmode;
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ }
+ } else if (server->caps & NFS_CAP_MODE)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+ if (inode->i_uid != fattr->uid) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_uid = fattr->uid;
+ }
+ } else if (server->caps & NFS_CAP_OWNER)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+ if (inode->i_gid != fattr->gid) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_gid = fattr->gid;
+ }
+ } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+ if (inode->i_nlink != fattr->nlink) {
+ invalid |= NFS_INO_INVALID_ATTR;
+ if (S_ISDIR(inode->i_mode))
+ invalid |= NFS_INO_INVALID_DATA;
+ set_nlink(inode, fattr->nlink);
+ }
+ } else if (server->caps & NFS_CAP_NLINK)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_FORCED);
+
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+ /*
+ * report the blocks in 512byte units
+ */
+ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+
+ /* Update attrtimeo value if we're out of the unstable period */
+ if (invalid & NFS_INO_INVALID_ATTR) {
+ nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
+ nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = now;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
+ } else {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ nfsi->attrtimeo_timestamp = now;
+ }
+ }
+ invalid &= ~NFS_INO_INVALID_ATTR;
+ /* Don't invalidate the data if we were to blame */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+ || S_ISLNK(inode->i_mode)))
+ invalid &= ~NFS_INO_INVALID_DATA;
+ if (!nfs_have_delegation(inode, FMODE_READ) ||
+ (save_cache_validity & NFS_INO_REVAL_FORCED))
+ nfsi->cache_validity |= invalid;
+
+ return 0;
+ out_changed:
+ /*
+ * Big trouble! The inode has become a different object.
+ */
+ printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n",
+ __func__, inode->i_ino, inode->i_mode, fattr->mode);
+ out_err:
+ /*
+ * No need to worry about unhashing the dentry, as the
+ * lookup validation will know that the inode is bad.
+ * (But we fall through to invalidate the caches.)
+ */
+ nfs_invalidate_inode(inode);
+ return -ESTALE;
+
+ out_fileid:
+ printk(KERN_ERR "NFS: server %s error: fileid changed\n"
+ "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
+ NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id,
+ (long long)nfsi->fileid, (long long)fattr->fileid);
+ goto out_err;
+}
+
+
+#ifdef CONFIG_NFS_V4
+
+/*
+ * Clean out any remaining NFSv4 state that might be left over due
+ * to open() calls that passed nfs_atomic_lookup, but failed to call
+ * nfs_open().
+ */
+void nfs4_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages(&inode->i_data, 0);
+ end_writeback(inode);
+ pnfs_return_layout(inode);
+ pnfs_destroy_layout(NFS_I(inode));
+ /* If we are holding a delegation, return it! */
+ nfs_inode_return_delegation_noreclaim(inode);
+ /* Finally, run the standard NFS clear_inode() code */
+ nfs_clear_inode(inode);
+}
+#endif
+
+struct inode *nfs_alloc_inode(struct super_block *sb)
+{
+ struct nfs_inode *nfsi;
+ nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+ if (!nfsi)
+ return NULL;
+ nfsi->flags = 0UL;
+ nfsi->cache_validity = 0UL;
+#ifdef CONFIG_NFS_V3_ACL
+ nfsi->acl_access = ERR_PTR(-EAGAIN);
+ nfsi->acl_default = ERR_PTR(-EAGAIN);
+#endif
+#ifdef CONFIG_NFS_V4
+ nfsi->nfs4_acl = NULL;
+#endif /* CONFIG_NFS_V4 */
+ return &nfsi->vfs_inode;
+}
+
+static void nfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
+}
+
+void nfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, nfs_i_callback);
+}
+
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4
+ INIT_LIST_HEAD(&nfsi->open_states);
+ nfsi->delegation = NULL;
+ nfsi->delegation_state = 0;
+ init_rwsem(&nfsi->rwsem);
+ nfsi->layout = NULL;
+ atomic_set(&nfsi->commits_outstanding, 0);
+#endif
+}
+
+static void init_once(void *foo)
+{
+ struct nfs_inode *nfsi = (struct nfs_inode *) foo;
+
+ inode_init_once(&nfsi->vfs_inode);
+ INIT_LIST_HEAD(&nfsi->open_files);
+ INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+ INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+ INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
+ nfsi->npages = 0;
+ nfsi->ncommit = 0;
+ atomic_set(&nfsi->silly_count, 1);
+ INIT_HLIST_HEAD(&nfsi->silly_list);
+ init_waitqueue_head(&nfsi->waitqueue);
+ nfs4_init_once(nfsi);
+}
+
+static int __init nfs_init_inodecache(void)
+{
+ nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
+ sizeof(struct nfs_inode),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once);
+ if (nfs_inode_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void nfs_destroy_inodecache(void)
+{
+ kmem_cache_destroy(nfs_inode_cachep);
+}
+
+struct workqueue_struct *nfsiod_workqueue;
+
+/*
+ * start up the nfsiod workqueue
+ */
+static int nfsiod_start(void)
+{
+ struct workqueue_struct *wq;
+ dprintk("RPC: creating workqueue nfsiod\n");
+ wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
+ if (wq == NULL)
+ return -ENOMEM;
+ nfsiod_workqueue = wq;
+ return 0;
+}
+
+/*
+ * Destroy the nfsiod workqueue
+ */
+static void nfsiod_stop(void)
+{
+ struct workqueue_struct *wq;
+
+ wq = nfsiod_workqueue;
+ if (wq == NULL)
+ return;
+ nfsiod_workqueue = NULL;
+ destroy_workqueue(wq);
+}
+
+/*
+ * Initialize NFS
+ */
+static int __init init_nfs_fs(void)
+{
+ int err;
+
+ err = nfs_idmap_init();
+ if (err < 0)
+ goto out9;
+
+ err = nfs_dns_resolver_init();
+ if (err < 0)
+ goto out8;
+
+ err = nfs_fscache_register();
+ if (err < 0)
+ goto out7;
+
+ err = nfsiod_start();
+ if (err)
+ goto out6;
+
+ err = nfs_fs_proc_init();
+ if (err)
+ goto out5;
+
+ err = nfs_init_nfspagecache();
+ if (err)
+ goto out4;
+
+ err = nfs_init_inodecache();
+ if (err)
+ goto out3;
+
+ err = nfs_init_readpagecache();
+ if (err)
+ goto out2;
+
+ err = nfs_init_writepagecache();
+ if (err)
+ goto out1;
+
+ err = nfs_init_directcache();
+ if (err)
+ goto out0;
+
+#ifdef CONFIG_PROC_FS
+ rpc_proc_register(&nfs_rpcstat);
+#endif
+ if ((err = register_nfs_fs()) != 0)
+ goto out;
+ return 0;
+out:
+#ifdef CONFIG_PROC_FS
+ rpc_proc_unregister("nfs");
+#endif
+ nfs_destroy_directcache();
+out0:
+ nfs_destroy_writepagecache();
+out1:
+ nfs_destroy_readpagecache();
+out2:
+ nfs_destroy_inodecache();
+out3:
+ nfs_destroy_nfspagecache();
+out4:
+ nfs_fs_proc_exit();
+out5:
+ nfsiod_stop();
+out6:
+ nfs_fscache_unregister();
+out7:
+ nfs_dns_resolver_destroy();
+out8:
+ nfs_idmap_quit();
+out9:
+ return err;
+}
+
+static void __exit exit_nfs_fs(void)
+{
+ nfs_destroy_directcache();
+ nfs_destroy_writepagecache();
+ nfs_destroy_readpagecache();
+ nfs_destroy_inodecache();
+ nfs_destroy_nfspagecache();
+ nfs_fscache_unregister();
+ nfs_dns_resolver_destroy();
+ nfs_idmap_quit();
+#ifdef CONFIG_PROC_FS
+ rpc_proc_unregister("nfs");
+#endif
+ nfs_cleanup_cb_ident_idr();
+ unregister_nfs_fs();
+ nfs_fs_proc_exit();
+ nfsiod_stop();
+}
+
+/* Not quite true; I just maintain it */
+MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_LICENSE("GPL");
+module_param(enable_ino64, bool, 0644);
+
+module_init(init_nfs_fs)
+module_exit(exit_nfs_fs)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 00000000000..8102db9b926
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,463 @@
+/*
+ * NFS internal definitions
+ */
+
+#include "nfs4_fs.h"
+#include <linux/mount.h>
+#include <linux/security.h>
+
+#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+
+struct nfs_string;
+
+/* Maximum number of readahead requests
+ * FIXME: this should really be a sysctl so that users may tune it to suit
+ * their needs. People who do NFS over a slow network might, for
+ * instance, want to reduce it to something closer to 1 for improved
+ * interactive response.
+ */
+#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
+
+/*
+ * Determine if sessions are in use.
+ */
+static inline int nfs4_has_session(const struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+ if (clp->cl_session)
+ return 1;
+#endif /* CONFIG_NFS_V4_1 */
+ return 0;
+}
+
+static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4_1
+ if (nfs4_has_session(clp))
+ return (clp->cl_session->flags & SESSION4_PERSIST);
+#endif /* CONFIG_NFS_V4_1 */
+ return 0;
+}
+
+static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
+{
+ if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
+ fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
+}
+
+static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
+{
+ if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) ||
+ (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
+ ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
+ return 0;
+
+ fattr->fileid = fattr->mounted_on_fileid;
+ return 1;
+}
+
+struct nfs_clone_mount {
+ const struct super_block *sb;
+ const struct dentry *dentry;
+ struct nfs_fh *fh;
+ struct nfs_fattr *fattr;
+ char *hostname;
+ char *mnt_path;
+ struct sockaddr *addr;
+ size_t addrlen;
+ rpc_authflavor_t authflavor;
+};
+
+/*
+ * Note: RFC 1813 doesn't limit the number of auth flavors that
+ * a server can return, so make something up.
+ */
+#define NFS_MAX_SECFLAVORS (12)
+
+/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT (-1)
+
+/*
+ * Maximum number of pages that readdir can use for creating
+ * a vmapped array of pages.
+ */
+#define NFS_MAX_READDIR_PAGES 8
+
+/*
+ * In-kernel mount arguments
+ */
+struct nfs_parsed_mount_data {
+ int flags;
+ int rsize, wsize;
+ int timeo, retrans;
+ int acregmin, acregmax,
+ acdirmin, acdirmax;
+ int namlen;
+ unsigned int options;
+ unsigned int bsize;
+ unsigned int auth_flavor_len;
+ rpc_authflavor_t auth_flavors[1];
+ char *client_address;
+ unsigned int version;
+ unsigned int minorversion;
+ char *fscache_uniq;
+
+ struct {
+ struct sockaddr_storage address;
+ size_t addrlen;
+ char *hostname;
+ u32 version;
+ int port;
+ unsigned short protocol;
+ } mount_server;
+
+ struct {
+ struct sockaddr_storage address;
+ size_t addrlen;
+ char *hostname;
+ char *export_path;
+ int port;
+ unsigned short protocol;
+ } nfs_server;
+
+ struct security_mnt_opts lsm_opts;
+};
+
+/* mount_clnt.c */
+struct nfs_mount_request {
+ struct sockaddr *sap;
+ size_t salen;
+ char *hostname;
+ char *dirpath;
+ u32 version;
+ unsigned short protocol;
+ struct nfs_fh *fh;
+ int noresvport;
+ unsigned int *auth_flav_len;
+ rpc_authflavor_t *auth_flavs;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
+
+/* client.c */
+extern struct rpc_program nfs_program;
+
+extern void nfs_cleanup_cb_ident_idr(void);
+extern void nfs_put_client(struct nfs_client *);
+extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
+extern struct nfs_client *nfs4_find_client_ident(int);
+extern struct nfs_client *
+nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
+extern struct nfs_server *nfs_create_server(
+ const struct nfs_parsed_mount_data *,
+ struct nfs_fh *);
+extern struct nfs_server *nfs4_create_server(
+ const struct nfs_parsed_mount_data *,
+ struct nfs_fh *);
+extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
+ struct nfs_fh *);
+extern void nfs_free_server(struct nfs_server *server);
+extern struct nfs_server *nfs_clone_server(struct nfs_server *,
+ struct nfs_fh *,
+ struct nfs_fattr *);
+extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+extern int nfs4_check_client_ready(struct nfs_client *clp);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen, int ds_proto);
+#ifdef CONFIG_PROC_FS
+extern int __init nfs_fs_proc_init(void);
+extern void nfs_fs_proc_exit(void);
+#else
+static inline int nfs_fs_proc_init(void)
+{
+ return 0;
+}
+static inline void nfs_fs_proc_exit(void)
+{
+}
+#endif
+
+/* nfs4namespace.c */
+#ifdef CONFIG_NFS_V4
+extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
+#else
+static inline
+struct vfsmount *nfs_do_refmount(struct dentry *dentry)
+{
+ return ERR_PTR(-ENOENT);
+}
+#endif
+
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+extern struct svc_version nfs4_callback_version4;
+
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
+
+extern int __init nfs_init_directcache(void);
+extern void nfs_destroy_directcache(void);
+
+/* nfs2xdr.c */
+extern int nfs_stat_to_errno(enum nfs_stat);
+extern struct rpc_procinfo nfs_procedures[];
+extern int nfs2_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, int);
+
+/* nfs3xdr.c */
+extern struct rpc_procinfo nfs3_procedures[];
+extern int nfs3_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, int);
+
+/* nfs4xdr.c */
+#ifdef CONFIG_NFS_V4
+extern int nfs4_decode_dirent(struct xdr_stream *,
+ struct nfs_entry *, int);
+#endif
+#ifdef CONFIG_NFS_V4_1
+extern const u32 nfs41_maxread_overhead;
+extern const u32 nfs41_maxwrite_overhead;
+#endif
+
+/* nfs4proc.c */
+#ifdef CONFIG_NFS_V4
+extern struct rpc_procinfo nfs4_procedures[];
+void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *);
+#endif
+
+extern int nfs4_init_ds_session(struct nfs_client *clp);
+
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern int nfs_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr, rpc_authflavor_t authflavour,
+ int noresvport);
+
+/* dir.c */
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+/* inode.c */
+extern struct workqueue_struct *nfsiod_workqueue;
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_destroy_inode(struct inode *);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
+extern void nfs_evict_inode(struct inode *);
+#ifdef CONFIG_NFS_V4
+extern void nfs4_evict_inode(struct inode *);
+#endif
+void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
+
+/* super.c */
+extern struct file_system_type nfs_xdev_fs_type;
+#ifdef CONFIG_NFS_V4
+extern struct file_system_type nfs4_xdev_fs_type;
+extern struct file_system_type nfs4_referral_fs_type;
+#endif
+
+extern struct rpc_stat nfs_rpcstat;
+
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+extern void nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
+
+/* namespace.c */
+extern char *nfs_path(char **p, struct dentry *dentry,
+ char *buffer, ssize_t buflen);
+extern struct vfsmount *nfs_d_automount(struct path *path);
+#ifdef CONFIG_NFS_V4
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
+#endif
+
+/* getroot.c */
+extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
+ const char *);
+#ifdef CONFIG_NFS_V4
+extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
+ const char *);
+
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
+#endif
+
+struct nfs_pageio_descriptor;
+/* read.c */
+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops);
+extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
+ struct list_head *head);
+
+extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode);
+extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_readdata_release(struct nfs_read_data *rdata);
+
+/* write.c */
+extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
+ struct list_head *head);
+extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags);
+extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_writedata_release(struct nfs_write_data *wdata);
+extern void nfs_commit_free(struct nfs_write_data *p);
+extern int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
+extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_initiate_commit(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
+extern void nfs_init_commit(struct nfs_write_data *data,
+ struct list_head *head,
+ struct pnfs_layout_segment *lseg);
+void nfs_retry_commit(struct list_head *page_list,
+ struct pnfs_layout_segment *lseg);
+void nfs_commit_clear_lock(struct nfs_inode *nfsi);
+void nfs_commitdata_release(void *data);
+void nfs_commit_release_pages(struct nfs_write_data *data);
+
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+ struct page *, struct page *, enum migrate_mode);
+#else
+#define nfs_migrate_page NULL
+#endif
+
+/* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
+extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
+extern int nfs4_init_client(struct nfs_client *clp,
+ const struct rpc_timeout *timeparms,
+ const char *ip_addr,
+ rpc_authflavor_t authflavour,
+ int noresvport);
+extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
+extern int _nfs4_call_sync(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct rpc_message *msg,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ int cache_reply);
+extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct rpc_message *msg,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ int cache_reply);
+
+/*
+ * Determine the device name as a string
+ */
+static inline char *nfs_devname(struct dentry *dentry,
+ char *buffer, ssize_t buflen)
+{
+ char *dummy;
+ return nfs_path(&dummy, dentry, buffer, buflen);
+}
+
+/*
+ * Determine the actual block size (and log2 thereof, stored via nrbitsp when non-NULL)
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+ /* reduce bsize to its highest set bit (bits 31..1 searched; otherwise 1) */
+ if ((bsize & (bsize - 1)) || nrbitsp) {
+ unsigned char nrbits;
+
+ for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--)
+ ; /* 1UL: shifting a signed int 1 by 31 is undefined */
+ bsize = 1UL << nrbits;
+ if (nrbitsp)
+ *nrbitsp = nrbits;
+ }
+
+ return bsize;
+}
+
+/*
+ * Calculate the number of 512byte blocks used, rounded up and capped at ULONG_MAX.
+ */
+static inline blkcnt_t nfs_calc_block_size(u64 tsize)
+{
+ blkcnt_t used = (tsize >> 9) + ((tsize & 511) != 0); /* tsize + 511 could wrap */
+ return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+/*
+ * Compute and set NFS server blocksize
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+ if (bsize < NFS_MIN_FILE_IO_SIZE)
+ bsize = NFS_DEF_FILE_IO_SIZE;
+ else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+ bsize = NFS_MAX_FILE_IO_SIZE;
+
+ return nfs_block_bits(bsize, nrbitsp);
+}
+
+/*
+ * Determine the maximum file size for a superblock
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+ sb->s_maxbytes = (loff_t)maxfilesize;
+ if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline
+unsigned int nfs_page_length(struct page *page)
+{
+ loff_t i_size = i_size_read(page->mapping->host);
+
+ if (i_size > 0) {
+ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ if (page->index < end_index)
+ return PAGE_CACHE_SIZE;
+ if (page->index == end_index)
+ return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+ }
+ return 0;
+}
+
+/*
+ * Convert a umode to a dirent->d_type
+ */
+static inline
+unsigned char nfs_umode_to_dtype(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+/*
+ * Determine the number of pages in an array of length 'len' and
+ * with a base offset of 'base'
+ */
+static inline
+unsigned int nfs_page_array_len(unsigned int base, size_t len)
+{
+ return ((unsigned long)len + (unsigned long)base +
+ PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
new file mode 100644
index 00000000000..c5832487c45
--- /dev/null
+++ b/fs/nfs/iostat.h
@@ -0,0 +1,71 @@
+/*
+ * linux/fs/nfs/iostat.h
+ *
+ * Declarations for NFS client per-mount statistics
+ *
+ * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
+ *
+ */
+
+#ifndef _NFS_IOSTAT
+#define _NFS_IOSTAT
+
+#include <linux/percpu.h>
+#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
+
+struct nfs_iostats {
+ unsigned long long bytes[__NFSIOS_BYTESMAX];
+#ifdef CONFIG_NFS_FSCACHE
+ unsigned long long fscache[__NFSIOS_FSCACHEMAX];
+#endif
+ unsigned long events[__NFSIOS_COUNTSMAX];
+} ____cacheline_aligned;
+
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+ enum nfs_stat_eventcounters stat)
+{
+ this_cpu_inc(server->io_stats->events[stat]);
+}
+
+static inline void nfs_inc_stats(const struct inode *inode,
+ enum nfs_stat_eventcounters stat)
+{
+ nfs_inc_server_stats(NFS_SERVER(inode), stat);
+}
+
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+ enum nfs_stat_bytecounters stat,
+ long addend)
+{
+ this_cpu_add(server->io_stats->bytes[stat], addend);
+}
+
+static inline void nfs_add_stats(const struct inode *inode,
+ enum nfs_stat_bytecounters stat,
+ long addend)
+{
+ nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
+}
+
+#ifdef CONFIG_NFS_FSCACHE
+static inline void nfs_add_fscache_stats(struct inode *inode,
+ enum nfs_stat_fscachecounters stat,
+ long addend)
+{
+ this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
+}
+#endif
+
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
+{
+ return alloc_percpu(struct nfs_iostats);
+}
+
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
+{
+ if (stats != NULL)
+ free_percpu(stats);
+}
+
+#endif /* _NFS_IOSTAT */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
new file mode 100644
index 00000000000..d4c2d6b7507
--- /dev/null
+++ b/fs/nfs/mount_clnt.c
@@ -0,0 +1,518 @@
+/*
+ * In-kernel MOUNT protocol client
+ *
+ * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#ifdef RPC_DEBUG
+# define NFSDBG_FACILITY NFSDBG_MOUNT
+#endif
+
+/*
+ * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
+ */
+#define MNTPATHLEN (1024)
+
+/*
+ * XDR data type sizes
+ */
+#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
+#define MNT_status_sz (1)
+#define MNT_fhs_status_sz (1)
+#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
+#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE))
+#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
+
+/*
+ * XDR argument and result sizes
+ *
+ * A version 3 reply carries a variable-length file handle (a length
+ * word plus up to NFS3_FHSIZE bytes), so its size must be computed
+ * from MNT_fhandle3_sz rather than the fixed-size version 2 handle.
+ */
+#define MNT_enc_dirpath_sz	encode_dirpath_sz
+#define MNT_dec_mountres_sz	(MNT_status_sz + MNT_fhandle_sz)
+#define MNT_dec_mountres3_sz	(MNT_status_sz + MNT_fhandle3_sz + \
+				 MNT_authflav3_sz)
+
+/*
+ * Defined by RFC 1094, section A.5
+ */
+enum {
+ MOUNTPROC_NULL = 0,
+ MOUNTPROC_MNT = 1,
+ MOUNTPROC_DUMP = 2,
+ MOUNTPROC_UMNT = 3,
+ MOUNTPROC_UMNTALL = 4,
+ MOUNTPROC_EXPORT = 5,
+};
+
+/*
+ * Defined by RFC 1813, section 5.2
+ */
+enum {
+ MOUNTPROC3_NULL = 0,
+ MOUNTPROC3_MNT = 1,
+ MOUNTPROC3_DUMP = 2,
+ MOUNTPROC3_UMNT = 3,
+ MOUNTPROC3_UMNTALL = 4,
+ MOUNTPROC3_EXPORT = 5,
+};
+
+static struct rpc_program mnt_program;
+
+/*
+ * Defined by OpenGroup XNFS Version 3W, chapter 8
+ */
+enum mountstat {
+ MNT_OK = 0,
+ MNT_EPERM = 1,
+ MNT_ENOENT = 2,
+ MNT_EACCES = 13,
+ MNT_EINVAL = 22,
+};
+
+static struct {
+ u32 status;
+ int errno;
+} mnt_errtbl[] = {
+ { .status = MNT_OK, .errno = 0, },
+ { .status = MNT_EPERM, .errno = -EPERM, },
+ { .status = MNT_ENOENT, .errno = -ENOENT, },
+ { .status = MNT_EACCES, .errno = -EACCES, },
+ { .status = MNT_EINVAL, .errno = -EINVAL, },
+};
+
+/*
+ * Defined by RFC 1813, section 5.1.5
+ */
+enum mountstat3 {
+ MNT3_OK = 0, /* no error */
+ MNT3ERR_PERM = 1, /* Not owner */
+ MNT3ERR_NOENT = 2, /* No such file or directory */
+ MNT3ERR_IO = 5, /* I/O error */
+ MNT3ERR_ACCES = 13, /* Permission denied */
+ MNT3ERR_NOTDIR = 20, /* Not a directory */
+ MNT3ERR_INVAL = 22, /* Invalid argument */
+ MNT3ERR_NAMETOOLONG = 63, /* Filename too long */
+ MNT3ERR_NOTSUPP = 10004, /* Operation not supported */
+ MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */
+};
+
+static struct {
+ u32 status;
+ int errno;
+} mnt3_errtbl[] = {
+ { .status = MNT3_OK, .errno = 0, },
+ { .status = MNT3ERR_PERM, .errno = -EPERM, },
+ { .status = MNT3ERR_NOENT, .errno = -ENOENT, },
+ { .status = MNT3ERR_IO, .errno = -EIO, },
+ { .status = MNT3ERR_ACCES, .errno = -EACCES, },
+ { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, },
+ { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
+ { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
+ { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
+ { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
+};
+
+struct mountres {
+ int errno;
+ struct nfs_fh *fh;
+ unsigned int *auth_count;
+ rpc_authflavor_t *auth_flavors;
+};
+
+struct mnt_fhstatus {
+ u32 status;
+ struct nfs_fh *fh;
+};
+
+/**
+ * nfs_mount - Obtain an NFS file handle for the given host and path
+ * @info: pointer to mount request arguments
+ *
+ * Uses default timeout parameters specified by underlying transport.
+ *
+ * Returns zero on success, the negative errno from creating the RPC
+ * client or from the RPC call itself, or the errno that the MNT status
+ * in the reply was mapped to by the reply decoder.
+ */
+int nfs_mount(struct nfs_mount_request *info)
+{
+	struct mountres result = {
+		.fh		= info->fh,
+		.auth_count	= info->auth_flav_len,
+		.auth_flavors	= info->auth_flavs,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= info->dirpath,
+		.rpc_resp	= &result,
+	};
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= info->protocol,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.servername	= info->hostname,
+		.program	= &mnt_program,
+		.version	= info->version,
+		.authflavor	= RPC_AUTH_UNIX,
+	};
+	struct rpc_clnt *mnt_clnt;
+	unsigned int proc;
+	int status;
+
+	dprintk("NFS: sending MNT request for %s:%s\n",
+		(info->hostname ? info->hostname : "server"),
+		info->dirpath);
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	mnt_clnt = rpc_create(&args);
+	if (IS_ERR(mnt_clnt)) {
+		status = PTR_ERR(mnt_clnt);
+		dprintk("NFS: failed to create MNT RPC client, status=%d\n", status);
+		return status;
+	}
+
+	/* Pick the MNT procedure that matches the requested protocol version */
+	proc = MOUNTPROC_MNT;
+	if (info->version == NFS_MNT3_VERSION)
+		proc = MOUNTPROC3_MNT;
+	msg.rpc_proc = &mnt_clnt->cl_procinfo[proc];
+
+	/* The client is only needed for this single call */
+	status = rpc_call_sync(mnt_clnt, &msg, 0);
+	rpc_shutdown_client(mnt_clnt);
+
+	if (status < 0) {
+		dprintk("NFS: MNT request failed, status=%d\n", status);
+		return status;
+	}
+	if (result.errno != 0) {
+		dprintk("NFS: MNT server returned result %d\n", result.errno);
+		return result.errno;
+	}
+
+	dprintk("NFS: MNT request succeeded\n");
+	return 0;
+}
+
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.  Failures are only reported through dprintk(); nothing is
+ * returned to the caller.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+	static const struct rpc_timeout nfs_umnt_timeout = {
+		.to_initval = 1 * HZ,
+		.to_maxval = 3 * HZ,
+		.to_retries = 2,
+	};
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= IPPROTO_UDP,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.timeout	= &nfs_umnt_timeout,
+		.servername	= info->hostname,
+		.program	= &mnt_program,
+		.version	= info->version,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= RPC_CLNT_CREATE_NOPING,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= info->dirpath,
+	};
+	struct rpc_clnt *clnt;
+	unsigned int proc;
+	int status;
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+			PTR_ERR(clnt));
+		return;
+	}
+
+	dprintk("NFS: sending UMNT request for %s:%s\n",
+		(info->hostname ? info->hostname : "server"), info->dirpath);
+
+	/* Pick the UMNT procedure that matches the requested protocol version */
+	proc = MOUNTPROC_UMNT;
+	if (info->version == NFS_MNT3_VERSION)
+		proc = MOUNTPROC3_UMNT;
+	msg.rpc_proc = &clnt->cl_procinfo[proc];
+
+	status = rpc_call_sync(clnt, &msg, 0);
+	rpc_shutdown_client(clnt);
+
+	if (unlikely(status < 0))
+		dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
+/*
+ * XDR encode/decode functions for MOUNT
+ */
+
+/*
+ * Append @pathname to @xdr as a length-prefixed opaque.  Paths longer
+ * than MNTPATHLEN trigger BUG_ON().
+ */
+static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
+{
+	const u32 len = strlen(pathname);
+	__be32 *dst;
+
+	BUG_ON(len > MNTPATHLEN);
+	/* four bytes for the length word, then the path itself */
+	dst = xdr_reserve_space(xdr, 4 + len);
+	xdr_encode_opaque(dst, pathname, len);
+}
+
+/*
+ * Argument encoder installed as .p_encode for the MNT and UMNT
+ * procedures of both protocol versions; @req is unused.
+ */
+static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const char *dirpath)
+{
+ encode_mntdirpath(xdr, dirpath);
+}
+
+/*
+ * RFC 1094: "A non-zero status indicates some sort of error. In this
+ * case, the status is a UNIX error number." This can be problematic
+ * if the server and client use different errno values for the same
+ * error.
+ *
+ * However, the OpenGroup XNFS spec provides a simple mapping that is
+ * independent of local errno values on the server and the client.
+ */
+/*
+ * Decode the MNT status word and store the matching local errno in
+ * res->errno.  A status missing from mnt_errtbl is reported as -EACCES.
+ * Returns -EIO if the status word cannot be read, zero otherwise.
+ */
+static int decode_status(struct xdr_stream *xdr, struct mountres *res)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	unsigned int idx;
+	u32 status;
+
+	if (unlikely(p == NULL))
+		return -EIO;
+	status = be32_to_cpup(p);
+
+	for (idx = 0; idx < ARRAY_SIZE(mnt_errtbl); idx++)
+		if (mnt_errtbl[idx].status == status)
+			break;
+
+	if (idx < ARRAY_SIZE(mnt_errtbl)) {
+		res->errno = mnt_errtbl[idx].errno;
+	} else {
+		dprintk("NFS: unrecognized MNT status code: %u\n", status);
+		res->errno = -EACCES;
+	}
+	return 0;
+}
+
+/*
+ * Copy the fixed-size (NFS2_FHSIZE bytes) file handle from the reply
+ * into res->fh.  Returns -EIO if the reply is too short.
+ */
+static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)
+{
+	__be32 *src = xdr_inline_decode(xdr, NFS2_FHSIZE);
+
+	if (unlikely(src == NULL))
+		return -EIO;
+
+	res->fh->size = NFS2_FHSIZE;
+	memcpy(res->fh->data, src, NFS2_FHSIZE);
+	return 0;
+}
+
+/*
+ * Reply decoder for the version 1 MNT procedure.  The file handle is
+ * only decoded when the status word was read and maps to errno zero.
+ */
+static int mnt_xdr_dec_mountres(struct rpc_rqst *req,
+				struct xdr_stream *xdr,
+				struct mountres *res)
+{
+	int err = decode_status(xdr, res);
+
+	if (unlikely(err != 0))
+		return err;
+	if (unlikely(res->errno != 0))
+		return 0;	/* server refused; caller checks res->errno */
+	return decode_fhandle(xdr, res);
+}
+
+/*
+ * Decode the MNT3 status word and store the matching local errno in
+ * res->errno.  A status missing from mnt3_errtbl is reported as
+ * -EACCES.  Returns -EIO if the status word cannot be read, zero
+ * otherwise.
+ */
+static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+	unsigned int idx;
+	u32 status;
+
+	if (unlikely(p == NULL))
+		return -EIO;
+	status = be32_to_cpup(p);
+
+	for (idx = 0; idx < ARRAY_SIZE(mnt3_errtbl); idx++)
+		if (mnt3_errtbl[idx].status == status)
+			break;
+
+	if (idx < ARRAY_SIZE(mnt3_errtbl)) {
+		res->errno = mnt3_errtbl[idx].errno;
+	} else {
+		dprintk("NFS: unrecognized MNT3 status code: %u\n", status);
+		res->errno = -EACCES;
+	}
+	return 0;
+}
+
+/*
+ * Decode a variable-length file handle into res->fh.  Returns -EIO if
+ * the reply is too short or the advertised length is zero or larger
+ * than NFS3_FHSIZE.
+ */
+static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)
+{
+	struct nfs_fh *fh = res->fh;
+	__be32 *p;
+	u32 len;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	len = be32_to_cpup(p);
+	if (len == 0 || len > NFS3_FHSIZE)
+		return -EIO;
+
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	fh->size = len;
+	memcpy(fh->data, p, len);
+	return 0;
+}
+
+/*
+ * Decode the list of auth flavors that follows the file handle.
+ *
+ * On entry *res->auth_count is the capacity of res->auth_flavors; a
+ * capacity of zero skips decoding altogether.  On success it is
+ * updated to the number of flavors actually stored.  Returns -EIO if
+ * the reply is too short.
+ */
+static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)
+{
+	rpc_authflavor_t *out = res->auth_flavors;
+	unsigned int *capacity = res->auth_count;
+	u32 nr, i;
+	__be32 *p;
+
+	if (*capacity == 0)
+		return 0;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		return -EIO;
+	nr = be32_to_cpup(p);
+	dprintk("NFS: received %u auth flavors\n", nr);
+	if (nr > NFS_MAX_SECFLAVORS)
+		nr = NFS_MAX_SECFLAVORS;
+
+	p = xdr_inline_decode(xdr, 4 * nr);
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	/* never store more entries than the caller has room for */
+	if (nr > *capacity)
+		nr = *capacity;
+
+	for (i = 0; i < nr; i++) {
+		out[i] = be32_to_cpup(&p[i]);
+		dprintk("NFS: auth flavor[%u]: %d\n", i, out[i]);
+	}
+	*capacity = nr;
+
+	return 0;
+}
+
+/*
+ * Reply decoder for the version 3 MNT procedure: status, then file
+ * handle, then auth flavor list.  A file handle that fails to decode
+ * is reported through res->errno (-EBADHANDLE), not through the
+ * return value.
+ */
+static int mnt_xdr_dec_mountres3(struct rpc_rqst *req,
+				 struct xdr_stream *xdr,
+				 struct mountres *res)
+{
+	int err = decode_fhs_status(xdr, res);
+
+	if (unlikely(err != 0))
+		return err;
+	if (unlikely(res->errno != 0))
+		return 0;	/* server refused; caller checks res->errno */
+
+	if (unlikely(decode_fhandle3(xdr, res) != 0)) {
+		res->errno = -EBADHANDLE;
+		return 0;
+	}
+	return decode_auth_flavors(xdr, res);
+}
+
+static struct rpc_procinfo mnt_procedures[] = {
+ [MOUNTPROC_MNT] = {
+ .p_proc = MOUNTPROC_MNT,
+ .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
+ .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_replen = MNT_dec_mountres_sz,
+ .p_statidx = MOUNTPROC_MNT,
+ .p_name = "MOUNT",
+ },
+ [MOUNTPROC_UMNT] = {
+ .p_proc = MOUNTPROC_UMNT,
+ .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC_UMNT,
+ .p_name = "UMOUNT",
+ },
+};
+
+static struct rpc_procinfo mnt3_procedures[] = {
+ [MOUNTPROC3_MNT] = {
+ .p_proc = MOUNTPROC3_MNT,
+ .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
+ .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_replen = MNT_dec_mountres3_sz,
+ .p_statidx = MOUNTPROC3_MNT,
+ .p_name = "MOUNT",
+ },
+ [MOUNTPROC3_UMNT] = {
+ .p_proc = MOUNTPROC3_UMNT,
+ .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC3_UMNT,
+ .p_name = "UMOUNT",
+ },
+};
+
+
+static struct rpc_version mnt_version1 = {
+ .number = 1,
+ .nrprocs = ARRAY_SIZE(mnt_procedures),
+ .procs = mnt_procedures,
+};
+
+static struct rpc_version mnt_version3 = {
+ .number = 3,
+ .nrprocs = ARRAY_SIZE(mnt3_procedures),
+ .procs = mnt3_procedures,
+};
+
+/*
+ * Version table for mnt_program.  Each entry sits at the index equal
+ * to its .number; the slots for versions 0 and 2 are left NULL.
+ */
+static struct rpc_version *mnt_version[] = {
+ NULL,
+ &mnt_version1,
+ NULL,
+ &mnt_version3,
+};
+
+static struct rpc_stat mnt_stats;
+
+static struct rpc_program mnt_program = {
+ .name = "mount",
+ .number = NFS_MNT_PROGRAM,
+ .nrvers = ARRAY_SIZE(mnt_version),
+ .version = mnt_version,
+ .stats = &mnt_stats,
+};
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 00000000000..8102391bb37
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,371 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/dcache.h>
+#include <linux/gfp.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/sunrpc/gss_api.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+static void nfs_expire_automounts(struct work_struct *work);
+
+static LIST_HEAD(nfs_automount_list);
+static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
+static struct vfsmount *nfs_do_submount(struct dentry *dentry,
+ struct nfs_fh *fh,
+ struct nfs_fattr *fattr,
+ rpc_authflavor_t authflavor);
+
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @p - used to return pointer to the end of devname part of path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Helper function for constructing the server pathname
+ * by arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition
+ * and in generating /proc/mounts and friends.
+ *
+ * The path is built backwards from the end of @buffer.  Returns a
+ * pointer to its first character inside @buffer, or
+ * ERR_PTR(-ENAMETOOLONG) if it does not fit.
+ */
+char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	struct dentry *const start = dentry;
+	const ssize_t size = buflen;
+	char *end;
+	int namelen;
+	unsigned seq;
+	const char *base;
+
+rename_retry:
+	/*
+	 * A retry must start over from the caller's dentry with the whole
+	 * buffer available; the previous pass has moved both.
+	 */
+	dentry = start;
+	buflen = size;
+	end = buffer+buflen;
+	*--end = '\0';
+	buflen--;
+
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
+	/* Walk up to the root, prepending "/<name>" for every component */
+	while (1) {
+		spin_lock(&dentry->d_lock);
+		if (IS_ROOT(dentry))
+			break;
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong_unlock;
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		spin_unlock(&dentry->d_lock);
+		dentry = dentry->d_parent;
+	}
+	/* The loop exits with the root dentry's d_lock still held */
+	if (read_seqretry(&rename_lock, seq)) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		goto rename_retry;
+	}
+	if (*end != '/') {
+		if (--buflen < 0) {
+			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
+			goto Elong;
+		}
+		*--end = '/';
+	}
+	*p = end;
+	base = dentry->d_fsdata;
+	if (!base) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		WARN_ON(1);
+		return end;
+	}
+	namelen = strlen(base);
+	/* Strip off excess slashes in base string */
+	while (namelen > 0 && base[namelen - 1] == '/')
+		namelen--;
+	buflen -= namelen;
+	if (buflen < 0) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		goto Elong;
+	}
+	end -= namelen;
+	memcpy(end, base, namelen);
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
+	return end;
+Elong_unlock:
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
+	/* The overflow may be the result of a concurrent rename; try again */
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+#ifdef CONFIG_NFS_V4
+/*
+ * Pick a security flavor from a SECINFO result.
+ *
+ * The list is scanned in order and the first usable entry wins:
+ * AUTH_NULL and AUTH_UNIX are returned as they are, an RPCSEC_GSS
+ * entry is turned into a pseudoflavor when its mechanism OID is known
+ * locally.  RPC_AUTH_UNIX is returned when nothing is usable.
+ */
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+{
+	struct gss_api_mech *mech;
+	struct xdr_netobj oid;
+	rpc_authflavor_t pseudoflavor;
+	int i;
+
+	for (i = 0; i < flavors->num_flavors; i++) {
+		struct nfs4_secinfo_flavor *entry = &flavors->flavors[i];
+
+		if (entry->flavor == RPC_AUTH_NULL ||
+		    entry->flavor == RPC_AUTH_UNIX)
+			return entry->flavor;
+		if (entry->flavor != RPC_AUTH_GSS)
+			continue;
+
+		oid.len = entry->gss.sec_oid4.len;
+		oid.data = entry->gss.sec_oid4.data;
+		mech = gss_mech_get_by_OID(&oid);
+		if (!mech)
+			continue;	/* unknown mechanism, try the next entry */
+		pseudoflavor = gss_svc_to_pseudoflavor(mech, entry->gss.service);
+		gss_mech_put(mech);
+		return pseudoflavor;
+	}
+
+	return RPC_AUTH_UNIX;
+}
+
+/*
+ * Ask the server which security flavors it accepts for @dentry's name
+ * inside @parent, and store the chosen one in *@flavor.
+ *
+ * Returns -EPERM when the protocol has no secinfo operation, -ENOMEM
+ * when the scratch page cannot be allocated, otherwise the result of
+ * the secinfo call.  *@flavor is only written when that call did not
+ * fail.
+ */
+static int nfs_negotiate_security(const struct dentry *parent,
+				  const struct dentry *dentry,
+				  rpc_authflavor_t *flavor)
+{
+	struct page *page;
+	struct nfs4_secinfo_flavors *flavors;
+	int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
+	int ret = -EPERM;
+
+	secinfo = NFS_PROTO(parent->d_inode)->secinfo;
+	if (secinfo != NULL) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		flavors = page_address(page);
+		ret = secinfo(parent->d_inode, &dentry->d_name, flavors);
+		/*
+		 * The page is not zeroed, so after a failed call it does
+		 * not hold a flavor list that is safe to walk.
+		 */
+		if (ret >= 0)
+			*flavor = nfs_find_best_sec(flavors);
+		put_page(page);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Retry a lookup of path->dentry's name in @parent using a security
+ * flavor negotiated with the server.
+ *
+ * The lookup runs on a temporary clone of server->client that is shut
+ * down before returning.  On success @fh and @fattr are filled in by
+ * the lookup and *@flavor holds the negotiated flavor.  Returns zero
+ * or a negative errno; -EIO means the auth handle for the negotiated
+ * flavor could not be created.
+ */
+static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent,
+			       struct dentry *dentry, struct path *path,
+			       struct nfs_fh *fh, struct nfs_fattr *fattr,
+			       rpc_authflavor_t *flavor)
+{
+	struct rpc_clnt *clone;
+	struct rpc_auth *auth;
+	int err;
+
+	err = nfs_negotiate_security(parent, path->dentry, flavor);
+	if (err < 0)
+		goto out;
+	clone = rpc_clone_client(server->client);
+	if (IS_ERR(clone)) {
+		/* nothing to shut down if the clone was never created */
+		err = PTR_ERR(clone);
+		goto out;
+	}
+	auth = rpcauth_create(*flavor, clone);
+	/* rpcauth_create() reports failure as an ERR_PTR, not as NULL */
+	if (IS_ERR_OR_NULL(auth)) {
+		err = -EIO;
+		goto out_shutdown;
+	}
+	err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode,
+						  &path->dentry->d_name,
+						  fh, fattr);
+out_shutdown:
+	rpc_shutdown_client(clone);
+out:
+	return err;
+}
+#else /* CONFIG_NFS_V4 */
+/*
+ * Stub used when CONFIG_NFS_V4 is not set: there is no security
+ * negotiation to fall back on, so the lookup always fails with -EPERM.
+ */
+static inline int nfs_lookup_with_sec(struct nfs_server *server,
+ struct dentry *parent, struct dentry *dentry,
+ struct path *path, struct nfs_fh *fh,
+ struct nfs_fattr *fattr,
+ rpc_authflavor_t *flavor)
+{
+ return -EPERM;
+}
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+struct vfsmount *nfs_d_automount(struct path *path)
+{
+	struct vfsmount *mnt;
+	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
+	struct dentry *parent;
+	struct nfs_fh *fh = NULL;
+	struct nfs_fattr *fattr = NULL;
+	int err;
+	rpc_authflavor_t flavor = RPC_AUTH_UNIX;
+
+	dprintk("--> nfs_d_automount()\n");
+
+	/* A root dentry has no parent directory to look it up in */
+	mnt = ERR_PTR(-ESTALE);
+	if (IS_ROOT(path->dentry))
+		goto out_nofree;
+
+	mnt = ERR_PTR(-ENOMEM);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fh == NULL || fattr == NULL)
+		goto out;
+
+	dprintk("%s: enter\n", __func__);
+
+	/* Look it up again to get its attributes */
+	parent = dget_parent(path->dentry);
+	err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
+						  &path->dentry->d_name,
+						  fh, fattr);
+	/* Denied with the current flavor: retry with a negotiated one */
+	if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL)
+		err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor);
+	dput(parent);
+	if (err != 0) {
+		mnt = ERR_PTR(err);
+		goto out;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(path->dentry);
+	else
+		mnt = nfs_do_submount(path->dentry, fh, fattr, flavor);
+	if (IS_ERR(mnt))
+		goto out;
+
+	dprintk("%s: done, success\n", __func__);
+	mntget(mnt); /* prevent immediate expiration */
+	mnt_set_expiry(mnt, &nfs_automount_list);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+
+out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out_nofree:
+	/* Name this function in the exit trace so it pairs with the entry trace */
+	dprintk("<-- nfs_d_automount() = %p\n", mnt);
+	return mnt;
+}
+
+const struct inode_operations nfs_mountpoint_inode_operations = {
+ .getattr = nfs_getattr,
+};
+
+const struct inode_operations nfs_referral_inode_operations = {
+};
+
+/*
+ * Delayed-work handler: run an expiry pass over the automounted
+ * submounts and re-arm the work while any remain on the list.
+ */
+static void nfs_expire_automounts(struct work_struct *work)
+{
+	mark_mounts_for_expiry(&nfs_automount_list);
+	if (list_empty(&nfs_automount_list))
+		return;
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+
+/*
+ * Cancel the pending expiry work, but only when no automounted
+ * submounts are left on nfs_automount_list.
+ */
+void nfs_release_automount_timer(void)
+{
+ if (list_empty(&nfs_automount_list))
+ cancel_delayed_work(&nfs_automount_task);
+}
+
+/*
+ * Clone a mountpoint of the appropriate type
+ *
+ * With CONFIG_NFS_V4 the xdev filesystem type is selected by the
+ * protocol version of @server's client; an unexpected version yields
+ * ERR_PTR(-EINVAL).  Without it the NFSv2/v3 type is always used.
+ */
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
+					   const char *devname,
+					   struct nfs_clone_mount *mountdata)
+{
+#ifdef CONFIG_NFS_V4
+	switch (server->nfs_client->rpc_ops->version) {
+	case 2:
+	case 3:
+		return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+	case 4:
+		return vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata);
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+#else
+	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+#endif
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @dentry - dentry to mount on; its device name is built by nfs_devname()
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ * @authflavor - security flavor to use when performing the mount
+ *
+ * Returns the new vfsmount, ERR_PTR(-ENOMEM) if the scratch page for
+ * the device name cannot be allocated, or the error pointer produced
+ * by nfs_devname() or nfs_do_clone_mount().
+ */
+static struct vfsmount *nfs_do_submount(struct dentry *dentry,
+					struct nfs_fh *fh,
+					struct nfs_fattr *fattr,
+					rpc_authflavor_t authflavor)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = dentry->d_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+		.authflavor = authflavor,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("--> nfs_do_submount()\n");
+
+	dprintk("%s: submounting on %s/%s\n", __func__,
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	if (page != NULL) {
+		devname = nfs_devname(dentry, page, PAGE_SIZE);
+		if (IS_ERR(devname))
+			/* hand the encoded error back as the mount result */
+			mnt = (struct vfsmount *)devname;
+		else
+			mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb),
+						 devname, &mountdata);
+		free_page((unsigned long)page);
+	}
+	dprintk("%s: done\n", __func__);
+
+	dprintk("<-- nfs_do_submount() = %p\n", mnt);
+	return mnt;
+}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
new file mode 100644
index 00000000000..792cb13a430
--- /dev/null
+++ b/fs/nfs/nfs2xdr.c
@@ -0,0 +1,1157 @@
+/*
+ * linux/fs/nfs/nfs2xdr.c
+ *
+ * XDR functions to encode/decode NFS RPC arguments and results.
+ *
+ * Copyright (C) 1992, 1993, 1994 Rick Sladkey
+ * Copyright (C) 1996 Olaf Kirch
+ * 04 Aug 1998 Ion Badulescu <ionut@cs.columbia.edu>
+ * FIFO's need special handling in NFSv2
+ */
+
+#include <linux/param.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs2.h>
+#include <linux/nfs_fs.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_XDR
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+/*
+ * Declare the space requirements for NFS arguments and replies as
+ * number of 32bit-words
+ */
+#define NFS_fhandle_sz (8)
+#define NFS_sattr_sz (8)
+#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2))
+#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2))
+#define NFS_fattr_sz (17)
+#define NFS_info_sz (5)
+#define NFS_entry_sz (NFS_filename_sz+3)
+
+#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz)
+#define NFS_readlinkargs_sz (NFS_fhandle_sz)
+#define NFS_readargs_sz (NFS_fhandle_sz+3)
+#define NFS_writeargs_sz (NFS_fhandle_sz+4)
+#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz)
+#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz)
+#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz)
+#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz)
+#define NFS_readdirargs_sz (NFS_fhandle_sz+2)
+
+#define NFS_attrstat_sz (1+NFS_fattr_sz)
+#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz)
+#define NFS_readlinkres_sz (2)
+#define NFS_readres_sz (1+NFS_fattr_sz+1)
+#define NFS_writeres_sz (NFS_attrstat_sz)
+#define NFS_stat_sz (1)
+#define NFS_readdirres_sz (1)
+#define NFS_statfsres_sz (1+NFS_info_sz)
+
+
+/*
+ * While encoding arguments, set up the reply buffer in advance to
+ * receive reply data directly into the page cache.
+ *
+ * @bufsize is the size of the inline part of the reply in 32-bit
+ * words; the RPC reply header and the auth slack are added to it
+ * before the total is converted to bytes.
+ */
+static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages,
+				 unsigned int base, unsigned int len,
+				 unsigned int bufsize)
+{
+	unsigned int hdr_words;
+
+	hdr_words = RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_rslack + bufsize;
+	xdr_inline_pages(&req->rq_rcv_buf, hdr_words << 2, pages, base, len);
+}
+
+/*
+ * Handle decode buffer overflows out-of-line.
+ *
+ * Emits a debug message that names the decoder @func which ran out of
+ * data and prints the distance between xdr->p and xdr->end.
+ */
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+ dprintk("NFS: %s prematurely hit the end of our receive buffer. "
+ "Remaining buffer length is %tu words.\n",
+ func, xdr->end - xdr->p);
+}
+
+
+/*
+ * Encode/decode NFSv2 basic data types
+ *
+ * Basic NFSv2 data types are defined in section 2.3 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ *
+ * Not all basic data types have their own encoding and decoding
+ * functions. For run-time efficiency, some data types are encoded
+ * or decoded inline.
+ */
+
+/*
+ * typedef opaque nfsdata<>;
+ *
+ * Decodes the byte count of a READ reply and positions the stream on
+ * the page data.  A count larger than what was actually received is
+ * clamped to the received length.  Returns the (possibly clamped)
+ * count, or -EIO if the count word is missing.
+ */
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+{
+	u32 recvd, count;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	count = be32_to_cpup(p);
+
+	/* bytes received beyond the part of the head decoded so far */
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(count > recvd)) {
+		dprintk("NFS: server cheating in read result: "
+			"count %u > recvd %u\n", count, recvd);
+		count = recvd;
+	}
+
+	xdr_read_pages(xdr, count);
+	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. */
+	result->count = count;
+	return count;
+}
+
+/*
+ * enum stat {
+ * NFS_OK = 0,
+ * NFSERR_PERM = 1,
+ * NFSERR_NOENT = 2,
+ * NFSERR_IO = 5,
+ * NFSERR_NXIO = 6,
+ * NFSERR_ACCES = 13,
+ * NFSERR_EXIST = 17,
+ * NFSERR_NODEV = 19,
+ * NFSERR_NOTDIR = 20,
+ * NFSERR_ISDIR = 21,
+ * NFSERR_FBIG = 27,
+ * NFSERR_NOSPC = 28,
+ * NFSERR_ROFS = 30,
+ * NFSERR_NAMETOOLONG = 63,
+ * NFSERR_NOTEMPTY = 66,
+ * NFSERR_DQUOT = 69,
+ * NFSERR_STALE = 70,
+ * NFSERR_WFLUSH = 99
+ * };
+ */
+/* Read one status word into *status; returns -EIO if the reply is too short. */
+static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)
+{
+	__be32 *p = xdr_inline_decode(xdr, 4);
+
+	if (unlikely(p == NULL)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	*status = be32_to_cpup(p);
+	return 0;
+}
+
+/*
+ * 2.3.2. ftype
+ *
+ * enum ftype {
+ * NFNON = 0,
+ * NFREG = 1,
+ * NFDIR = 2,
+ * NFBLK = 3,
+ * NFCHR = 4,
+ * NFLNK = 5
+ * };
+ *
+ */
+/*
+ * Decode one file type word.  Values above NF2FIFO are stored as
+ * NFBAD.  Returns the position following the word.
+ */
+static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)
+{
+	u32 raw = be32_to_cpup(p);
+
+	if (unlikely(raw > NF2FIFO))
+		raw = NFBAD;
+	*type = raw;
+	return p + 1;
+}
+
+/*
+ * 2.3.3. fhandle
+ *
+ * typedef opaque fhandle[FHSIZE];
+ */
+/* Append the NFS2_FHSIZE bytes of @fh; any other handle size is a BUG. */
+static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
+{
+	__be32 *dst;
+
+	BUG_ON(fh->size != NFS2_FHSIZE);
+	dst = xdr_reserve_space(xdr, NFS2_FHSIZE);
+	memcpy(dst, fh->data, NFS2_FHSIZE);
+}
+
+/* Copy NFS2_FHSIZE bytes into @fh; returns -EIO if the reply is too short. */
+static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+	__be32 *src = xdr_inline_decode(xdr, NFS2_FHSIZE);
+
+	if (unlikely(src == NULL)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	fh->size = NFS2_FHSIZE;
+	memcpy(fh->data, src, NFS2_FHSIZE);
+	return 0;
+}
+
+/*
+ * 2.3.4. timeval
+ *
+ * struct timeval {
+ * unsigned int seconds;
+ * unsigned int useconds;
+ * };
+ */
+/*
+ * Encode @timep as seconds and microseconds.  Returns the position
+ * following the two words written.
+ */
+static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)
+{
+	p[0] = cpu_to_be32(timep->tv_sec);
+	/* only divide when there is a sub-second part to convert */
+	p[1] = cpu_to_be32(timep->tv_nsec != 0 ?
+			   timep->tv_nsec / NSEC_PER_USEC : 0);
+	return p + 2;
+}
+
+/*
+ * Passing the invalid value useconds=1000000 is a Sun convention for
+ * "set to current server time". It's needed to make permissions checks
+ * for the "touch" program across v2 mounts to Solaris and Irix servers
+ * work correctly. See description of sattr in section 6.1 of "NFS
+ * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.
+ */
+/*
+ * Encode the seconds of @timep followed by the out-of-range
+ * microseconds value 1000000.
+ */
+static __be32 *xdr_encode_current_server_time(__be32 *p,
+					      const struct timespec *timep)
+{
+	p[0] = cpu_to_be32(timep->tv_sec);
+	p[1] = cpu_to_be32(1000000);
+	return p + 2;
+}
+
+/*
+ * Decode a seconds/microseconds pair into @timep, scaling the
+ * microseconds to nanoseconds.  Returns the position following the
+ * two words read.
+ */
+static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)
+{
+	timep->tv_sec = be32_to_cpup(&p[0]);
+	timep->tv_nsec = be32_to_cpup(&p[1]) * NSEC_PER_USEC;
+	return p + 2;
+}
+
+/*
+ * 2.3.5. fattr
+ *
+ * struct fattr {
+ * ftype type;
+ * unsigned int mode;
+ * unsigned int nlink;
+ * unsigned int uid;
+ * unsigned int gid;
+ * unsigned int size;
+ * unsigned int blocksize;
+ * unsigned int rdev;
+ * unsigned int blocks;
+ * unsigned int fsid;
+ * unsigned int fileid;
+ * timeval atime;
+ * timeval mtime;
+ * timeval ctime;
+ * };
+ *
+ */
+/*
+ * Decode a complete fattr into @fattr and mark the NFSv2 attribute
+ * set as valid.  Returns -EIO if the reply is too short.
+ */
+static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
+{
+ u32 rdev, type;
+ __be32 *p;
+
+ /* NFS_fattr_sz counts 32-bit words; shift to get bytes */
+ p = xdr_inline_decode(xdr, NFS_fattr_sz << 2);
+ if (unlikely(p == NULL))
+ goto out_overflow;
+
+ fattr->valid |= NFS_ATTR_FATTR_V2;
+
+ p = xdr_decode_ftype(p, &type);
+
+ fattr->mode = be32_to_cpup(p++);
+ fattr->nlink = be32_to_cpup(p++);
+ fattr->uid = be32_to_cpup(p++);
+ fattr->gid = be32_to_cpup(p++);
+ fattr->size = be32_to_cpup(p++);
+ fattr->du.nfs2.blocksize = be32_to_cpup(p++);
+
+ rdev = be32_to_cpup(p++);
+ fattr->rdev = new_decode_dev(rdev);
+ /*
+ * A character device whose rdev is NFS2_FIFO_DEV is turned into
+ * a FIFO: the mode's type bits become S_IFIFO and rdev is cleared.
+ */
+ if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) {
+ fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
+ fattr->rdev = 0;
+ }
+
+ fattr->du.nfs2.blocks = be32_to_cpup(p++);
+ /* the single fsid word on the wire goes into major; minor is zeroed */
+ fattr->fsid.major = be32_to_cpup(p++);
+ fattr->fsid.minor = 0;
+ fattr->fileid = be32_to_cpup(p++);
+
+ p = xdr_decode_time(p, &fattr->atime);
+ p = xdr_decode_time(p, &fattr->mtime);
+ xdr_decode_time(p, &fattr->ctime);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+/*
+ * 2.3.6. sattr
+ *
+ * struct sattr {
+ * unsigned int mode;
+ * unsigned int uid;
+ * unsigned int gid;
+ * unsigned int size;
+ * timeval atime;
+ * timeval mtime;
+ * };
+ */
+
+#define NFS2_SATTR_NOT_SET (0xffffffff)
+
+/* Encode a timeval whose two words are both NFS2_SATTR_NOT_SET. */
+static __be32 *xdr_time_not_set(__be32 *p)
+{
+	p[0] = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	p[1] = cpu_to_be32(NFS2_SATTR_NOT_SET);
+	return p + 2;
+}
+
+/*
+ * Encode the settable attributes selected by attr->ia_valid.  Fields
+ * that are not selected are sent as NFS2_SATTR_NOT_SET.  For each
+ * timestamp an explicit value (ATTR_*TIME_SET) takes precedence over
+ * the "current server time" form (ATTR_*TIME).
+ */
+static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
+{
+	const unsigned int valid = attr->ia_valid;
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, NFS_sattr_sz << 2);
+
+	*p++ = cpu_to_be32((valid & ATTR_MODE) ?
+			   attr->ia_mode : NFS2_SATTR_NOT_SET);
+	*p++ = cpu_to_be32((valid & ATTR_UID) ?
+			   attr->ia_uid : NFS2_SATTR_NOT_SET);
+	*p++ = cpu_to_be32((valid & ATTR_GID) ?
+			   attr->ia_gid : NFS2_SATTR_NOT_SET);
+	/* NFSv2 sizes are 32 bits wide; the cast truncates on purpose */
+	*p++ = cpu_to_be32((valid & ATTR_SIZE) ?
+			   (u32)attr->ia_size : NFS2_SATTR_NOT_SET);
+
+	if (valid & ATTR_ATIME_SET)
+		p = xdr_encode_time(p, &attr->ia_atime);
+	else if (valid & ATTR_ATIME)
+		p = xdr_encode_current_server_time(p, &attr->ia_atime);
+	else
+		p = xdr_time_not_set(p);
+
+	if (valid & ATTR_MTIME_SET)
+		xdr_encode_time(p, &attr->ia_mtime);
+	else if (valid & ATTR_MTIME)
+		xdr_encode_current_server_time(p, &attr->ia_mtime);
+	else
+		xdr_time_not_set(p);
+}
+
+/*
+ * 2.3.7. filename
+ *
+ * typedef string filename<MAXNAMLEN>;
+ */
+/*
+ * Append @name as a length-prefixed opaque.  Names longer than
+ * NFS2_MAXNAMLEN trigger BUG_ON().
+ */
+static void encode_filename(struct xdr_stream *xdr,
+			    const char *name, u32 length)
+{
+	__be32 *dst;
+
+	BUG_ON(length > NFS2_MAXNAMLEN);
+	/* four bytes for the length word, then the name itself */
+	dst = xdr_reserve_space(xdr, 4 + length);
+	xdr_encode_opaque(dst, name, length);
+}
+
+/*
+ * Decode a filename in place: on success *name points into the XDR
+ * buffer (no copy is made) and *length holds its size.
+ *
+ * The limit is NFS2_MAXNAMLEN, the same constant encode_filename()
+ * enforces; this NFSv2 decoder previously tested the NFSv3 constant.
+ * NOTE(review): both constants are expected to have the same value,
+ * so this is a consistency fix - confirm against the headers.
+ *
+ * Returns 0, -ENAMETOOLONG if the advertised length exceeds the limit,
+ * or -EIO if the buffer is too short.
+ */
+static int decode_filename_inline(struct xdr_stream *xdr,
+				  const char **name, u32 *length)
+{
+	__be32 *p;
+	u32 count;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	count = be32_to_cpup(p);
+	if (count > NFS2_MAXNAMLEN)
+		goto out_nametoolong;
+	p = xdr_inline_decode(xdr, count);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	*name = (const char *)p;
+	*length = count;
+	return 0;
+out_nametoolong:
+	dprintk("NFS: returned filename too long: %u\n", count);
+	return -ENAMETOOLONG;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * 2.3.8. path
+ *
+ * typedef string path<MAXPATHLEN>;
+ */
+/*
+ * Emit the 4-byte path length inline and attach the path bytes from
+ * @pages.  A path longer than NFS2_MAXPATHLEN trips BUG_ON().
+ */
+static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
+{
+	__be32 *lenp;
+
+	BUG_ON(length > NFS2_MAXPATHLEN);
+
+	lenp = xdr_reserve_space(xdr, 4);
+	*lenp = cpu_to_be32(length);
+	xdr_write_pages(xdr, pages, 0, length);
+}
+
+/*
+ * Validate the length word of a returned path, then hand the page data
+ * to xdr_read_pages() and terminate the string in place.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the advertised length does
+ * not fit, or -EIO on a short or inconsistent reply.
+ */
+static int decode_path(struct xdr_stream *xdr)
+{
+	u32 length, recvd;
+	size_t hdrlen;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+
+	length = be32_to_cpup(p);
+	if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) {
+		dprintk("NFS: returned pathname too long: %u\n", length);
+		return -ENAMETOOLONG;
+	}
+
+	/* bytes in the reply beyond what has been consumed so far */
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(length > recvd)) {
+		dprintk("NFS: server cheating in pathname result: "
+			"length %u > received %u\n", length, recvd);
+		return -EIO;
+	}
+
+	xdr_read_pages(xdr, length);
+	xdr_terminate_string(xdr->buf, length);
+	return 0;
+}
+
+/*
+ * 2.3.9. attrstat
+ *
+ * union attrstat switch (stat status) {
+ * case NFS_OK:
+ * fattr attributes;
+ * default:
+ * void;
+ * };
+ */
+/*
+ * Decode the status word; on NFS_OK decode the attributes that follow,
+ * otherwise translate the status to a local errno.
+ */
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_fattr(xdr, result);
+}
+
+/*
+ * 2.3.10. diropargs
+ *
+ * struct diropargs {
+ * fhandle dir;
+ * filename name;
+ * };
+ */
+/* Encode the directory file handle, then the entry name within it. */
+static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh,
+ const char *name, u32 length)
+{
+ encode_fhandle(xdr, fh);
+ encode_filename(xdr, name, length);
+}
+
+/*
+ * 2.3.11. diropres
+ *
+ * union diropres switch (stat status) {
+ * case NFS_OK:
+ * struct {
+ * fhandle file;
+ * fattr attributes;
+ * } diropok;
+ * default:
+ * void;
+ * };
+ */
+/*
+ * Decode the diropok body: file handle first, then attributes.
+ * Stops at the first decoder that reports an error.
+ */
+static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+	int error = decode_fhandle(xdr, result->fh);
+
+	if (unlikely(error))
+		return error;
+	return decode_fattr(xdr, result->fattr);
+}
+
+/*
+ * Decode the status word; on NFS_OK decode the diropok body,
+ * otherwise translate the status to a local errno.
+ */
+static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_diropok(xdr, result);
+}
+
+
+/*
+ * NFSv2 XDR encode functions
+ *
+ * NFSv2 argument types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+/*
+ * Argument encoder whose only argument is a file handle.  The PROC()
+ * table in this file uses it for GETATTR and STATFS.  @req is unused.
+ */
+static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_fh *fh)
+{
+ encode_fhandle(xdr, fh);
+}
+
+/*
+ * 2.2.3. sattrargs
+ *
+ * struct sattrargs {
+ * fhandle file;
+ * sattr attributes;
+ * };
+ */
+/* Encode sattrargs: the target file handle, then the sattr to apply. */
+static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_sattrargs *args)
+{
+ encode_fhandle(xdr, args->fh);
+ encode_sattr(xdr, args->sattr);
+}
+
+/*
+ * Argument encoder for calls that take only a (directory, name) pair.
+ * The PROC() table in this file uses it for LOOKUP and RMDIR.
+ */
+static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_diropargs *args)
+{
+ encode_diropargs(xdr, args->fh, args->name, args->len);
+}
+
+/*
+ * Encode the READLINK argument (a file handle) and pass the caller's
+ * pages to prepare_reply_buffer().
+ * NOTE(review): prepare_reply_buffer() is defined outside this view;
+ * presumably it arranges for the reply payload to land in those pages.
+ */
+static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_readlinkargs *args)
+{
+ encode_fhandle(xdr, args->fh);
+ prepare_reply_buffer(req, args->pages, args->pgbase,
+ args->pglen, NFS_readlinkres_sz);
+}
+
+/*
+ * 2.2.7. readargs
+ *
+ * struct readargs {
+ * fhandle file;
+ * unsigned offset;
+ * unsigned count;
+ * unsigned totalcount;
+ * };
+ */
+/*
+ * Encode readargs.  offset and count are narrowed to 32 bits for the
+ * wire; the totalcount slot is filled with count as well.
+ */
+static void encode_readargs(struct xdr_stream *xdr,
+			    const struct nfs_readargs *args)
+{
+	const u32 off32 = args->offset;
+	const u32 cnt32 = args->count;
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4);
+	p[0] = cpu_to_be32(off32);
+	p[1] = cpu_to_be32(cnt32);
+	p[2] = cpu_to_be32(cnt32);	/* totalcount */
+}
+
+/*
+ * Encode the READ arguments, pass the caller's pages to
+ * prepare_reply_buffer() and flag the receive buffer with XDRBUF_READ.
+ * NOTE(review): prepare_reply_buffer() is defined outside this view.
+ */
+static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_readargs *args)
+{
+ encode_readargs(xdr, args);
+ prepare_reply_buffer(req, args->pages, args->pgbase,
+ args->count, NFS_readres_sz);
+ req->rq_rcv_buf.flags |= XDRBUF_READ;
+}
+
+/*
+ * 2.2.9. writeargs
+ *
+ * struct writeargs {
+ * fhandle file;
+ * unsigned beginoffset;
+ * unsigned offset;
+ * unsigned totalcount;
+ * nfsdata data;
+ * };
+ */
+/*
+ * Encode writeargs.  offset and count are narrowed to 32 bits; the
+ * beginoffset slot repeats offset and the totalcount slot repeats
+ * count.  The data is attached from args->pages.
+ */
+static void encode_writeargs(struct xdr_stream *xdr,
+			     const struct nfs_writeargs *args)
+{
+	const u32 off32 = args->offset;
+	const u32 cnt32 = args->count;
+	__be32 *p;
+
+	encode_fhandle(xdr, args->fh);
+
+	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4);
+	p[0] = cpu_to_be32(off32);	/* beginoffset */
+	p[1] = cpu_to_be32(off32);	/* offset */
+	p[2] = cpu_to_be32(cnt32);	/* totalcount */
+
+	/* nfsdata */
+	p[3] = cpu_to_be32(cnt32);
+	xdr_write_pages(xdr, args->pages, args->pgbase, cnt32);
+}
+
+/* Encode the WRITE arguments and flag the send buffer with XDRBUF_WRITE. */
+static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_writeargs *args)
+{
+ encode_writeargs(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
+}
+
+/*
+ * 2.2.10. createargs
+ *
+ * struct createargs {
+ * diropargs where;
+ * sattr attributes;
+ * };
+ */
+/*
+ * Encode createargs: where to create (directory + name), then the
+ * initial attributes.  The PROC() table in this file uses it for both
+ * CREATE and MKDIR.
+ */
+static void nfs2_xdr_enc_createargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_createargs *args)
+{
+ encode_diropargs(xdr, args->fh, args->name, args->len);
+ encode_sattr(xdr, args->sattr);
+}
+
+/*
+ * Encode the REMOVE arguments as a diropargs; the name and its length
+ * are taken from the args->name member.
+ */
+static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_removeargs *args)
+{
+ encode_diropargs(xdr, args->fh, args->name.name, args->name.len);
+}
+
+/*
+ * 2.2.12. renameargs
+ *
+ * struct renameargs {
+ * diropargs from;
+ * diropargs to;
+ * };
+ */
+/* Encode renameargs: the source diropargs, then the destination one. */
+static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req,
+				    struct xdr_stream *xdr,
+				    const struct nfs_renameargs *args)
+{
+	encode_diropargs(xdr, args->old_dir,
+			 args->old_name->name, args->old_name->len);
+	encode_diropargs(xdr, args->new_dir,
+			 args->new_name->name, args->new_name->len);
+}
+
+/*
+ * 2.2.13. linkargs
+ *
+ * struct linkargs {
+ * fhandle from;
+ * diropargs to;
+ * };
+ */
+/*
+ * Encode linkargs: the handle of the existing file, then the
+ * (directory, name) pair for the new link.
+ */
+static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_linkargs *args)
+{
+ encode_fhandle(xdr, args->fromfh);
+ encode_diropargs(xdr, args->tofh, args->toname, args->tolen);
+}
+
+/*
+ * 2.2.14. symlinkargs
+ *
+ * struct symlinkargs {
+ * diropargs from;
+ * path to;
+ * sattr attributes;
+ * };
+ */
+/*
+ * Encode symlinkargs in wire order: where the link is created, the
+ * target path (held in args->pages), then the attributes.
+ */
+static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_symlinkargs *args)
+{
+ encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen);
+ encode_path(xdr, args->pages, args->pathlen);
+ encode_sattr(xdr, args->sattr);
+}
+
+/*
+ * 2.2.17. readdirargs
+ *
+ * struct readdirargs {
+ * fhandle dir;
+ * nfscookie cookie;
+ * unsigned count;
+ * };
+ */
+/*
+ * Encode readdirargs: directory handle, then cookie and count as two
+ * 32-bit words.
+ */
+static void encode_readdirargs(struct xdr_stream *xdr,
+			       const struct nfs_readdirargs *args)
+{
+	__be32 *words;
+
+	encode_fhandle(xdr, args->fh);
+
+	words = xdr_reserve_space(xdr, 4 + 4);
+	words[0] = cpu_to_be32(args->cookie);
+	words[1] = cpu_to_be32(args->count);
+}
+
+/*
+ * Encode the READDIR arguments and pass the caller's pages (from page
+ * offset 0) to prepare_reply_buffer().
+ * NOTE(review): prepare_reply_buffer() is defined outside this view.
+ */
+static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfs_readdirargs *args)
+{
+ encode_readdirargs(xdr, args);
+ prepare_reply_buffer(req, args->pages, 0,
+ args->count, NFS_readdirres_sz);
+}
+
+/*
+ * NFSv2 XDR decode functions
+ *
+ * NFSv2 result types are defined in section 2.2 of RFC 1094:
+ * "NFS: Network File System Protocol Specification".
+ */
+
+/*
+ * Result decoder for replies that carry only a status word.
+ * Returns 0 for NFS_OK, the decode error if the word could not be
+ * read, or the errno mapped from the NFS status.
+ */
+static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return 0;
+}
+
+/* RPC-layer entry point for attrstat replies; @req is unused. */
+static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
+ struct nfs_fattr *result)
+{
+ return decode_attrstat(xdr, result);
+}
+
+/* RPC-layer entry point for diropres replies; @req is unused. */
+static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ struct nfs_diropok *result)
+{
+ return decode_diropres(xdr, result);
+}
+
+/*
+ * 2.2.6. readlinkres
+ *
+ * union readlinkres switch (stat status) {
+ * case NFS_OK:
+ * path data;
+ * default:
+ * void;
+ * };
+ */
+/*
+ * Decode a READLINK reply: status word first, then (on NFS_OK) the
+ * path via decode_path().
+ */
+static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req,
+				    struct xdr_stream *xdr, void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_path(xdr);
+}
+
+/*
+ * 2.2.7. readres
+ *
+ * union readres switch (stat status) {
+ * case NFS_OK:
+ * fattr attributes;
+ * nfsdata data;
+ * default:
+ * void;
+ * };
+ */
+/*
+ * Decode a READ reply: status word, then (on NFS_OK) the attributes
+ * followed by the data descriptor.
+ */
+static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				struct nfs_readres *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+
+	error = decode_fattr(xdr, result->fattr);
+	if (unlikely(error))
+		return error;
+	return decode_nfsdata(xdr, result);
+}
+
+/*
+ * Decode a WRITE reply (an attrstat).  The verifier's committed level
+ * is set before decoding, so it is filled in even when decoding fails.
+ */
+static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
+ struct nfs_writeres *result)
+{
+ /* All NFSv2 writes are "file sync" writes */
+ result->verf->committed = NFS_FILE_SYNC;
+ return decode_attrstat(xdr, result->fattr);
+}
+
+/**
+ * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in
+ * the local page cache.
+ * @xdr: XDR stream where entry resides
+ * @entry: buffer to fill in with entry data
+ * @plus: boolean indicating whether this should be a readdirplus entry
+ *
+ * Returns zero if successful, otherwise a negative errno value is
+ * returned.
+ *
+ * This function is not invoked during READDIR reply decoding, but
+ * rather whenever an application invokes the getdents(2) system call
+ * on a directory already in our cache.
+ *
+ * 2.2.17. entry
+ *
+ * struct entry {
+ * unsigned fileid;
+ * filename name;
+ * nfscookie cookie;
+ * entry *nextentry;
+ * };
+ */
+int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+		       int plus)
+{
+	__be32 *p;
+	int error;
+
+	/* First word: zero means no further entry follows here. */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	if (*p == xdr_zero) {
+		/* Next word decides between "more later" and end of dir. */
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(p == NULL))
+			goto out_overflow;
+		if (*p == xdr_zero)
+			return -EAGAIN;
+		entry->eof = 1;
+		return -EBADCOOKIE;
+	}
+
+	/* fileid */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	entry->ino = be32_to_cpup(p);
+
+	/* name: points into the XDR buffer, not copied */
+	error = decode_filename_inline(xdr, &entry->name, &entry->len);
+	if (unlikely(error))
+		return error;
+
+	/*
+	 * The type (size and byte order) of nfscookie isn't defined in
+	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
+	 */
+	entry->prev_cookie = entry->cookie;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(p == NULL))
+		goto out_overflow;
+	entry->cookie = be32_to_cpup(p);
+
+	entry->d_type = DT_UNKNOWN;
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EAGAIN;
+}
+
+/*
+ * 2.2.17. readdirres
+ *
+ * union readdirres switch (stat status) {
+ * case NFS_OK:
+ * struct {
+ * entry *entries;
+ * bool eof;
+ * } readdirok;
+ * default:
+ * void;
+ * };
+ *
+ * Read the directory contents into the page cache, but don't
+ * touch them. The actual decoding is done by nfs2_decode_dirent()
+ * during subsequent nfs_readdir() calls.
+ */
+/*
+ * Set up the page data of a READDIR reply without decoding entries.
+ * If the buffer's page length exceeds what was actually received, the
+ * length is clamped to the received amount.  Returns the length used.
+ */
+static int decode_readdirok(struct xdr_stream *xdr)
+{
+	u32 recvd, pglen;
+	size_t hdrlen;
+
+	pglen = xdr->buf->page_len;
+	hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base;
+	recvd = xdr->buf->len - hdrlen;
+	if (unlikely(pglen > recvd)) {
+		dprintk("NFS: server cheating in readdir result: "
+			"pglen %u > recvd %u\n", pglen, recvd);
+		pglen = recvd;
+	}
+
+	xdr_read_pages(xdr, pglen);
+	return pglen;
+}
+
+/*
+ * Decode a READDIR reply: status word, then (on NFS_OK) the value of
+ * decode_readdirok(), which is a byte count rather than zero.
+ */
+static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req,
+				   struct xdr_stream *xdr, void *__unused)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_readdirok(xdr);
+}
+
+/*
+ * 2.2.18. statfsres
+ *
+ * union statfsres (stat status) {
+ * case NFS_OK:
+ * struct {
+ * unsigned tsize;
+ * unsigned bsize;
+ * unsigned blocks;
+ * unsigned bfree;
+ * unsigned bavail;
+ * } info;
+ * default:
+ * void;
+ * };
+ */
+/*
+ * Decode the five 32-bit words of a statfs info body into @result.
+ * Returns 0, or -EIO if the buffer is too short.
+ */
+static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, NFS_info_sz << 2);
+	if (unlikely(p == NULL)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+
+	result->tsize  = be32_to_cpup(&p[0]);
+	result->bsize  = be32_to_cpup(&p[1]);
+	result->blocks = be32_to_cpup(&p[2]);
+	result->bfree  = be32_to_cpup(&p[3]);
+	result->bavail = be32_to_cpup(&p[4]);
+	return 0;
+}
+
+/*
+ * Decode a STATFS reply: status word, then (on NFS_OK) the info body.
+ */
+static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr,
+				  struct nfs2_fsstat *result)
+{
+	enum nfs_stat status;
+	int error;
+
+	error = decode_stat(xdr, &status);
+	if (unlikely(error))
+		return error;
+	if (status != NFS_OK)
+		return nfs_stat_to_errno(status);
+	return decode_info(xdr, result);
+}
+
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ *
+ * The table is scanned linearly by nfs_stat_to_errno().  It must end
+ * with the { -1, -EIO } entry: a stat of -1 stops the scan, and that
+ * entry's errno is what an unrecognized status maps to.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS_OK, 0 },
+ { NFSERR_PERM, -EPERM },
+ { NFSERR_NOENT, -ENOENT },
+ { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_NXIO, -ENXIO },
+/* { NFSERR_EAGAIN, -EAGAIN }, */
+ { NFSERR_ACCES, -EACCES },
+ { NFSERR_EXIST, -EEXIST },
+ { NFSERR_XDEV, -EXDEV },
+ { NFSERR_NODEV, -ENODEV },
+ { NFSERR_NOTDIR, -ENOTDIR },
+ { NFSERR_ISDIR, -EISDIR },
+ { NFSERR_INVAL, -EINVAL },
+ { NFSERR_FBIG, -EFBIG },
+ { NFSERR_NOSPC, -ENOSPC },
+ { NFSERR_ROFS, -EROFS },
+ { NFSERR_MLINK, -EMLINK },
+ { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFSERR_NOTEMPTY, -ENOTEMPTY },
+ { NFSERR_DQUOT, -EDQUOT },
+ { NFSERR_STALE, -ESTALE },
+ { NFSERR_REMOTE, -EREMOTE },
+#ifdef EWFLUSH
+ { NFSERR_WFLUSH, -EWFLUSH },
+#endif
+ { NFSERR_BADHANDLE, -EBADHANDLE },
+ { NFSERR_NOT_SYNC, -ENOTSYNC },
+ { NFSERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFSERR_NOTSUPP, -ENOTSUPP },
+ { NFSERR_TOOSMALL, -ETOOSMALL },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
+ { NFSERR_BADTYPE, -EBADTYPE },
+ { NFSERR_JUKEBOX, -EJUKEBOX },
+ { -1, -EIO } /* sentinel: end of table, fallback errno */
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized. This function is used jointly by NFSv2 and NFSv3.
+ */
+int nfs_stat_to_errno(enum nfs_stat status)
+{
+	int idx = 0;
+
+	/* Linear scan; the table is closed by an entry with stat == -1. */
+	while (nfs_errtbl[idx].stat != -1) {
+		if (nfs_errtbl[idx].stat == (int)status)
+			return nfs_errtbl[idx].errno;
+		idx++;
+	}
+
+	/* idx now names the closing entry, which holds the fallback. */
+	dprintk("NFS: Unrecognized nfs status value: %u\n", status);
+	return nfs_errtbl[idx].errno;
+}
+
+/*
+ * PROC() builds one rpc_procinfo entry, indexed by NFSPROC_<proc>.
+ * Token pasting derives the encoder (nfs2_xdr_enc_<argtype>), the
+ * decoder (nfs2_xdr_dec_<restype>) and the NFS_<argtype>_sz /
+ * NFS_<restype>_sz size constants, so each argtype/restype named below
+ * must have all of these defined.
+ */
+#define PROC(proc, argtype, restype, timer) \
+[NFSPROC_##proc] = { \
+ .p_proc = NFSPROC_##proc, \
+ .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \
+ .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \
+ .p_arglen = NFS_##argtype##_sz, \
+ .p_replen = NFS_##restype##_sz, \
+ .p_timer = timer, \
+ .p_statidx = NFSPROC_##proc, \
+ .p_name = #proc, \
+ }
+/* NFSv2 procedure table; entries are placed by designated index. */
+struct rpc_procinfo nfs_procedures[] = {
+ PROC(GETATTR, fhandle, attrstat, 1),
+ PROC(SETATTR, sattrargs, attrstat, 0),
+ PROC(LOOKUP, diropargs, diropres, 2),
+ PROC(READLINK, readlinkargs, readlinkres, 3),
+ PROC(READ, readargs, readres, 3),
+ PROC(WRITE, writeargs, writeres, 4),
+ PROC(CREATE, createargs, diropres, 0),
+ PROC(REMOVE, removeargs, stat, 0),
+ PROC(RENAME, renameargs, stat, 0),
+ PROC(LINK, linkargs, stat, 0),
+ PROC(SYMLINK, symlinkargs, stat, 0),
+ PROC(MKDIR, createargs, diropres, 0),
+ PROC(RMDIR, diropargs, stat, 0),
+ PROC(READDIR, readdirargs, readdirres, 3),
+ PROC(STATFS, fhandle, statfsres, 0),
+};
+
+/* RPC version descriptor for NFS version 2, backed by nfs_procedures[]. */
+struct rpc_version nfs_version2 = {
+ .number = 2,
+ .nrprocs = ARRAY_SIZE(nfs_procedures),
+ .procs = nfs_procedures
+};
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
new file mode 100644
index 00000000000..7ef23979896
--- /dev/null
+++ b/fs/nfs/nfs3acl.c
@@ -0,0 +1,440 @@
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/nfsacl.h>
+
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * List the POSIX ACL xattr names that exist for @dentry.
+ *
+ * "system.posix_acl_access" is reported when nfs3_proc_getacl() returns
+ * a non-NULL access ACL; "system.posix_acl_default" likewise, but only
+ * for directories.  Each name is stored with its terminating NUL
+ * (sizeof of a string literal counts it).
+ *
+ * Returns the total number of bytes the names need when @buffer is NULL
+ * or everything fitted in @size, -ERANGE when it did not fit, or the
+ * error from nfs3_proc_getacl().
+ */
+ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct posix_acl *acl;
+ int pos=0, len=0;
+
+ /* pos: bytes actually copied; len: bytes needed, copied or not */
+# define output(s) do { \
+ if (pos + sizeof(s) <= size) { \
+ memcpy(buffer + pos, s, sizeof(s)); \
+ pos += sizeof(s); \
+ } \
+ len += sizeof(s); \
+ } while(0)
+
+ acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ output("system.posix_acl_access");
+ posix_acl_release(acl);
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ output("system.posix_acl_default");
+ posix_acl_release(acl);
+ }
+ }
+
+# undef output
+
+ if (!buffer || len <= size)
+ return len;
+ return -ERANGE;
+}
+
+/*
+ * Read one POSIX ACL xattr.  @name must be the access or default ACL
+ * xattr name, anything else gets -EOPNOTSUPP.  A missing ACL, or an
+ * access ACL with no entries, is reported as -ENODATA.  Otherwise the
+ * result of posix_acl_to_xattr() is returned.
+ */
+ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
+		void *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct posix_acl *acl;
+	int type, error;
+
+	if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+		type = ACL_TYPE_ACCESS;
+	else if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+		type = ACL_TYPE_DEFAULT;
+	else
+		return -EOPNOTSUPP;
+
+	acl = nfs3_proc_getacl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+
+	if (type == ACL_TYPE_ACCESS && acl->a_count == 0)
+		error = -ENODATA;
+	else
+		error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+/*
+ * Set one POSIX ACL xattr: parse @value into an ACL and push it with
+ * nfs3_proc_setacl().  @flags is not consulted.  Names other than the
+ * two POSIX ACL xattrs get -EOPNOTSUPP.
+ */
+int nfs3_setxattr(struct dentry *dentry, const char *name,
+	     const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct posix_acl *parsed;
+	int type, error;
+
+	if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+		type = ACL_TYPE_ACCESS;
+	else if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+		type = ACL_TYPE_DEFAULT;
+	else
+		return -EOPNOTSUPP;
+
+	parsed = posix_acl_from_xattr(value, size);
+	if (IS_ERR(parsed))
+		return PTR_ERR(parsed);
+
+	error = nfs3_proc_setacl(inode, type, parsed);
+	posix_acl_release(parsed);
+	return error;
+}
+
+/*
+ * Remove one POSIX ACL xattr by calling nfs3_proc_setacl() with a NULL
+ * ACL.  Names other than the two POSIX ACL xattrs get -EOPNOTSUPP.
+ */
+int nfs3_removexattr(struct dentry *dentry, const char *name)
+{
+	int type;
+
+	if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+		type = ACL_TYPE_ACCESS;
+	else if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+		type = ACL_TYPE_DEFAULT;
+	else
+		return -EOPNOTSUPP;
+
+	return nfs3_proc_setacl(dentry->d_inode, type, NULL);
+}
+
+/*
+ * Drop both cached ACL references and mark each slot "not cached" with
+ * ERR_PTR(-EAGAIN).  Slots that already hold an error pointer are left
+ * untouched.  Every caller in this file holds inode->i_lock.
+ */
+static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi)
+{
+ if (!IS_ERR(nfsi->acl_access)) {
+ posix_acl_release(nfsi->acl_access);
+ nfsi->acl_access = ERR_PTR(-EAGAIN);
+ }
+ if (!IS_ERR(nfsi->acl_default)) {
+ posix_acl_release(nfsi->acl_default);
+ nfsi->acl_default = ERR_PTR(-EAGAIN);
+ }
+}
+
+/* Invalidate the inode's cached ACLs under inode->i_lock. */
+void nfs3_forget_cached_acls(struct inode *inode)
+{
+ dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id,
+ inode->i_ino);
+ spin_lock(&inode->i_lock);
+ __nfs3_forget_cached_acls(NFS_I(inode));
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Look up the cached ACL of the given type under inode->i_lock.
+ *
+ * Returns a new reference from posix_acl_dup() when the slot holds a
+ * non-error value, ERR_PTR(-EAGAIN) when the slot holds an error
+ * pointer, or ERR_PTR(-EINVAL) for an unknown @type.
+ */
+static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct posix_acl *acl = ERR_PTR(-EINVAL);
+
+	spin_lock(&inode->i_lock);
+	if (type == ACL_TYPE_ACCESS)
+		acl = nfsi->acl_access;
+	else if (type == ACL_TYPE_DEFAULT)
+		acl = nfsi->acl_default;
+	else
+		goto out;	/* keep the -EINVAL set above */
+
+	acl = IS_ERR(acl) ? ERR_PTR(-EAGAIN) : posix_acl_dup(acl);
+out:
+	spin_unlock(&inode->i_lock);
+	dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id,
+		inode->i_ino, type, acl);
+	return acl;
+}
+
+/*
+ * Replace the inode's cached ACLs.  Under inode->i_lock the old cache
+ * is dropped first; each argument that is not an error pointer is then
+ * stored via posix_acl_dup().  An error-pointer argument leaves its
+ * slot in the "not cached" state.
+ */
+static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
+		    struct posix_acl *dfacl)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id,
+		inode->i_ino, acl, dfacl);
+
+	spin_lock(&inode->i_lock);
+	__nfs3_forget_cached_acls(nfsi);
+	if (!IS_ERR(acl))
+		nfsi->acl_access = posix_acl_dup(acl);
+	if (!IS_ERR(dfacl))
+		nfsi->acl_default = posix_acl_dup(dfacl);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Fetch the ACL of the given type for @inode, from the local cache when
+ * possible and otherwise with a GETACL call.
+ *
+ * Returns the ACL with a reference the caller must release, NULL when
+ * there is none, or an ERR_PTR: -EOPNOTSUPP when the server lacks ACL
+ * support, -ENOMEM, -EIO when the reply does not cover the requested
+ * mask, or another error from revalidation, the RPC or the refresh.
+ */
+struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct page *pages[NFSACL_MAXPAGES] = { };
+ struct nfs3_getaclargs args = {
+ .fh = NFS_FH(inode),
+ /* The xdr layer may allocate pages here. */
+ .pages = pages,
+ };
+ struct nfs3_getaclres res = {
+ 0
+ };
+ struct rpc_message msg = {
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct posix_acl *acl;
+ int status, count;
+
+ if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ status = nfs_revalidate_inode(server, inode);
+ if (status < 0)
+ return ERR_PTR(status);
+ /* Anything other than -EAGAIN is a usable answer from the cache. */
+ acl = nfs3_get_cached_acl(inode, type);
+ if (acl != ERR_PTR(-EAGAIN))
+ return acl;
+ acl = NULL;
+
+ /*
+ * Only get the access acl when explicitly requested: We don't
+ * need it for access decisions, and only some applications use
+ * it. Applications which request the access acl first are not
+ * penalized from this optimization.
+ */
+ if (type == ACL_TYPE_ACCESS)
+ args.mask |= NFS_ACLCNT|NFS_ACL;
+ if (S_ISDIR(inode->i_mode))
+ args.mask |= NFS_DFACLCNT|NFS_DFACL;
+ if (args.mask == 0)
+ return NULL;
+
+ dprintk("NFS call getacl\n");
+ msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ status = rpc_call_sync(server->client_acl, &msg, 0);
+ dprintk("NFS reply getacl: %d\n", status);
+
+ /* pages may have been allocated at the xdr layer. */
+ for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
+ __free_page(args.pages[count]);
+
+ switch (status) {
+ case 0:
+ status = nfs_refresh_inode(inode, res.fattr);
+ break;
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ dprintk("NFS_V3_ACL extension not supported; disabling\n");
+ server->caps &= ~NFS_CAP_ACLS; /* fall through */
+ case -ENOTSUPP:
+ status = -EOPNOTSUPP; /* fall through */
+ default:
+ goto getout;
+ }
+ /* The reply must cover every bit that was asked for. */
+ if ((args.mask & res.mask) != args.mask) {
+ status = -EIO;
+ goto getout;
+ }
+
+ /* Drop the access ACL when posix_acl_equiv_mode() returns 0 for it. */
+ if (res.acl_access != NULL) {
+ if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) {
+ posix_acl_release(res.acl_access);
+ res.acl_access = NULL;
+ }
+ }
+ nfs3_cache_acls(inode,
+ (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL),
+ (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
+
+ /*
+ * Move the requested ACL out of res so the releases at getout
+ * leave the reference being returned alone.
+ */
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ acl = res.acl_access;
+ res.acl_access = NULL;
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ acl = res.acl_default;
+ res.acl_default = NULL;
+ }
+
+getout:
+ posix_acl_release(res.acl_access);
+ posix_acl_release(res.acl_default);
+ nfs_free_fattr(res.fattr);
+
+ /* A failed refresh after a successful call also lands here. */
+ if (status != 0) {
+ posix_acl_release(acl);
+ acl = ERR_PTR(status);
+ }
+ return acl;
+}
+
+/*
+ * Send both ACLs for @inode in one SETACL call.  The default ACL is
+ * only included for directories.  On success the new ACLs are cached.
+ *
+ * Returns 0 or a negative errno: -EOPNOTSUPP when the server lacks ACL
+ * support, -ENOSPC when an ACL has more than NFS_ACL_MAX_ENTRIES
+ * entries, -ENOMEM on allocation failure, or the RPC/refresh result.
+ */
+static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_fattr *fattr;
+ struct page *pages[NFSACL_MAXPAGES];
+ struct nfs3_setaclargs args = {
+ .inode = inode,
+ .mask = NFS_ACL,
+ .acl_access = acl,
+ .pages = pages,
+ };
+ /* rpc_resp is re-pointed at the allocated fattr before the call */
+ struct rpc_message msg = {
+ .rpc_argp = &args,
+ .rpc_resp = &fattr,
+ };
+ int status;
+
+ status = -EOPNOTSUPP;
+ if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+ goto out;
+
+ /* We are doing this here because XDR marshalling does not
+ * return any results, it BUGs. */
+ status = -ENOSPC;
+ if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
+ goto out;
+ if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES)
+ goto out;
+ if (S_ISDIR(inode->i_mode)) {
+ args.mask |= NFS_DFACL;
+ args.acl_default = dfacl;
+ args.len = nfsacl_size(acl, dfacl);
+ } else
+ args.len = nfsacl_size(acl, NULL);
+
+ /* Allocate pages only when the ACLs exceed NFS_ACL_INLINE_BUFSIZE. */
+ if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+ unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
+
+ status = -ENOMEM;
+ do {
+ args.pages[args.npages] = alloc_page(GFP_KERNEL);
+ if (args.pages[args.npages] == NULL)
+ goto out_freepages;
+ args.npages++;
+ } while (args.npages < npages);
+ }
+
+ dprintk("NFS call setacl\n");
+ status = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out_freepages;
+
+ msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+ msg.rpc_resp = fattr;
+ status = rpc_call_sync(server->client_acl, &msg, 0);
+ /* Both caches are zapped whatever the outcome of the call. */
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
+ dprintk("NFS reply setacl: %d\n", status);
+
+ switch (status) {
+ case 0:
+ status = nfs_refresh_inode(inode, fattr);
+ nfs3_cache_acls(inode, acl, dfacl);
+ break;
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ dprintk("NFS_V3_ACL SETACL RPC not supported"
+ "(will not retry)\n");
+ server->caps &= ~NFS_CAP_ACLS; /* fall through */
+ case -ENOTSUPP:
+ status = -EOPNOTSUPP;
+ }
+ nfs_free_fattr(fattr);
+out_freepages:
+ /* args.npages counts the pages allocated above (0 if none) */
+ while (args.npages != 0) {
+ args.npages--;
+ __free_page(args.pages[args.npages]);
+ }
+out:
+ return status;
+}
+
+/*
+ * Set one ACL (access or default) on @inode.
+ *
+ * nfs3_proc_setacls() takes both ACLs, so for a directory the ACL that
+ * is not being changed is fetched from the server first.  If there is
+ * still no access ACL after that, one is built from the inode mode.
+ *
+ * The caller keeps ownership of @acl.  Every reference obtained here
+ * (a fetched ACL, a mode-derived ACL) is released before returning,
+ * on failure paths too.  The earlier code tracked those references
+ * with a single pointer, so the fetched default ACL leaked whenever a
+ * directory's access ACL was set with @acl == NULL.
+ *
+ * Returns 0 or a negative errno; -EINVAL for an unsupported @type.
+ */
+int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
+	int status;
+
+	if (S_ISDIR(inode->i_mode)) {
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			alloc = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT);
+			if (IS_ERR(alloc))
+				goto fail;
+			dfacl = alloc;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			alloc = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS);
+			if (IS_ERR(alloc))
+				goto fail;
+			dfacl = acl;
+			acl = alloc;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	} else if (type != ACL_TYPE_ACCESS)
+		return -EINVAL;
+
+	if (acl == NULL) {
+		alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		if (IS_ERR(alloc))
+			goto fail;
+		acl = alloc;
+	}
+	status = nfs3_proc_setacls(inode, acl, dfacl);
+out:
+	/* Release only what was obtained here, never the caller's ACL. */
+	if (acl != orig)
+		posix_acl_release(acl);
+	if (dfacl != orig)
+		posix_acl_release(dfacl);
+	return status;
+
+fail:
+	status = PTR_ERR(alloc);
+	goto out;
+}
+
+/*
+ * Give a new @inode its ACLs from the default ACL of @dir.
+ *
+ * Returns 0 when @dir has no default ACL or its lookup fails with
+ * -EOPNOTSUPP.  Otherwise an access ACL is derived from the default
+ * ACL with posix_acl_create() and both are sent by
+ * nfs3_proc_setacls(); the default ACL is passed on only when @inode
+ * is itself a directory.
+ */
+int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
+		umode_t mode)
+{
+	struct posix_acl *dfacl, *acl;
+	int error;
+
+	dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT);
+	if (IS_ERR(dfacl)) {
+		error = PTR_ERR(dfacl);
+		if (error == -EOPNOTSUPP)
+			return 0;
+		return error;
+	}
+	if (dfacl == NULL)
+		return 0;
+
+	acl = posix_acl_dup(dfacl);
+	error = posix_acl_create(&acl, GFP_KERNEL, &mode);
+	if (error >= 0) {
+		error = nfs3_proc_setacls(inode, acl,
+				S_ISDIR(inode->i_mode) ? dfacl : NULL);
+		posix_acl_release(acl);
+	}
+	posix_acl_release(dfacl);
+	return error;
+}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
new file mode 100644
index 00000000000..91943953a37
--- /dev/null
+++ b/fs/nfs/nfs3proc.c
@@ -0,0 +1,891 @@
+/*
+ * linux/fs/nfs/nfs3proc.c
+ *
+ * Client-side NFSv3 procedure stubs.
+ *
+ * Copyright (C) 1997, Olaf Kirch
+ */
+
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/lockd/bind.h>
+#include <linux/nfs_mount.h>
+#include <linux/freezer.h>
+
+#include "iostat.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PROC
+
+/*
+ * Synchronous RPC call that retries on -EJUKEBOX and -EKEYEXPIRED.
+ *
+ * After each such reply the task sleeps for NFS_JUKEBOX_RETRY_TIME (a
+ * freezable, killable sleep) and then retries, unless a fatal signal is
+ * pending, in which case -ERESTARTSYS is returned.  Any other result of
+ * rpc_call_sync() is returned unchanged.
+ */
+static int
+nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+	int status;
+
+	for (;;) {
+		status = rpc_call_sync(clnt, msg, flags);
+		if (status != -EJUKEBOX && status != -EKEYEXPIRED)
+			return status;
+		freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+		if (fatal_signal_pending(current))
+			return -ERESTARTSYS;
+	}
+}
+
+/* From here on, rpc_call_sync() in this file expands to the retrying wrapper. */
+#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags)
+
+/*
+ * Asynchronous counterpart of the jukebox retry logic.
+ *
+ * Returns 0 if @task did not finish with -EJUKEBOX or -EKEYEXPIRED.
+ * Otherwise clears the task status, restarts the call after a delay of
+ * NFS_JUKEBOX_RETRY_TIME and returns 1.  Only -EJUKEBOX bumps the
+ * NFSIOS_DELAY counter of @inode.
+ */
+static int
+nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
+{
+	const int err = task->tk_status;
+
+	if (err == -EJUKEBOX)
+		nfs_inc_stats(inode, NFSIOS_DELAY);
+	else if (err != -EKEYEXPIRED)
+		return 0;
+
+	task->tk_status = 0;
+	rpc_restart_call(task);
+	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+	return 1;
+}
+
+/*
+ * do_proc_get_root - issue FSINFO (and, if needed, GETATTR) over @client.
+ * @client:  RPC client to send the calls on
+ * @fhandle: file handle passed as the argument of both calls
+ * @info:    FSINFO result; info->fattr receives the attributes
+ *
+ * If the FSINFO reply did not carry attributes (NFS_ATTR_FATTR not set in
+ * info->fattr->valid), a GETATTR is sent to obtain them.
+ *
+ * Returns 0 on success or the status of the first call that failed.
+ */
+static int
+do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
+		 struct nfs_fsinfo *info)
+{
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
+	int status;
+
+	dprintk("%s: call fsinfo\n", __func__);
+	nfs_fattr_init(info->fattr);
+	status = rpc_call_sync(client, &msg, 0);
+	dprintk("%s: reply fsinfo: %d\n", __func__, status);
+	/*
+	 * Report an FSINFO failure as-is.  Falling through to GETATTR would
+	 * let a successful GETATTR overwrite the error, and the caller would
+	 * see success although *info was never filled in by the server.
+	 */
+	if (status != 0)
+		return status;
+	if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+		msg.rpc_resp = info->fattr;
+		status = rpc_call_sync(client, &msg, 0);
+		dprintk("%s: reply getattr: %d\n", __func__, status);
+	}
+	return status;
+}
+
+/*
+ * Fetch fsinfo/attributes for the root file handle (used by
+ * nfs_get_root/nfs_get_sb).
+ *
+ * The request is first sent on server->client.  If that fails and
+ * server->nfs_client->cl_rpcclient is a different RPC client, it is
+ * retried once on that client and the retry's status is returned.
+ */
+static int
+nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+		   struct nfs_fsinfo *info)
+{
+	int ret = do_proc_get_root(server->client, fhandle, info);
+
+	if (ret == 0)
+		return 0;
+	/* No distinct fallback client: nothing more to try. */
+	if (server->nfs_client->cl_rpcclient == server->client)
+		return ret;
+	return do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info);
+}
+
+/*
+ * One function for each procedure in the NFS protocol.
+ */
+/*
+ * GETATTR: fetch the attributes for @fhandle into @fattr.
+ *
+ * @fattr is re-initialised before the call.  The call goes out on
+ * server->client and its status is returned unchanged.
+ */
+static int
+nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr)
+{
+	int ret;
+	struct rpc_message msg = {
+		.rpc_resp	= fattr,
+		.rpc_argp	= fhandle,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR],
+	};
+
+	dprintk("NFS call getattr\n");
+	nfs_fattr_init(fattr);
+	ret = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply getattr: %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * SETATTR: apply the attribute changes in @sattr to the inode of @dentry.
+ *
+ * When @sattr carries ATTR_FILE, the credential of sattr->ia_file is used
+ * for the call.  @fattr is re-initialised and used as the reply buffer.
+ * On success the local inode is updated through
+ * nfs_setattr_update_inode().  Returns the RPC status.
+ */
+static int
+nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+		struct iattr *sattr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct nfs3_sattrargs args = {
+		.sattr	= sattr,
+		.fh	= NFS_FH(inode),
+	};
+	struct rpc_message msg = {
+		.rpc_resp	= fattr,
+		.rpc_argp	= &args,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_SETATTR],
+	};
+	int ret;
+
+	dprintk("NFS call setattr\n");
+	if (sattr->ia_valid & ATTR_FILE)
+		msg.rpc_cred = nfs_file_cred(sattr->ia_file);
+
+	nfs_fattr_init(fattr);
+	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+	if (!ret)
+		nfs_setattr_update_inode(inode, sattr);
+
+	dprintk("NFS reply setattr: %d\n", ret);
+	return ret;
+}
+
+/*
+ * LOOKUP: resolve @name inside directory @dir.
+ *
+ * On return @fhandle and @fattr hold the reply for the looked-up object.
+ * The directory attributes from the reply are used to refresh @dir.  If
+ * the call did not fail but the reply carried no attributes
+ * (NFS_ATTR_FATTR not set in fattr->valid), a GETATTR on @fhandle fetches
+ * them.
+ *
+ * Note: @clnt is not used by this body; both calls go out on
+ * NFS_CLIENT(dir).
+ *
+ * Returns -ENOMEM if the temporary directory attribute buffer cannot be
+ * allocated, otherwise the status of the last RPC issued.
+ */
+static int
+nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
+		 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+{
+	struct nfs3_diropargs args = {
+		.len	= name->len,
+		.name	= name->name,
+		.fh	= NFS_FH(dir),
+	};
+	struct nfs3_diropres result = {
+		.fattr	= fattr,
+		.fh	= fhandle,
+	};
+	struct rpc_message msg = {
+		.rpc_resp	= &result,
+		.rpc_argp	= &args,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LOOKUP],
+	};
+	int ret;
+
+	dprintk("NFS call lookup %s\n", name->name);
+	result.dir_attr = nfs_alloc_fattr();
+	if (!result.dir_attr)
+		return -ENOMEM;
+
+	nfs_fattr_init(fattr);
+	ret = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_refresh_inode(dir, result.dir_attr);
+
+	/* Done if the call failed or the reply already carried attributes. */
+	if (ret < 0 || (fattr->valid & NFS_ATTR_FATTR))
+		goto done;
+
+	msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+	msg.rpc_argp = fhandle;
+	msg.rpc_resp = fattr;
+	ret = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+done:
+	nfs_free_fattr(result.dir_attr);
+	dprintk("NFS reply lookup: %d\n", ret);
+	return ret;
+}
+
+/*
+ * ACCESS: ask the server which of the requested permissions are granted.
+ *
+ * The MAY_* bits in entry->mask are translated into NFS3_ACCESS_* request
+ * bits (the translation differs for directories), and the call is made
+ * with entry->cred.  The attributes from the reply are used to refresh
+ * @inode.  On success entry->mask is rewritten with the MAY_* bits that
+ * correspond to the access bits in the reply.
+ *
+ * Returns -ENOMEM if the attribute buffer cannot be allocated, otherwise
+ * the RPC status.
+ */
+static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+{
+	struct nfs3_accessargs args = {
+		.fh	= NFS_FH(inode),
+	};
+	struct nfs3_accessres result;
+	struct rpc_message msg = {
+		.rpc_cred	= entry->cred,
+		.rpc_resp	= &result,
+		.rpc_argp	= &args,
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_ACCESS],
+	};
+	const int wanted = entry->mask;
+	const int is_dir = S_ISDIR(inode->i_mode);
+	int ret = -ENOMEM;
+
+	dprintk("NFS call access\n");
+
+	if (wanted & MAY_READ)
+		args.access |= NFS3_ACCESS_READ;
+	if (wanted & MAY_WRITE) {
+		args.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND;
+		/* Write permission on a directory also asks for DELETE. */
+		if (is_dir)
+			args.access |= NFS3_ACCESS_DELETE;
+	}
+	if (wanted & MAY_EXEC) {
+		/* "Execute" is LOOKUP for a directory, EXECUTE otherwise. */
+		if (is_dir)
+			args.access |= NFS3_ACCESS_LOOKUP;
+		else
+			args.access |= NFS3_ACCESS_EXECUTE;
+	}
+
+	result.fattr = nfs_alloc_fattr();
+	if (result.fattr != NULL) {
+		ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+		nfs_refresh_inode(inode, result.fattr);
+		if (!ret) {
+			entry->mask = 0;
+			if (result.access & NFS3_ACCESS_READ)
+				entry->mask |= MAY_READ;
+			if (result.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE))
+				entry->mask |= MAY_WRITE;
+			if (result.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
+				entry->mask |= MAY_EXEC;
+		}
+		nfs_free_fattr(result.fattr);
+	}
+
+	dprintk("NFS reply access: %d\n", ret);
+	return ret;
+}