diff options
author | Anton Arapov <anton@redhat.com> | 2012-04-16 10:05:28 +0200 |
---|---|---|
committer | Anton Arapov <anton@redhat.com> | 2012-04-16 10:05:28 +0200 |
commit | b4b6116a13633898cf868f2f103c96a90c4c20f8 (patch) | |
tree | 93d1b7e2cfcdf473d8d4ff3ad141fa864f8491f6 /fs/nfs | |
parent | edd4be777c953e5faafc80d091d3084b4343f5d3 (diff) | |
download | kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.gz kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.xz kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.zip |
fedora kernel: d9aad82f3319f3cfd1aebc01234254ef0c37ad84v3.3.2-1
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'fs/nfs')
62 files changed, 53024 insertions, 0 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig new file mode 100644 index 00000000000..021d2cf6938 --- /dev/null +++ b/fs/nfs/Kconfig @@ -0,0 +1,134 @@ +config NFS_FS + tristate "NFS client support" + depends on INET && FILE_LOCKING + select LOCKD + select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL + help + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. + + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. + + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. + + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. + + If unsure, say N. + +config NFS_V3 + bool "NFS client support for NFS version 3" + depends on NFS_FS + help + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3_ACL + bool "NFS client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. + + If unsure, say N. + +config NFS_V4 + bool "NFS client support for NFS version 4" + depends on NFS_FS + select SUNRPC_GSS + help + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. + + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say Y. + +config NFS_V4_1 + bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select SUNRPC_BACKCHANNEL + select PNFS_FILE_LAYOUT + help + This option enables support for minor version 1 of the NFSv4 protocol + (RFC 5661) in the kernel's NFS client. + + If unsure, say N. + +config PNFS_FILE_LAYOUT + tristate + +config PNFS_BLOCK + tristate + depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM + default m + +config PNFS_OBJLAYOUT + tristate + depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD + default m + +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + <file:Documentation/filesystems/nfs/nfsroot.txt>. + + Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support" + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager + +config NFS_USE_LEGACY_DNS + bool "Use the legacy NFS DNS resolver" + depends on NFS_V4 + help + The kernel now provides a method for translating a host name into an + IP address. Select Y here if you would rather use your own DNS + resolver script. + + If unsure, say N + +config NFS_USE_KERNEL_DNS + bool + depends on NFS_V4 && !NFS_USE_LEGACY_DNS + select DNS_RESOLVER + select KEYS + default y diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile new file mode 100644 index 00000000000..b58613d0abb --- /dev/null +++ b/fs/nfs/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the Linux nfs filesystem routines. +# + +obj-$(CONFIG_NFS_FS) += nfs.o + +nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ + direct.o pagelist.o proc.o read.o symlink.o unlink.o \ + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o +nfs-$(CONFIG_ROOT_NFS) += nfsroot.o +nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o +nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o +nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o + +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 00000000000..d5815505c02 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 00000000000..48cfac31f64 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,1084 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/bio.h> /* struct bio */ +#include <linux/buffer_head.h> /* various write calls */ +#include <linux/prefetch.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); + +struct dentry *bl_device_pipe; +wait_queue_head_t bl_wq; + +static void print_page(struct page *page) +{ + dprintk("PRINTPAGE page %p\n", page); + dprintk(" PagePrivate %d\n", PagePrivate(page)); + dprintk(" PageUptodate %d\n", PageUptodate(page)); + dprintk(" PageError %d\n", PageError(page)); + dprintk(" PageDirty %d\n", PageDirty(page)); + dprintk(" PageReferenced %d\n", PageReferenced(page)); + dprintk(" PageLocked %d\n", PageLocked(page)); + dprintk(" PageWriteback %d\n", PageWriteback(page)); + dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); + dprintk("\n"); +} + +/* Given the be associated with isect, determine if page data needs to be + * initialized. + */ +static int is_hole(struct pnfs_block_extent *be, sector_t isect) +{ + if (be->be_state == PNFS_BLOCK_NONE_DATA) + return 1; + else if (be->be_state != PNFS_BLOCK_INVALID_DATA) + return 0; + else + return !bl_is_sector_init(be->be_inval, isect); +} + +/* Given the be associated with isect, determine if page data can be + * written to disk. + */ +static int is_writable(struct pnfs_block_extent *be, sector_t isect) +{ + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA); +} + +/* The data we are handed might be spread across several bios. We need + * to track when the last one is finished. + */ +struct parallel_io { + struct kref refcnt; + void (*pnfs_callback) (void *data, int num_se); + void *data; + int bse_count; +}; + +static inline struct parallel_io *alloc_parallel(void *data) +{ + struct parallel_io *rv; + + rv = kmalloc(sizeof(*rv), GFP_NOFS); + if (rv) { + rv->data = data; + kref_init(&rv->refcnt); + rv->bse_count = 0; + } + return rv; +} + +static inline void get_parallel(struct parallel_io *p) +{ + kref_get(&p->refcnt); +} + +static void destroy_parallel(struct kref *kref) +{ + struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); + + dprintk("%s enter\n", __func__); + p->pnfs_callback(p->data, p->bse_count); + kfree(p); +} + +static inline void put_parallel(struct parallel_io *p) +{ + kref_put(&p->refcnt, destroy_parallel); +} + +static struct bio * +bl_submit_bio(int rw, struct bio *bio) +{ + if (bio) { + get_parallel(bio->bi_private); + dprintk("%s submitting %s bio %u@%llu\n", __func__, + rw == READ ? "read" : "write", + bio->bi_size, (unsigned long long)bio->bi_sector); + submit_bio(rw, bio); + } + return NULL; +} + +static struct bio *bl_alloc_init_bio(int npg, sector_t isect, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + struct bio *bio; + + npg = min(npg, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, npg); + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } + + if (bio) { + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + } + return bio; +} + +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ +retry: + if (!bio) { + bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + if (!bio) + return ERR_PTR(-ENOMEM); + } + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio = bl_submit_bio(rw, bio); + goto retry; + } + return bio; +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_read(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_read_data *rdata = (struct nfs_read_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (uptodate) + SetPageUptodate(page); + } while (bvec >= bio->bi_io_vec); + if (!uptodate) { + if (!rdata->pnfs_error) + rdata->pnfs_error = -EIO; + pnfs_set_lo_fail(rdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +static void bl_read_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + pnfs_ld_read_done(rdata); +} + +static void +bl_end_par_io_read(void *data, int unused) +{ + struct nfs_read_data *rdata = data; + + rdata->task.tk_status = rdata->pnfs_error; + INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); + schedule_work(&rdata->task.u.tk_work); +} + +static enum pnfs_try_status +bl_read_pagelist(struct nfs_read_data *rdata) +{ + int i, hole; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, extent_length = 0; + struct parallel_io *par; + loff_t f_offset = rdata->args.offset; + size_t count = rdata->args.count; + struct page **pages = rdata->args.pages; + int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + + dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, + rdata->npages, f_offset, count); + + par = alloc_parallel(rdata); + if (!par) + goto use_mds; + par->pnfs_callback = bl_end_par_io_read; + /* At this point, we can no longer jump to use_mds */ + + isect = (sector_t) (f_offset >> SECTOR_SHIFT); + /* Code assumes extents are page-aligned */ + for (i = pg_index; i < rdata->npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bl_put_extent(cow_read); + bio = bl_submit_bio(READ, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg), + isect, &cow_read); + if (!be) { + rdata->pnfs_error = -EIO; + goto out; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + if (cow_read) { + sector_t cow_length = cow_read->be_length - + (isect - cow_read->be_f_offset); + extent_length = min(extent_length, cow_length); + } + } + hole = is_hole(be, isect); + if (hole && !cow_read) { + bio = bl_submit_bio(READ, bio); + /* Fill hole w/ zeroes w/o accessing device */ + dprintk("%s Zeroing page for hole\n", __func__); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + print_page(pages[i]); + SetPageUptodate(pages[i]); + } else { + struct pnfs_block_extent *be_read; + + be_read = (hole && cow_read) ? cow_read : be; + bio = bl_add_page_to_bio(bio, rdata->npages - i, READ, + isect, pages[i], be_read, + bl_end_io_read, par); + if (IS_ERR(bio)) { + rdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + } + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) { + rdata->res.eof = 1; + rdata->res.count = rdata->inode->i_size - f_offset; + } else { + rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; + } +out: + bl_put_extent(be); + bl_put_extent(cow_read); + bl_submit_bio(READ, bio); + put_parallel(par); + return PNFS_ATTEMPTED; + + use_mds: + dprintk("Giving up and using normal NFS\n"); + return PNFS_NOT_ATTEMPTED; +} + +static void mark_extents_written(struct pnfs_block_layout *bl, + __u64 offset, __u32 count) +{ + sector_t isect, end; + struct pnfs_block_extent *be; + struct pnfs_block_short_extent *se; + + dprintk("%s(%llu, %u)\n", __func__, offset, count); + if (count == 0) + return; + isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; + end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); + end >>= SECTOR_SHIFT; + while (isect < end) { + sector_t len; + be = bl_find_get_extent(bl, isect, NULL); + BUG_ON(!be); /* FIXME */ + len = min(end, be->be_f_offset + be->be_length) - isect; + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + se = bl_pop_one_short_extent(be->be_inval); + BUG_ON(!se); + bl_mark_for_commit(be, isect, len, se); + } + isect += len; + bl_put_extent(be); + } +} + +static void bl_end_io_write_zero(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + /* This is the zeroing page we added */ + end_page_writeback(page); + page_cache_release(page); + } while (bvec >= bio->bi_io_vec); + + if (unlikely(!uptodate)) { + if (!wdata->pnfs_error) + wdata->pnfs_error = -EIO; + pnfs_set_lo_fail(wdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +static void bl_end_io_write(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + if (!uptodate) { + if (!wdata->pnfs_error) + wdata->pnfs_error = -EIO; + pnfs_set_lo_fail(wdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. + */ +static void bl_write_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + if (likely(!wdata->pnfs_error)) { + /* Marks for LAYOUTCOMMIT */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + wdata->args.offset, wdata->args.count); + } + pnfs_ld_write_done(wdata); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data, int num_se) +{ + struct nfs_write_data *wdata = data; + + if (unlikely(wdata->pnfs_error)) { + bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval, + num_se); + } + + wdata->task.tk_status = wdata->pnfs_error; + wdata->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); + schedule_work(&wdata->task.u.tk_work); +} + +/* FIXME STUB - mark intersection of layout and page as bad, so is not + * used again. + */ +static void mark_bad_read(void) +{ + return; +} + +/* + * map_block: map a requested I/0 block (isect) into an offset in the LVM + * block_device + */ +static void +map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) +{ + dprintk("%s enter be=%p\n", __func__, be); + + set_buffer_mapped(bh); + bh->b_bdev = be->be_mdev; + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); + + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", + __func__, (unsigned long long)isect, (long)bh->b_blocknr, + bh->b_size); + return; +} + +/* Given an unmapped page, zero it or read in page for COW, page is locked + * by caller. + */ +static int +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +{ + struct buffer_head *bh = NULL; + int ret = 0; + sector_t isect; + + dprintk("%s enter, %p\n", __func__, page); + BUG_ON(PageUptodate(page)); + if (!cow_read) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto cleanup; + } + + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); + if (!bh) { + ret = -ENOMEM; + goto cleanup; + } + + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; + map_block(bh, isect, cow_read); + if (!bh_uptodate_or_lock(bh)) + ret = bh_submit_read(bh); + if (ret) + goto cleanup; + SetPageUptodate(page); + +cleanup: + bl_put_extent(cow_read); + if (bh) + free_buffer_head(bh); + if (ret) { + /* Need to mark layout with bad read...should now + * just use nfs4 for reads and writes. + */ + mark_bad_read(); + } + return ret; +} + +/* Find or create a zeroing page marked being writeback. + * Return ERR_PTR on error, NULL to indicate skip this page and page itself + * to indicate write out. + */ +static struct page * +bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, + struct pnfs_block_extent *cow_read) +{ + struct page *page; + int locked = 0; + page = find_get_page(inode->i_mapping, index); + if (page) + goto check_page; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s oom\n", __func__); + return ERR_PTR(-ENOMEM); + } + locked = 1; + +check_page: + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + */ + if (PageDirty(page) || PageWriteback(page)) { + print_page(page); + if (locked) + unlock_page(page); + page_cache_release(page); + return NULL; + } + + if (!locked) { + lock_page(page); + locked = 1; + goto check_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + return page; +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_write_data *wdata, int sync) +{ + int i, ret, npg_zero, pg_index, last = 0; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, last_isect = 0, extent_length = 0; + struct parallel_io *par; + loff_t offset = wdata->args.offset; + size_t count = wdata->args.count; + struct page **pages = wdata->args.pages; + struct page *page; + pgoff_t index; + u64 temp; + int npg_per_block = + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + /* At this point, wdata->pages is a (sequential) list of nfs_pages. + * We want to write each, and if there is an error set pnfs_error + * to have it redone using nfs. + */ + par = alloc_parallel(wdata); + if (!par) + goto out_mds; + par->pnfs_callback = bl_end_par_io_write; + /* At this point, have to be more careful with error handling */ + + isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); + if (!be || !is_writable(be, isect)) { + dprintk("%s no matching extents!\n", __func__); + goto out_mds; + } + + /* First page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else + goto out_mds; + temp = offset >> PAGE_CACHE_SHIFT; + npg_zero = do_div(temp, npg_per_block); + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + extent_length = be->be_length - (isect - be->be_f_offset); + +fill_invalid_ext: + dprintk("%s need to zero %d pages\n", __func__, npg_zero); + for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } + /* page ref released in bl_end_io_write_zero */ + index = isect >> PAGE_CACHE_SECTOR_SHIFT; + dprintk("%s zero %dth page: index %lu isect %llu\n", + __func__, npg_zero, index, + (unsigned long long)isect); + page = bl_find_get_zeroing_page(wdata->inode, index, + cow_read); + if (unlikely(IS_ERR(page))) { + wdata->pnfs_error = PTR_ERR(page); + goto out; + } else if (page == NULL) + goto next_page; + + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = ret; + goto out; + } + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else { + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = -ENOMEM; + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); + + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, + isect, page, be, + bl_end_io_write_zero, par); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } +next_page: + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if (last) + goto write_done; + } + bio = bl_submit_bio(WRITE, bio); + + /* Middle pages */ + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + for (i = pg_index; i < wdata->npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), + isect, NULL); + if (!be || !is_writable(be, isect)) { + wdata->pnfs_error = -EINVAL; + goto out; + } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent( + be->be_inval))) + par->bse_count++; + else { + wdata->pnfs_error = -ENOMEM; + goto out; + } + } + extent_length = be->be_length - + (isect - be->be_f_offset); + } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + wdata->pnfs_error = ret; + goto out; + } + } + bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, + isect, pages[i], be, + bl_end_io_write, par); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + isect += PAGE_CACHE_SECTORS; + last_isect = isect; + extent_length -= PAGE_CACHE_SECTORS; + } + + /* Last page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + bio = bl_submit_bio(WRITE, bio); + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; + npg_zero = npg_per_block - do_div(temp, npg_per_block); + if (npg_zero < npg_per_block) { + last = 1; + goto fill_invalid_ext; + } + } + +write_done: + wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); + if (count < wdata->res.count) { + wdata->res.count = count; + } +out: + bl_put_extent(be); + bl_submit_bio(WRITE, bio); + put_parallel(par); + return PNFS_ATTEMPTED; +out_mds: + bl_put_extent(be); + kfree(par); + return PNFS_NOT_ATTEMPTED; +} + +/* FIXME - range ignored */ +static void +release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) +{ + int i; + struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + while (!list_empty(&bl->bl_extents[i])) { + be = list_first_entry(&bl->bl_extents[i], + struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + } + spin_unlock(&bl->bl_ext_lock); +} + +static void +release_inval_marks(struct pnfs_inval_markings *marks) +{ + struct pnfs_inval_tracking *pos, *temp; + struct pnfs_block_short_extent *se, *stemp; + + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { + list_del(&pos->it_link); + kfree(pos); + } + + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + } + return; +} + +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + + dprintk("%s enter\n", __func__); + release_extents(bl, NULL); + release_inval_marks(&bl->bl_inval); + kfree(bl); +} + +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl; + + dprintk("%s enter\n", __func__); + bl = kzalloc(sizeof(*bl), gfp_flags); + if (!bl) + return NULL; + spin_lock_init(&bl->bl_ext_lock); + INIT_LIST_HEAD(&bl->bl_extents[0]); + INIT_LIST_HEAD(&bl->bl_extents[1]); + INIT_LIST_HEAD(&bl->bl_commit); + INIT_LIST_HEAD(&bl->bl_committing); + bl->bl_count = 0; + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; + BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; +} + +static void bl_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s enter\n", __func__); + kfree(lseg); +} + +/* We pretty much ignore lseg, and store all data layout wide, so we + * can correctly merge. + */ +static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *lseg; + int status; + + dprintk("%s enter\n", __func__); + lseg = kzalloc(sizeof(*lseg), gfp_flags); + if (!lseg) + return ERR_PTR(-ENOMEM); + status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); + if (status) { + /* We don't want to call the full-blown bl_free_lseg, + * since on error extents were not touched. + */ + kfree(lseg); + return ERR_PTR(status); + } + return lseg; +} + +static void +bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + dprintk("%s enter\n", __func__); + encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ + struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + + dprintk("%s enter\n", __func__); + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); +} + +static void free_blk_mountid(struct block_mount_id *mid) +{ + if (mid) { + struct pnfs_block_dev *dev, *tmp; + + /* No need to take bm_lock as we are last user freeing bm_devlist */ + list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { + list_del(&dev->bm_node); + bl_free_block_dev(dev); + } + kfree(mid); + } +} + +/* This is mostly copied from the filelayout's get_device_info function. + * It seems much of this should be at the generic pnfs level. + */ +static struct pnfs_block_dev * +nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, + struct nfs4_deviceid *d_id) +{ + struct pnfs_device *dev; + struct pnfs_block_dev *rv; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + int i, rc; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s max_resp_sz %u max_pages %d\n", + __func__, max_resp_sz, max_pages); + + dev = kmalloc(sizeof(*dev), GFP_NOFS); + if (!dev) { + dprintk("%s kmalloc failed\n", __func__); + return ERR_PTR(-ENOMEM); + } + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); + goto out_free; + } + } + + memcpy(&dev->dev_id, d_id, sizeof(*d_id)); + dev->layout_type = LAYOUT_BLOCK_VOLUME; + dev->pages = pages; + dev->pgbase = 0; + dev->pglen = PAGE_SIZE * max_pages; + dev->mincount = 0; + + dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); + rc = nfs4_proc_getdeviceinfo(server, dev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) { + rv = ERR_PTR(rc); + goto out_free; + } + + rv = nfs4_blk_decode_device(server, dev); + out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(dev); + return rv; +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ + struct block_mount_id *b_mt_id = NULL; + struct pnfs_devicelist *dlist = NULL; + struct pnfs_block_dev *bdev; + LIST_HEAD(block_disklist); + int status, i; + + dprintk("%s enter\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); + if (!b_mt_id) { + status = -ENOMEM; + goto out_error; + } + /* Initialize nfs4 block layout mount id */ + spin_lock_init(&b_mt_id->bm_lock); + INIT_LIST_HEAD(&b_mt_id->bm_devlist); + + dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); + if (!dlist) { + status = -ENOMEM; + goto out_error; + } + dlist->eof = 0; + while (!dlist->eof) { + status = nfs4_proc_getdevicelist(server, fh, dlist); + if (status) + goto out_error; + dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", + __func__, dlist->num_devs, dlist->eof); + for (i = 0; i < dlist->num_devs; i++) { + bdev = nfs4_blk_get_deviceinfo(server, fh, + &dlist->dev_id[i]); + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); + goto out_error; + } + spin_lock(&b_mt_id->bm_lock); + list_add(&bdev->bm_node, &b_mt_id->bm_devlist); + spin_unlock(&b_mt_id->bm_lock); + } + } + dprintk("%s SUCCESS\n", __func__); + server->pnfs_ld_data = b_mt_id; + + out_return: + kfree(dlist); + return status; + + out_error: + free_blk_mountid(b_mt_id); + goto out_return; +} + +static int +bl_clear_layoutdriver(struct nfs_server *server) +{ + struct block_mount_id *b_mt_id = server->pnfs_ld_data; + + dprintk("%s enter\n", __func__); + free_blk_mountid(b_mt_id); + dprintk("%s RETURNS\n", __func__); + return 0; +} + +static const struct nfs_pageio_ops bl_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops bl_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type blocklayout_type = { + .id = LAYOUT_BLOCK_VOLUME, + .name = "LAYOUT_BLOCK_VOLUME", + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = bl_alloc_layout_hdr, + .free_layout_hdr = bl_free_layout_hdr, + .alloc_lseg = bl_alloc_lseg, + .free_lseg = bl_free_lseg, + .encode_layoutcommit = bl_encode_layoutcommit, + .cleanup_layoutcommit = bl_cleanup_layoutcommit, + .set_layoutdriver = bl_set_layoutdriver, + .clear_layoutdriver = bl_clear_layoutdriver, + .pg_read_ops = &bl_pg_read_ops, + .pg_write_ops = &bl_pg_write_ops, +}; + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static int __init nfs4blocklayout_init(void) +{ + struct vfsmount *mnt; + struct path path; + int ret; + + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); + + ret = pnfs_register_layoutdriver(&blocklayout_type); + if (ret) + goto out; + + init_waitqueue_head(&bl_wq); + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) { + ret = PTR_ERR(mnt); + goto out_remove; + } + + ret = vfs_path_lookup(mnt->mnt_root, + mnt, + NFS_PIPE_DIRNAME, 0, &path); + if (ret) + goto out_putrpc; + + bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, + &bl_upcall_ops, 0); + path_put(&path); + if (IS_ERR(bl_device_pipe)) { + ret = PTR_ERR(bl_device_pipe); + goto out_putrpc; + } +out: + return ret; + +out_putrpc: + rpc_put_mount(); +out_remove: + pnfs_unregister_layoutdriver(&blocklayout_type); + return ret; +} + +static void __exit nfs4blocklayout_exit(void) +{ + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", + __func__); + + pnfs_unregister_layoutdriver(&blocklayout_type); + rpc_unlink(bl_device_pipe); + rpc_put_mount(); +} + +MODULE_ALIAS("nfs-layouttype4-3"); + +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 00000000000..e31a2df28e7 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.h @@ -0,0 +1,211 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include <linux/device-mapper.h> +#include <linux/nfs_fs.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "../pnfs.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) + +struct block_mount_id { + spinlock_t bm_lock; /* protects list */ + struct list_head bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { + struct list_head bm_node; + struct nfs4_deviceid bm_mdevid; /* associated devid */ + struct block_device *bm_mdev; /* meta device itself */ +}; + +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { + sector_t mtt_step_size; /* Internal sector alignment */ + struct list_head mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { + spinlock_t im_lock; + struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ + sector_t im_block_size; /* Server blocksize in sectors */ + struct list_head im_extents; /* Short extents for INVAL->RW conversion */ +}; + +struct pnfs_inval_tracking { + struct list_head it_link; + int it_sector; + int it_tags; +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { + struct kref be_refcnt; + struct list_head be_node; /* link into lseg list */ + struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ + struct block_device *be_mdev; + sector_t be_f_offset; /* the starting offset in the file */ + sector_t be_length; /* the size of the extent */ + sector_t be_v_offset; /* the starting offset in the volume */ + enum exstate4 be_state; /* the state of this extent */ + struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +}; + +/* Shortened extent used by LAYOUTCOMMIT */ +struct pnfs_block_short_extent { + struct list_head bse_node; + struct nfs4_deviceid bse_devid; + struct block_device *bse_mdev; + sector_t bse_f_offset; /* the starting offset in the file */ + sector_t bse_length; /* the size of the extent */ +}; + +static inline void +BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) +{ + spin_lock_init(&marks->im_lock); + INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + INIT_LIST_HEAD(&marks->im_extents); + marks->im_block_size = blocksize; + marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, + blocksize); +} + +enum extentclass4 { + RW_EXTENT = 0, /* READWRTE and INVAL */ + RO_EXTENT = 1, /* READ and NONE */ + EXTENT_LISTS = 2, +}; + +static inline int bl_choose_list(enum exstate4 state) +{ + if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) + return RO_EXTENT; + else + return RW_EXTENT; +} + +struct pnfs_block_layout { + struct pnfs_layout_hdr bl_layout; + struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + spinlock_t bl_ext_lock; /* Protects list manipulation */ + struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ + struct list_head bl_commit; /* Needs layout commit */ + struct list_head bl_committing; /* Layout committing */ + unsigned int bl_count; /* entries in bl_commit */ + sector_t bl_blocksize; /* Server blocksize in sectors */ +}; + +#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ + return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_dev_msg { + int32_t status; + uint32_t major, minor; +}; + +struct bl_msg_hdr { + u8 type; + u16 totallen; /* length of entire message, including hdr itself */ +}; + +extern struct dentry *bl_device_pipe; +extern wait_queue_head_t bl_wq; + +#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ +#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ + +/* blocklayoutdev.c */ +ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +void bl_pipe_destroy_msg(struct rpc_pipe_msg *); +struct block_device *nfs4_blkdev_get(dev_t dev); +int nfs4_blkdev_put(struct block_device *bdev); +struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev); +int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + +/* blocklayoutdm.c */ +void bl_free_block_dev(struct pnfs_block_dev *bdev); + +/* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length); +void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status); +int bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new); +int bl_push_one_short_extent(struct pnfs_inval_markings *marks); +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 00000000000..d08ba9107fd --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -0,0 +1,391 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c + * + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#include <linux/module.h> +#include <linux/buffer_head.h> /* __bread */ + +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "%s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; +} + +/* Open a block_device by device number. */ +struct block_device *nfs4_blkdev_get(dev_t dev) +{ + struct block_device *bd; + + dprintk("%s enter\n", __func__); + bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(bd)) + goto fail; + return bd; +fail: + dprintk("%s failed to open device : %ld\n", + __func__, PTR_ERR(bd)); + return NULL; +} + +/* + * Release the block device + */ +int nfs4_blkdev_put(struct block_device *bdev) +{ + dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + return blkdev_put(bdev, FMODE_READ); +} + +static struct bl_dev_msg bl_mount_reply; + +ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&bl_wq); + + return mlen; +} + +void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + if (msg->errno >= 0) + return; + wake_up(&bl_wq); +} + +/* + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. + */ +struct pnfs_block_dev * +nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev) +{ + struct pnfs_block_dev *rv; + struct block_device *bd = NULL; + struct rpc_pipe_msg msg; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_MOUNT, + .totallen = dev->mincount, + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + struct bl_dev_msg *reply = &bl_mount_reply; + int offset, len, i, rc; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, + dev->mincount); + + memset(&msg, 0, sizeof(msg)); + msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); + if (!msg.data) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg.data; + len = dev->mincount; + offset = sizeof(bl_msg); + for (i = 0; len > 0; i++) { + memcpy(&dataptr[offset], page_address(dev->pages[i]), + len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + offset += PAGE_CACHE_SIZE; + } + msg.len = sizeof(bl_msg) + dev->mincount; + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&bl_wq, &wq); + rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); + if (rc < 0) { + remove_wait_queue(&bl_wq, &wq); + rv = ERR_PTR(rc); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + dprintk("%s failed to open device: %d\n", + __func__, reply->status); + rv = ERR_PTR(-EINVAL); + goto out; + } + + bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); + if (IS_ERR(bd)) { + rc = PTR_ERR(bd); + dprintk("%s failed to open device : %d\n", __func__, rc); + rv = ERR_PTR(rc); + goto out; + } + + rv = kzalloc(sizeof(*rv), GFP_NOFS); + if (!rv) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + rv->bm_mdev = bd; + memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); + dprintk("%s Created device %s with bd_block_size %u\n", + __func__, + bd->bd_disk->disk_name, + bd->bd_block_size); + +out: + kfree(msg.data); + return rv; +} + +/* Map deviceid returned by the server to constructed block_device */ +static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, + struct nfs4_deviceid *id) +{ + struct block_device *rv = NULL; + struct block_mount_id *mid; + struct pnfs_block_dev *dev; + + dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); + mid = BLK_ID(lo); + spin_lock(&mid->bm_lock); + list_for_each_entry(dev, &mid->bm_devlist, bm_node) { + if (memcmp(id->data, dev->bm_mdevid.data, + NFS4_DEVICEID4_SIZE) == 0) { + rv = dev->bm_mdev; + goto out; + } + } + out: + spin_unlock(&mid->bm_lock); + dprintk("%s returning %p\n", __func__, rv); + return rv; +} + +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. + */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; +} + +/* XDR decode pnfs_block_layout4 structure */ +int +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int i, status = -EIO; + uint32_t count; + struct pnfs_block_extent *be = NULL, *save; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err; + + count = be32_to_cpup(p++); + + dprintk("%s enter, number of extents %i\n", __func__, count); + p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); + if (unlikely(!p)) + goto out_err; + + /* Decode individual extents, putting them in temporary + * staging area until whole layout is decoded to make error + * recovery easier. + */ + for (i = 0; i < count; i++) { + be = bl_alloc_extent(); + if (!be) { + status = -ENOMEM; + goto out_err; + } + memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + be->be_mdev = translate_devid(lo, &be->be_devid); + if (!be->be_mdev) + goto out_err; + + /* The next three values are read in as bytes, + * but stored as 512-byte sector lengths + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_err; + be->be_state = be32_to_cpup(p++); + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + be->be_inval = &bl->bl_inval; + if (verify_extent(be, &lv)) { + dprintk("%s verify failed\n", __func__); + goto out_err; + } + list_add_tail(&be->be_node, &extents); + } + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + be = NULL; + goto out_err; + } + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + be = NULL; + goto out_err; + } + /* Extents decoded properly, now try to merge them in to + * existing layout extents. + */ + spin_lock(&bl->bl_ext_lock); + list_for_each_entry_safe(be, save, &extents, be_node) { + list_del(&be->be_node); + status = bl_add_merge_extent(bl, be); + if (status) { + spin_unlock(&bl->bl_ext_lock); + /* This is a fairly catastrophic error, as the + * entire layout extent lists are now corrupted. + * We should have some way to distinguish this. + */ + be = NULL; + goto out_err; + } + } + spin_unlock(&bl->bl_ext_lock); + status = 0; + out: + __free_page(scratch); + dprintk("%s returns %i\n", __func__, status); + return status; + + out_err: + bl_put_extent(be); + while (!list_empty(&extents)) { + be = list_first_entry(&extents, struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + goto out; +} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 00000000000..d055c755807 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -0,0 +1,111 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Fred Isaman <iisaman@umich.edu> + * Andy Adamson <andros@citi.umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/genhd.h> /* gendisk - used in a dprintk*/ +#include <linux/sched.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void dev_remove(dev_t dev) +{ + struct rpc_pipe_msg msg; + struct bl_dev_msg bl_umount_request; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_UMOUNT, + .totallen = sizeof(bl_umount_request), + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + + dprintk("Entering %s\n", __func__); + + memset(&msg, 0, sizeof(msg)); + msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + if (!msg.data) + goto out; + + memset(&bl_umount_request, 0, sizeof(bl_umount_request)); + bl_umount_request.major = MAJOR(dev); + bl_umount_request.minor = MINOR(dev); + + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg.data; + memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); + msg.len = sizeof(bl_msg) + bl_msg.totallen; + + add_wait_queue(&bl_wq, &wq); + if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + remove_wait_queue(&bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bl_wq, &wq); + +out: + kfree(msg.data); +} + +/* + * Release meta device + */ +static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) +{ + int rv; + + dprintk("%s Releasing\n", __func__); + rv = nfs4_blkdev_put(bdev->bm_mdev); + if (rv) + printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", + __func__, rv); + + dev_remove(bdev->bm_mdev->bd_dev); +} + +void bl_free_block_dev(struct pnfs_block_dev *bdev) +{ + if (bdev) { + if (bdev->bm_mdev) { + dprintk("%s Removing DM device: %d:%d\n", + __func__, + MAJOR(bdev->bm_mdev->bd_dev), + MINOR(bdev->bm_mdev->bd_dev)); + nfs4_blk_metadev_release(bdev); + } + kfree(bdev); + } +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 00000000000..1abac09f7cd --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,909 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include "blocklayout.h" +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* Bit numbers */ +#define EXTENT_INITIALIZED 0 +#define EXTENT_WRITTEN 1 +#define EXTENT_IN_COMMIT 2 +#define INTERNAL_EXISTS MY_MAX_TAGS +#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) + +/* Returns largest t<=s s.t. t%base==0 */ +static inline sector_t normalize(sector_t s, int base) +{ + sector_t tmp = s; /* Since do_div modifies its argument */ + return s - do_div(tmp, base); +} + +static inline sector_t normalize_up(sector_t s, int base) +{ + return normalize(s + base - 1, base); +} + +/* Complete stub using list while determine API wanted */ + +/* Returns tags, or negative */ +static int32_t _find_entry(struct my_tree *tree, u64 s) +{ + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu) enter\n", __func__, s); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) + return pos->it_tags & INTERNAL_MASK; + else + break; + } + return -ENOENT; +} + +static inline +int _has_tag(struct my_tree *tree, u64 s, int32_t tag) +{ + int32_t tags; + + dprintk("%s(%llu, %i) enter\n", __func__, s, tag); + s = normalize(s, tree->mtt_step_size); + tags = _find_entry(tree, s); + if ((tags < 0) || !(tags & (1 << tag))) + return 0; + else + return 1; +} + +/* Creates entry with tag, or if entry already exists, unions tag to it. + * If storage is not NULL, newly created entry will use it. + * Returns number of entries added, or negative on error. + */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, + struct pnfs_inval_tracking *storage) +{ + int found = 0; + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) { + found = 1; + break; + } else + break; + } + if (found) { + pos->it_tags |= (1 << tag); + return 0; + } else { + struct pnfs_inval_tracking *new; + new = storage; + new->it_sector = s; + new->it_tags = (1 << tag); + list_add(&new->it_link, &pos->it_link); + return 1; + } +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ + u64 i; + + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); + for (i = normalize(s, tree->mtt_step_size); i < s + length; + i += tree->mtt_step_size) + if (_add_entry(tree, i, tag, NULL)) + return -ENOMEM; + return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct pnfs_inval_markings *marks, + u64 offset, u64 length) +{ + u64 start, end, s; + int count, i, used = 0, status = -ENOMEM; + struct pnfs_inval_tracking **storage; + struct my_tree *tree = &marks->im_tree; + + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); + start = normalize(offset, tree->mtt_step_size); + end = normalize_up(offset + length, tree->mtt_step_size); + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ + storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), + GFP_NOFS); + if (!storage[i]) + goto out_cleanup; + } + + spin_lock_bh(&marks->im_lock); + for (s = start; s < end; s += tree->mtt_step_size) + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + spin_unlock_bh(&marks->im_lock); + + status = 0; + + out_cleanup: + for (i = used; i < count; i++) { + if (!storage[i]) + break; + kfree(storage[i]); + } + kfree(storage); + return status; +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ + int rv; + + spin_lock_bh(&marks->im_lock); + rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); + spin_unlock_bh(&marks->im_lock); + return rv; +} + +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ + struct pnfs_inval_tracking *pos; + u64 expect = 0; + + dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector >= end) + continue; + if (!expect) { + if ((pos->it_sector == end - tree->mtt_step_size) && + (pos->it_tags & (1 << tag))) { + expect = pos->it_sector - tree->mtt_step_size; + if (pos->it_sector < tree->mtt_step_size || expect < start) + return 1; + continue; + } else { + return 0; + } + } + if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) + return 0; + expect -= tree->mtt_step_size; + if (expect < start) + return 1; + } + return 0; +} + +static int is_range_written(struct pnfs_inval_markings *marks, + sector_t start, sector_t end) +{ + int rv; + + spin_lock_bh(&marks->im_lock); + rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); + spin_unlock_bh(&marks->im_lock); + return rv; +} + +/* Marks sectors in [offest, offset_length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). + * Currently assumes offset is page-aligned + */ +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + sector_t start, end; + + dprintk("%s(offset=%llu,len=%llu) enter\n", + __func__, (u64)offset, (u64)length); + + start = normalize(offset, marks->im_block_size); + end = normalize_up(offset + length, marks->im_block_size); + if (_preload_range(marks, start, end - start)) + goto outerr; + + spin_lock_bh(&marks->im_lock); + if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) + goto out_unlock; + spin_unlock_bh(&marks->im_lock); + + return 0; + +out_unlock: + spin_unlock_bh(&marks->im_lock); +outerr: + return -ENOMEM; +} + +/* Marks sectors in [offest, offset+length) as having been written to disk. + * All lengths should be block aligned. + */ +static int mark_written_sectors(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + int status; + + dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, + (u64)offset, (u64)length); + spin_lock_bh(&marks->im_lock); + status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); + spin_unlock_bh(&marks->im_lock); + return status; +} + +static void print_short_extent(struct pnfs_block_short_extent *be) +{ + dprintk("PRINT SHORT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); + dprintk(" be_length %llu\n", (u64)be->bse_length); + } +} + +static void print_clist(struct list_head *list, unsigned int count) +{ + struct pnfs_block_short_extent *be; + unsigned int i = 0; + + ifdebug(FACILITY) { + printk(KERN_DEBUG "****************\n"); + printk(KERN_DEBUG "Extent list looks like:\n"); + list_for_each_entry(be, list, bse_node) { + i++; + print_short_extent(be); + } + if (i != count) + printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); + printk(KERN_DEBUG "****************\n"); + } +} + +/* Note: In theory, we should do more checking that devid's match between + * old and new, but if they don't, the lists are too corrupt to salvage anyway. + */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, + struct pnfs_block_short_extent *new) +{ + struct list_head *clist = &bl->bl_commit; + struct pnfs_block_short_extent *old, *save; + sector_t end = new->bse_f_offset + new->bse_length; + + dprintk("%s enter\n", __func__); + print_short_extent(new); + print_clist(clist, bl->bl_count); + bl->bl_count++; + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe(old, save, clist, bse_node) { + if (new->bse_f_offset < old->bse_f_offset) + break; + if (end <= old->bse_f_offset + old->bse_length) { + /* Range is already in list */ + bl->bl_count--; + kfree(new); + return; + } else if (new->bse_f_offset <= + old->bse_f_offset + old->bse_length) { + /* new overlaps or abuts existing be */ + if (new->bse_mdev == old->bse_mdev) { + /* extend new to fully replace old */ + new->bse_length += new->bse_f_offset - + old->bse_f_offset; + new->bse_f_offset = old->bse_f_offset; + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + } + /* Note that if we never hit the above break, old will not point to a + * valid extent. However, in that case &old->bse_node==list. + */ + list_add_tail(&new->bse_node, &old->bse_node); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. + */ + old = list_prepare_entry(new, clist, bse_node); + list_for_each_entry_safe_continue(old, save, clist, bse_node) { + if (end < old->bse_f_offset) + break; + /* new overlaps or abuts old */ + if (new->bse_mdev == old->bse_mdev) { + if (end < old->bse_f_offset + old->bse_length) { + /* extend new to fully cover old */ + end = old->bse_f_offset + old->bse_length; + new->bse_length = end - new->bse_f_offset; + } + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + dprintk("%s: after merging\n", __func__); + print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + * new will be freed, either by this function or add_to_commitlist if they + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new) +{ + sector_t new_end, end = offset + length; + struct pnfs_block_layout *bl = container_of(be->be_inval, + struct pnfs_block_layout, + bl_inval); + + mark_written_sectors(be->be_inval, offset, length); + /* We want to add the range to commit list, but it must be + * block-normalized, and verified that the normalized range has + * been entirely written to disk. + */ + new->bse_f_offset = offset; + offset = normalize(offset, bl->bl_blocksize); + if (offset < new->bse_f_offset) { + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) + new->bse_f_offset = offset; + else + new->bse_f_offset = offset + bl->bl_blocksize; + } + new_end = normalize_up(end, bl->bl_blocksize); + if (end < new_end) { + if (is_range_written(be->be_inval, end, new_end)) + end = new_end; + else + end = new_end - bl->bl_blocksize; + } + if (end <= new->bse_f_offset) { + kfree(new); + return 0; + } + new->bse_length = end - new->bse_f_offset; + new->bse_devid = be->be_devid; + new->bse_mdev = be->be_mdev; + + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, new); + spin_unlock(&bl->bl_ext_lock); + return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ + dprintk("PRINT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); + dprintk(" be_length %llu\n", (u64)be->be_length); + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); + dprintk(" be_state %d\n", be->be_state); + } +} + +static void +destroy_extent(struct kref *kref) +{ + struct pnfs_block_extent *be; + + be = container_of(kref, struct pnfs_block_extent, be_refcnt); + dprintk("%s be=%p\n", __func__, be); + kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ + if (be) { + dprintk("%s enter %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_put(&be->be_refcnt, destroy_extent); + } +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ + struct pnfs_block_extent *be; + + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); + if (!be) + return NULL; + INIT_LIST_HEAD(&be->be_node); + kref_init(&be->be_refcnt); + be->be_inval = NULL; + return be; +} + +static void print_elist(struct list_head *list) +{ + struct pnfs_block_extent *be; + dprintk("****************\n"); + dprintk("Extent list looks like:\n"); + list_for_each_entry(be, list, be_node) { + print_bl_extent(be); + } + dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ + /* Note this assumes new->be_f_offset >= old->be_f_offset */ + return (new->be_state == old->be_state) && + ((new->be_state == PNFS_BLOCK_NONE_DATA) || + ((new->be_v_offset - old->be_v_offset == + new->be_f_offset - old->be_f_offset) && + new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set. If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be, *tmp; + sector_t end = new->be_f_offset + new->be_length; + struct list_head *list; + + dprintk("%s enter with be=%p\n", __func__, new); + print_bl_extent(new); + list = &bl->bl_extents[bl_choose_list(new->be_state)]; + print_elist(list); + + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { + if (new->be_f_offset >= be->be_f_offset + be->be_length) + break; + if (new->be_f_offset >= be->be_f_offset) { + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be*/ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + bl_put_extent(new); + return 0; + } else { + goto out_err; + } + } else { + /* |<-- be -->| + * |<-- new -->| */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } else if (end >= be->be_f_offset + be->be_length) { + /* new extent overlap existing be */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } else if (end > be->be_f_offset) { + /* |<-- be -->| + *|<-- new -->| */ + if (extents_consistent(new, be)) { + /* extend new to fully replace be */ + new->be_length += be->be_f_offset + be->be_length - + new->be_f_offset - new->be_length; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } + /* Note that if we never hit the above break, be will not point to a + * valid extent. However, in that case &be->be_node==list. + */ + list_add(&new->be_node, &be->be_node); + dprintk("%s: inserting new\n", __func__); + print_elist(list); + /* FIXME - The per-list consistency checks have all been done, + * should now check cross-list consistency. + */ + return 0; + + out_err: + bl_put_extent(new); + return -EIO; +} + +/* Returns extent, or NULL. If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID. Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extents that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read) +{ + struct pnfs_block_extent *be, *cow, *ret; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + cow = ret = NULL; + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + if (!ret) + ret = be; + else if (be->be_state != PNFS_BLOCK_READ_DATA) + bl_put_extent(be); + else + cow = be; + break; + } + } + if (ret && + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) + break; + } + spin_unlock(&bl->bl_ext_lock); + if (cow_read) + *cow_read = cow; + print_bl_extent(ret); + return ret; +} + +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ + struct pnfs_block_extent *be, *ret = NULL; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + ret = be; + break; + } + } + } + print_bl_extent(ret); + return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_short_extent *lce, *save; + unsigned int count = 0; + __be32 *p, *xdr_start; + + dprintk("%s enter\n", __func__); + /* BUG - creation of bl_commit is buggy - need to wait for + * entire block to be marked WRITTEN before it can be added. + */ + spin_lock(&bl->bl_ext_lock); + /* Want to adjust for possible truncate */ + /* We now want to adjust argument range */ + + /* XDR encode the ranges found */ + xdr_start = xdr_reserve_space(xdr, 8); + if (!xdr_start) + goto out; + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); + if (!p) + break; + p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + list_del(&lce->bse_node); + list_add_tail(&lce->bse_node, &bl->bl_committing); + bl->bl_count--; + count++; + } + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); + xdr_start[1] = cpu_to_be32(count); +out: + spin_unlock(&bl->bl_ext_lock); + dprintk("%s found %i ranges\n", __func__, count); + return 0; +} + +/* Helper function to set_to_rw that initialize a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, + struct pnfs_block_extent *orig, + sector_t offset, sector_t length, int state) +{ + kref_init(&new->be_refcnt); + /* don't need to INIT_LIST_HEAD(&new->be_node) */ + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); + new->be_mdev = orig->be_mdev; + new->be_f_offset = offset; + new->be_length = length; + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; + new->be_state = state; + new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, + struct pnfs_block_extent *storage) +{ + struct pnfs_block_extent *prev; + + if (!storage) + goto no_merge; + if (&be->be_node == head || be->be_node.prev == head) + goto no_merge; + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || + !extents_consistent(prev, be)) + goto no_merge; + _prep_new_extent(storage, prev, prev->be_f_offset, + prev->be_length + be->be_length, prev->be_state); + list_replace(&prev->be_node, &storage->be_node); + bl_put_extent(prev); + list_del(&be->be_node); + bl_put_extent(be); + return storage; + + no_merge: + kfree(storage); + return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ + u64 rv = offset + length; + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; + struct pnfs_block_extent *children[3]; + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; + int i = 0, j; + + dprintk("%s(%llu, %llu)\n", __func__, offset, length); + /* Create storage for up to three new extents e1, e2, e3 */ + e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); + e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); + e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); + /* BUG - we are ignoring any failure */ + if (!e1 || !e2 || !e3) + goto out_nosplit; + + spin_lock(&bl->bl_ext_lock); + be = bl_find_get_extent_locked(bl, offset); + rv = be->be_f_offset + be->be_length; + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { + spin_unlock(&bl->bl_ext_lock); + goto out_nosplit; + } + /* Add e* to children, bumping e*'s krefs */ + if (be->be_f_offset != offset) { + _prep_new_extent(e1, be, be->be_f_offset, + offset - be->be_f_offset, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e1; + print_bl_extent(e1); + } else + merge1 = e1; + _prep_new_extent(e2, be, offset, + min(length, be->be_f_offset + be->be_length - offset), + PNFS_BLOCK_READWRITE_DATA); + children[i++] = e2; + print_bl_extent(e2); + if (offset + length < be->be_f_offset + be->be_length) { + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, + be->be_f_offset + be->be_length - + offset - length, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e3; + print_bl_extent(e3); + } else + merge2 = e3; + + /* Remove be from list, and insert the e* */ + /* We don't get refs on e*, since this list is the base reference + * set when init'ed. + */ + if (i < 3) + children[i] = NULL; + new = children[0]; + list_replace(&be->be_node, &new->be_node); + bl_put_extent(be); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); + for (j = 1; j < i; j++) { + old = new; + new = children[j]; + list_add(&new->be_node, &old->be_node); + } + if (merge2) { + /* This is a HACK, should just create a _back_merge function */ + new = list_entry(new->be_node.next, + struct pnfs_block_extent, be_node); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); + } + spin_unlock(&bl->bl_ext_lock); + + /* Since we removed the base reference above, be is now scheduled for + * destruction. + */ + bl_put_extent(be); + dprintk("%s returns %llu after split\n", __func__, rv); + return rv; + + out_nosplit: + kfree(e1); + kfree(e2); + kfree(e3); + dprintk("%s returns %llu without splitting\n", __func__, rv); + return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status) +{ + struct pnfs_block_short_extent *lce, *save; + + dprintk("%s status %d\n", __func__, status); + list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { + if (likely(!status)) { + u64 offset = lce->bse_f_offset; + u64 end = offset + lce->bse_length; + + do { + offset = set_to_rw(bl, offset, end - offset); + } while (offset < end); + list_del(&lce->bse_node); + + kfree(lce); + } else { + list_del(&lce->bse_node); + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, lce); + spin_unlock(&bl->bl_ext_lock); + } + } +} + +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *new; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (unlikely(!new)) + return -ENOMEM; + + spin_lock_bh(&marks->im_lock); + list_add(&new->bse_node, &marks->im_extents); + spin_unlock_bh(&marks->im_lock); + + return 0; +} + +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *rv = NULL; + + spin_lock_bh(&marks->im_lock); + if (!list_empty(&marks->im_extents)) { + rv = list_entry((&marks->im_extents)->next, + struct pnfs_block_short_extent, bse_node); + list_del_init(&rv->bse_node); + } + spin_unlock_bh(&marks->im_lock); + + return rv; +} + +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) +{ + struct pnfs_block_short_extent *se = NULL, *tmp; + + if (num_to_free <= 0) + return; + + spin_lock(&marks->im_lock); + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + if (--num_to_free == 0) + break; + } + spin_unlock(&marks->im_lock); + + BUG_ON(num_to_free > 0); +} diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 00000000000..c98b439332f --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,140 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct vfsmount *mnt; + struct path path; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd); + path_put(&path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 00000000000..7cf6cafcc00 --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ + +#include <linux/completion.h> +#include <linux/sunrpc/cache.h> +#include <linux/atomic.h> + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c new file mode 100644 index 00000000000..516f3375e06 --- /dev/null +++ b/fs/nfs/callback.c @@ -0,0 +1,403 @@ +/* + * linux/fs/nfs/callback.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback handling + */ + +#include <linux/completion.h> +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/nfs_fs.h> +#include <linux/mutex.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/bc_xprt.h> + +#include <net/inet_sock.h> + +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +struct nfs_callback_data { + unsigned int users; + struct svc_serv *serv; + struct svc_rqst *rqst; + struct task_struct *task; +}; + +static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; +static DEFINE_MUTEX(nfs_callback_mutex); +static struct svc_program nfs4_callback_program; + +unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_tcpport; +unsigned short nfs_callback_tcpport6; +#define NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); + +/* + * This is the NFSv4 callback kernel thread. + */ +static int +nfs4_callback_svc(void *vrqstp) +{ + int err, preverr = 0; + struct svc_rqst *rqstp = vrqstp; + + set_freezable(); + + while (!kthread_should_stop()) { + /* + * Listen for a request on the socket + */ + err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); + if (err == -EAGAIN || err == -EINTR) { + preverr = err; + continue; + } + if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + preverr = err; + svc_process(rqstp); + } + return 0; +} + +/* + * Prepare to bring up the NFSv4 callback service + */ +struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv) +{ + int ret; + + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport, PF_INET); + + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u)\n", + nfs_callback_tcpport6, PF_INET6); + } else if (ret == -EAFNOSUPPORT) + ret = 0; + else + goto out_err; + + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + +out_err: + if (ret == 0) + ret = -ENOMEM; + return ERR_PTR(ret); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * The callback service for NFSv4.1 callbacks + */ +static int +nfs41_callback_svc(void *vrqstp) +{ + struct svc_rqst *rqstp = vrqstp; + struct svc_serv *serv = rqstp->rq_server; + struct rpc_rqst *req; + int error; + DEFINE_WAIT(wq); + + set_freezable(); + + while (!kthread_should_stop()) { + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + spin_lock_bh(&serv->sv_cb_lock); + if (!list_empty(&serv->sv_cb_list)) { + req = list_first_entry(&serv->sv_cb_list, + struct rpc_rqst, rq_bc_list); + list_del(&req->rq_bc_list); + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("Invoking bc_svc_process()\n"); + error = bc_svc_process(serv, req, rqstp); + dprintk("bc_svc_process() returned w/ error code= %d\n", + error); + } else { + spin_unlock_bh(&serv->sv_cb_lock); + schedule(); + } + finish_wait(&serv->sv_cb_waitq, &wq); + } + return 0; +} + +/* + * Bring up the NFSv4.1 callback service + */ +struct svc_rqst * +nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +{ + struct svc_rqst *rqstp; + int ret; + + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. + * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) { + rqstp = ERR_PTR(ret); + goto out; + } + + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; + + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } +out: + dprintk("--> %s return %ld\n", __func__, + IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); + return rqstp; +} + +static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, + struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + if (minorversion) { + *rqstpp = nfs41_callback_up(serv, xprt); + *callback_svc = nfs41_callback_svc; + } + return minorversion; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct nfs_callback_data *cb_info) +{ + if (minorversion) + xprt->bc_serv = cb_info->serv; +} +#else +static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, + struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + return 0; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct nfs_callback_data *cb_info) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ + struct svc_serv *serv = NULL; + struct svc_rqst *rqstp; + int (*callback_svc)(void *vrqstp); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + char svc_name[12]; + int ret = 0; + int minorversion_setup; + + mutex_lock(&nfs_callback_mutex); + if (cb_info->users++ || cb_info->task != NULL) { + nfs_callback_bc_serv(minorversion, xprt, cb_info); + goto out; + } + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + if (!serv) { + ret = -ENOMEM; + goto out_err; + } + + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, + serv, xprt, &rqstp, &callback_svc); + if (!minorversion_setup) { + /* v4.0 callback setup */ + rqstp = nfs4_callback_up(serv); + callback_svc = nfs4_callback_svc; + } + + if (IS_ERR(rqstp)) { + ret = PTR_ERR(rqstp); + goto out_err; + } + + svc_sock_update_bufs(serv); + + sprintf(svc_name, "nfsv4.%u-svc", minorversion); + cb_info->serv = serv; + cb_info->rqst = rqstp; + cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + if (IS_ERR(cb_info->task)) { + ret = PTR_ERR(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->rqst = NULL; + cb_info->task = NULL; + goto out_err; + } +out: + /* + * svc_create creates the svc_serv with sv_nrthreads == 1, and then + * svc_prepare_thread increments that. So we need to call svc_destroy + * on both success and failure so that the refcount is 1 when the + * thread exits. + */ + if (serv) + svc_destroy(serv); + mutex_unlock(&nfs_callback_mutex); + return ret; +out_err: + dprintk("NFS: Couldn't create callback socket or server thread; " + "err = %d\n", ret); + cb_info->users--; + goto out; +} + +/* + * Kill the callback thread if it's no longer being used. + */ +void nfs_callback_down(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + + mutex_lock(&nfs_callback_mutex); + cb_info->users--; + if (cb_info->users == 0 && cb_info->task != NULL) { + kthread_stop(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->serv = NULL; + cb_info->rqst = NULL; + cb_info->task = NULL; + } + mutex_unlock(&nfs_callback_mutex); +} + +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) +{ + struct rpc_clnt *r = clp->cl_rpcclient; + char *p = svc_gss_principal(rqstp); + + if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) + return 1; + + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return 0; + /* + * It might just be a normal user principal, in which case + * userspace won't bother to tell us the name at all. + */ + if (p == NULL) + return 0; + + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ + + if (memcmp(p, "nfs@", 4) != 0) + return 0; + p += 4; + if (strcmp(p, r->cl_server) != 0) + return 0; + return 1; +} + +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. + * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */ +static int nfs_callback_authenticate(struct svc_rqst *rqstp) +{ + switch (rqstp->rq_authop->flavour) { + case RPC_AUTH_NULL: + if (rqstp->rq_proc != CB_NULL) + return SVC_DROP; + break; + case RPC_AUTH_GSS: + /* No RPC_AUTH_GSS support yet in NFSv4.1 */ + if (svc_is_backchannel(rqstp)) + return SVC_DROP; + } + return SVC_OK; +} + +/* + * Define NFS4 callback program + */ +static struct svc_version *nfs4_callback_version[] = { + [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, +}; + +static struct svc_stat nfs4_callback_stats; + +static struct svc_program nfs4_callback_program = { + .pg_prog = NFS4_CALLBACK, /* RPC service number */ + .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ + .pg_vers = nfs4_callback_version, /* version table */ + .pg_name = "NFSv4 callback", /* service name */ + .pg_class = "nfs", /* authentication class */ + .pg_stats = &nfs4_callback_stats, + .pg_authenticate = nfs_callback_authenticate, +}; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h new file mode 100644 index 00000000000..c89d3b9e483 --- /dev/null +++ b/fs/nfs/callback.h @@ -0,0 +1,213 @@ +/* + * linux/fs/nfs/callback.h + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback definitions + */ +#ifndef __LINUX_FS_NFS_CALLBACK_H +#define __LINUX_FS_NFS_CALLBACK_H +#include <linux/sunrpc/svc.h> + +#define NFS4_CALLBACK 0x40000000 +#define NFS4_CALLBACK_XDRSIZE 2048 +#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) + +enum nfs4_callback_procnum { + CB_NULL = 0, + CB_COMPOUND = 1, +}; + +enum nfs4_callback_opnum { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, +/* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_ILLEGAL = 10044, +}; + +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + int slotid; +}; + +struct cb_compound_hdr_arg { + unsigned int taglen; + const char *tag; + unsigned int minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ + unsigned nops; +}; + +struct cb_compound_hdr_res { + __be32 *status; + unsigned int taglen; + const char *tag; + __be32 *nops; +}; + +struct cb_getattrargs { + struct sockaddr *addr; + struct nfs_fh fh; + uint32_t bitmap[2]; +}; + +struct cb_getattrres { + __be32 status; + uint32_t bitmap[2]; + uint64_t size; + uint64_t change_attr; + struct timespec ctime; + struct timespec mtime; +}; + +struct cb_recallargs { + struct sockaddr *addr; + struct nfs_fh fh; + nfs4_stateid stateid; + uint32_t truncate; +}; + +#if defined(CONFIG_NFS_V4_1) + +struct referring_call { + uint32_t rc_sequenceid; + uint32_t rc_slotid; +}; + +struct referring_call_list { + struct nfs4_sessionid rcl_sessionid; + uint32_t rcl_nrefcalls; + struct referring_call *rcl_refcalls; +}; + +struct cb_sequenceargs { + struct sockaddr *csa_addr; + struct nfs4_sessionid csa_sessionid; + uint32_t csa_sequenceid; + uint32_t csa_slotid; + uint32_t csa_highestslotid; + uint32_t csa_cachethis; + uint32_t csa_nrclists; + struct referring_call_list *csa_rclists; +}; + +struct cb_sequenceres { + __be32 csr_status; + struct nfs4_sessionid csr_sessionid; + uint32_t csr_sequenceid; + uint32_t csr_slotid; + uint32_t csr_highestslotid; + uint32_t csr_target_highestslotid; +}; + +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); + +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_max_slots; +}; +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; + +extern __be32 nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); + +extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); + +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct nfs4_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +struct cb_devicenotifyargs { + int ndevs; + struct cb_devicenotifyitem *devs; +}; + +extern __be32 nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps); + +#endif /* CONFIG_NFS_V4_1 */ +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); +#ifdef CONFIG_NFS_V4 +extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); +extern void nfs_callback_down(int minorversion); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4 */ +/* + * nfs41: Callbacks are expected to not cause substantial latency, + * so we limit their concurrency to 1 by setting up the maximum number + * of slots for the backchannel. + */ +#define NFS41_BC_MIN_CALLBACKS 1 +#define NFS41_BC_MAX_CALLBACKS 1 + +extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_tcpport; +extern unsigned short nfs_callback_tcpport6; + +#endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c new file mode 100644 index 00000000000..54cea8ad5a7 --- /dev/null +++ b/fs/nfs/callback_proc.c @@ -0,0 +1,576 @@ +/* + * linux/fs/nfs/callback_proc.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback procedures + */ +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/slab.h> +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "internal.h" +#include "pnfs.h" + +#ifdef NFS_DEBUG +#define NFSDBG_FACILITY NFSDBG_CALLBACK +#endif + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) +{ + struct nfs_delegation *delegation; + struct nfs_inode *nfsi; + struct inode *inode; + + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + + res->bitmap[0] = res->bitmap[1] = 0; + res->status = htonl(NFS4ERR_BADHANDLE); + + dprintk("NFS: GETATTR callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + nfsi = NFS_I(inode); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) + goto out_iput; + res->size = i_size_read(inode); + res->change_attr = delegation->change_attr; + if (nfsi->npages != 0) + res->change_attr++; + res->ctime = inode->i_ctime; + res->mtime = inode->i_mtime; + res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & + args->bitmap[0]; + res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & + args->bitmap[1]; + res->status = 0; +out_iput: + rcu_read_unlock(); + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); + return res->status; +} + +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct inode *inode; + __be32 res; + + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ + goto out; + + dprintk("NFS: RECALL callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + res = htonl(NFS4ERR_BADHANDLE); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) + goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + if (res != 0) + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); + return res; +} + +int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) + return 0; + return 1; +} + +#if defined(CONFIG_NFS_V4_1) + +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct nfs_server *server; + struct pnfs_layout_hdr *lo; + struct inode *ino; + bool found = false; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + if (found) + break; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (!found) + return NFS4ERR_NOMATCHING_LAYOUT; + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + mark_matching_lsegs_invalid(lo, &free_me_list, + &args->cbl_range)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct nfs_server *server; + struct pnfs_layout_hdr *lo; + struct inode *ino; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + struct pnfs_layout_hdr *tmp; + LIST_HEAD(recall_list); + LIST_HEAD(free_me_list); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if ((args->cbl_recall_type == RETURN_FSID) && + memcmp(&server->fsid, &args->cbl_fsid, + sizeof(struct nfs_fsid))) + continue; + + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(lo, tmp, + &recall_list, plh_bulk_recall) { + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) + rv = NFS4ERR_DELAY; + list_del_init(&lo->plh_bulk_recall); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + put_layout_hdr(lo); + iput(ino); + } + return rv; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res = NFS4ERR_DELAY; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) + goto out; + if (args->cbl_recall_type == RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); +out: + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? */ + do_callback_layoutrecall(clp, &args); +} + +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps) +{ + int i; + __be32 res = 0; + struct nfs_client *clp = cps->clp; + struct nfs_server *server = NULL; + + dprintk("%s: -->\n", __func__); + + if (!clp) { + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + goto out; + } + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + if (!server || + server->pnfs_curr_ld->id != dev->cbd_layout_type) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (server->pnfs_curr_ld && + server->pnfs_curr_ld->id == dev->cbd_layout_type) { + rcu_read_unlock(); + goto found; + } + rcu_read_unlock(); + dprintk("%s: layout type %u not found\n", + __func__, dev->cbd_layout_type); + continue; + } + + found: + if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) + dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " + "deleting instead\n", __func__); + nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + } + +out: + kfree(args->devs); + dprintk("%s: exit with status = %u\n", + __func__, be32_to_cpu(res)); + return res; +} + +int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL) + return 0; + + if (stateid->stateid.seqid != 0) + return 0; + if (memcmp(&delegation->stateid.stateid.other, + &stateid->stateid.other, + NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +} + +/* + * Validate the sequenceID sent by the server. + * Return success if the sequenceID is one more than what we last saw on + * this slot, accounting for wraparound. Increments the slot's sequence. + * + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now + * since we only currently implement idempotent callbacks anyway. + * + * We have a single slot backchannel at this time, so we don't bother + * checking the used_slots bit array on the table. The lower layer guarantees + * a single outstanding callback request at a time. + */ +static __be32 +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) +{ + struct nfs4_slot *slot; + + dprintk("%s enter. slotid %d seqid %d\n", + __func__, args->csa_slotid, args->csa_sequenceid); + + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) + return htonl(NFS4ERR_BADSLOT); + + slot = tbl->slots + args->csa_slotid; + dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); + + /* Normal */ + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { + slot->seq_nr++; + goto out_ok; + } + + /* Replay */ + if (args->csa_sequenceid == slot->seq_nr) { + dprintk("%s seqid %d is a replay\n", + __func__, args->csa_sequenceid); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + } + + /* Wraparound */ + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { + slot->seq_nr = 1; + goto out_ok; + } + + /* Misordered request */ + return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); +} + +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. + */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *tbl; + struct nfs_client *clp; + int i; + __be32 status = htonl(NFS4ERR_BADSESSION); + + clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); + if (clp == NULL) + goto out; + + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* state manager is resetting the session */ + if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. + */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); + goto out; + } + + status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + + cps->slotid = args->csa_slotid; + + /* + * Check for pending referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + +out: + cps->clp = clp; /* put in nfs4_callback_compound */ + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else + res->csr_status = status; + + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; +} + +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) +{ + __be32 status; + fmode_t flags = 0; + + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + + status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); + if (flags) + nfs_expire_all_delegation_types(cps->clp, flags); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +/* Reduce the fore channel's max_slots to the target value */ +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *fc_tbl; + __be32 status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_max_slots); + + fc_tbl = &cps->clp->cl_session->fc_slot_table; + + status = htonl(NFS4ERR_BAD_HIGH_SLOT); + if (args->crsa_target_max_slots > fc_tbl->max_slots || + args->crsa_target_max_slots < 1) + goto out; + + status = htonl(NFS4_OK); + if (args->crsa_target_max_slots == fc_tbl->max_slots) + goto out; + + fc_tbl->target_max_slots = args->crsa_target_max_slots; + nfs41_handle_recall_slot(cps->clp); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c new file mode 100644 index 00000000000..2b2d33b33d0 --- /dev/null +++ b/fs/nfs/callback_xdr.c @@ -0,0 +1,997 @@ +/* + * linux/fs/nfs/callback_xdr.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFSv4 callback encode/decode procedures + */ +#include <linux/kernel.h> +#include <linux/sunrpc/svc.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/ratelimit.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/sunrpc/bc_xprt.h> +#include "nfs4_fs.h" +#include "callback.h" +#include "internal.h" + +#define CB_OP_TAGLEN_MAXSZ (512) +#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) +#define CB_OP_GETATTR_BITMAP_MAXSZ (4) +#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + CB_OP_GETATTR_BITMAP_MAXSZ + \ + 2 + 2 + 3 + 3) +#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + +#if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#endif /* CONFIG_NFS_V4_1 */ + +#define NFSDBG_FACILITY NFSDBG_CALLBACK + +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); +typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); +typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); + + +struct callback_op { + callback_process_op_t process_op; + callback_decode_arg_t decode_args; + callback_encode_res_t encode_res; + long res_maxsize; +}; + +static struct callback_op callback_ops[]; + +static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return htonl(NFS4_OK); +} + +static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_argsize_check(rqstp, p); +} + +static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + +static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, nbytes); + if (unlikely(p == NULL)) + printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); + return p; +} + +static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *len = ntohl(*p); + + if (*len != 0) { + p = read_buf(xdr, *len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *str = (const char *)p; + } else + *str = NULL; + + return 0; +} + +static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + fh->size = ntohl(*p); + if (fh->size > NFS4_FHSIZE) + return htonl(NFS4ERR_BADHANDLE); + p = read_buf(xdr, fh->size); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(&fh->data[0], p, fh->size); + memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); + return 0; +} + +static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + __be32 *p; + unsigned int attrlen; + + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + attrlen = ntohl(*p); + p = read_buf(xdr, attrlen << 2); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + if (likely(attrlen > 0)) + bitmap[0] = ntohl(*p++); + if (attrlen > 1) + bitmap[1] = ntohl(*p); + return 0; +} + +static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + memcpy(stateid->data, p, 16); + return 0; +} + +static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) +{ + __be32 *p; + __be32 status; + + status = decode_string(xdr, &hdr->taglen, &hdr->tag); + if (unlikely(status != 0)) + return status; + /* We do not like overly long tags! */ + if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { + printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", + __func__, hdr->taglen); + return htonl(NFS4ERR_RESOURCE); + } + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + hdr->minorversion = ntohl(*p++); + /* Check minor version is zero or one. */ + if (hdr->minorversion <= 1) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ + } else { + pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " + "illegal minor version %u!\n", + __func__, hdr->minorversion); + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } + hdr->nops = ntohl(*p); + dprintk("%s: minorversion %d nops %d\n", __func__, + hdr->minorversion, hdr->nops); + return 0; +} + +static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) +{ + __be32 *p; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *op = ntohl(*p); + return 0; +} + +static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->fh); + if (unlikely(status != 0)) + goto out; + args->addr = svc_addr(rqstp); + status = decode_bitmap(xdr, args->bitmap); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) +{ + __be32 *p; + __be32 status; + + args->addr = svc_addr(rqstp); + status = decode_stateid(xdr, &args->stateid); + if (unlikely(status != 0)) + goto out; + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_RESOURCE); + goto out; + } + args->truncate = ntohl(*p); + status = decode_fh(xdr, &args->fh); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Depite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unusable and ignored with the other types. + */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + /* Num of device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + if (n > ULONG_MAX / sizeof(*args->devs)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + if (!args->devs) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_DEVICEID4_SIZE + 8)) || + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +err: + kfree(args->devs); + goto out; +} + +static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(sid->data, p, len); + return 0; +} + +static __be32 decode_rc_list(struct xdr_stream *xdr, + struct referring_call_list *rc_list) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &rc_list->rcl_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + rc_list->rcl_nrefcalls = ntohl(*p++); + if (rc_list->rcl_nrefcalls) { + p = read_buf(xdr, + rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * + sizeof(*rc_list->rcl_refcalls), + GFP_KERNEL); + if (unlikely(rc_list->rcl_refcalls == NULL)) + goto out; + for (i = 0; i < rc_list->rcl_nrefcalls; i++) { + rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++); + rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++); + } + } + status = 0; + +out: + return status; +} + +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_sequenceargs *args) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &args->csa_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, 5 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + args->csa_addr = svc_addr(rqstp); + args->csa_sequenceid = ntohl(*p++); + args->csa_slotid = ntohl(*p++); + args->csa_highestslotid = ntohl(*p++); + args->csa_cachethis = ntohl(*p++); + args->csa_nrclists = ntohl(*p++); + args->csa_rclists = NULL; + if (args->csa_nrclists) { + args->csa_rclists = kmalloc(args->csa_nrclists * + sizeof(*args->csa_rclists), + GFP_KERNEL); + if (unlikely(args->csa_rclists == NULL)) + goto out; + + for (i = 0; i < args->csa_nrclists; i++) { + status = decode_rc_list(xdr, &args->csa_rclists[i]); + if (status) + goto out_free; + } + } + status = 0; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " + "highestslotid %u cachethis %d nrclists %u\n", + __func__, + ((u32 *)&args->csa_sessionid)[0], + ((u32 *)&args->csa_sessionid)[1], + ((u32 *)&args->csa_sessionid)[2], + ((u32 *)&args->csa_sessionid)[3], + args->csa_sequenceid, args->csa_slotid, + args->csa_highestslotid, args->csa_cachethis, + args->csa_nrclists); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; + +out_free: + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + goto out; +} + +static __be32 decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + uint32_t bitmap[2]; + __be32 *p, status; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; + + return 0; +} + +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_max_slots = ntohl(*p++); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + xdr_encode_opaque(p, str, len); + return 0; +} + +#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) +#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +{ + __be32 bm[2]; + __be32 *p; + + bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); + bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); + if (bm[1] != 0) { + p = xdr_reserve_space(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(2); + *p++ = bm[0]; + *p++ = bm[1]; + } else if (bm[0] != 0) { + p = xdr_reserve_space(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(1); + *p++ = bm[0]; + } else { + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + *p++ = htonl(0); + } + *savep = p; + return 0; +} + +static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, change); + return 0; +} + +static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) +{ + __be32 *p; + + if (!(bitmap[0] & FATTR4_WORD0_SIZE)) + return 0; + p = xdr_reserve_space(xdr, 8); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, size); + return 0; +} + +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + if (unlikely(!p)) + return htonl(NFS4ERR_RESOURCE); + p = xdr_encode_hyper(p, time->tv_sec); + *p = htonl(time->tv_nsec); + return 0; +} + +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) +{ + __be32 status; + + hdr->status = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->status == NULL)) + return htonl(NFS4ERR_RESOURCE); + status = encode_string(xdr, hdr->taglen, hdr->tag); + if (unlikely(status != 0)) + return status; + hdr->nops = xdr_reserve_space(xdr, 4); + if (unlikely(hdr->nops == NULL)) + return htonl(NFS4ERR_RESOURCE); + return 0; +} + +static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE_HDR); + *p++ = htonl(op); + *p = res; + return 0; +} + +static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) +{ + __be32 *savep = NULL; + __be32 status = res->status; + + if (unlikely(status != 0)) + goto out; + status = encode_attr_bitmap(xdr, res->bitmap, &savep); + if (unlikely(status != 0)) + goto out; + status = encode_attr_change(xdr, res->bitmap, res->change_attr); + if (unlikely(status != 0)) + goto out; + status = encode_attr_size(xdr, res->bitmap, res->size); + if (unlikely(status != 0)) + goto out; + status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); + *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 encode_sessionid(struct xdr_stream *xdr, + const struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = xdr_reserve_space(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(p, sid, len); + return 0; +} + +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + const struct cb_sequenceres *res) +{ + __be32 *p; + unsigned status = res->csr_status; + + if (unlikely(status != 0)) + goto out; + + encode_sessionid(xdr, &res->csr_sessionid); + + p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + *p++ = htonl(res->csr_sequenceid); + *p++ = htonl(res->csr_slotid); + *p++ = htonl(res->csr_highestslotid); + *p++ = htonl(res->csr_target_highestslotid); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + if (op_nr == OP_CB_SEQUENCE) { + if (nop != 0) + return htonl(NFS4ERR_SEQUENCE_POS); + } else { + if (nop == 0) + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + } + + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: + case OP_CB_WANTS_CANCELLED: + case OP_CB_NOTIFY_LOCK: + return htonl(NFS4ERR_NOTSUPP); + + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. + * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid = -1; + nfs4_check_drain_bc_complete(session); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); +} + +#else /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) +{ + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + *op = &callback_ops[op_nr]; + break; + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static __be32 process_op(uint32_t minorversion, int nop, + struct svc_rqst *rqstp, + struct xdr_stream *xdr_in, void *argp, + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) +{ + struct callback_op *op = &callback_ops[0]; + unsigned int op_nr; + __be32 status; + long maxlen; + __be32 res; + + dprintk("%s: start\n", __func__); + status = decode_op_hdr(xdr_in, &op_nr); + if (unlikely(status)) + return status; + + dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", + __func__, minorversion, nop, op_nr); + + status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) : + preprocess_nfs4_op(op_nr, &op); + if (status == htonl(NFS4ERR_OP_ILLEGAL)) + op_nr = OP_CB_ILLEGAL; + if (status) + goto encode_hdr; + + if (cps->drc_status) { + status = cps->drc_status; + goto encode_hdr; + } + + maxlen = xdr_out->end - xdr_out->p; + if (maxlen > 0 && maxlen < PAGE_SIZE) { + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) + status = op->process_op(argp, resp, cps); + } else + status = htonl(NFS4ERR_RESOURCE); + +encode_hdr: + res = encode_op_hdr(xdr_out, op_nr, status); + if (unlikely(res)) + return res; + if (op->encode_res != NULL && status == 0) + status = op->encode_res(rqstp, xdr_out, resp); + dprintk("%s: done, status = %d\n", __func__, ntohl(status)); + return status; +} + +/* + * Decode, process and encode a COMPOUND + */ +static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) +{ + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; + struct xdr_stream xdr_in, xdr_out; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + .slotid = -1, + }; + unsigned int nops = 0; + + dprintk("%s: start\n", __func__); + + xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + + p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + + status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + if (status == __constant_htonl(NFS4ERR_RESOURCE)) + return rpc_garbage_args; + + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + return rpc_drop_reply; + } + + hdr_res.taglen = hdr_arg.taglen; + hdr_res.tag = hdr_arg.tag; + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + return rpc_system_err; + + while (status == 0 && nops != hdr_arg.nops) { + status = process_op(hdr_arg.minorversion, nops, rqstp, + &xdr_in, argp, &xdr_out, resp, &cps); + nops++; + } + + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + + *hdr_res.status = status; + *hdr_res.nops = htonl(nops); + nfs4_cb_free_slot(&cps); + nfs_put_client(cps.clp); + dprintk("%s: done, status = %u\n", __func__, ntohl(status)); + return rpc_success; +} + +/* + * Define NFS4 callback COMPOUND ops. + */ +static struct callback_op callback_ops[] = { + [0] = { + .res_maxsize = CB_OP_HDR_RES_MAXSZ, + }, + [OP_CB_GETATTR] = { + .process_op = (callback_process_op_t)nfs4_callback_getattr, + .decode_args = (callback_decode_arg_t)decode_getattr_args, + .encode_res = (callback_encode_res_t)encode_getattr_res, + .res_maxsize = CB_OP_GETATTR_RES_MAXSZ, + }, + [OP_CB_RECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_recall, + .decode_args = (callback_decode_arg_t)decode_recall_args, + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, +#if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, + .encode_res = (callback_encode_res_t)encode_cb_sequence_res, + .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, + }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, +#endif /* CONFIG_NFS_V4_1 */ +}; + +/* + * Define NFS4 callback procedures + */ +static struct svc_procedure nfs4_callback_procedures1[] = { + [CB_NULL] = { + .pc_func = nfs4_callback_null, + .pc_decode = (kxdrproc_t)nfs4_decode_void, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_xdrressize = 1, + }, + [CB_COMPOUND] = { + .pc_func = nfs4_callback_compound, + .pc_encode = (kxdrproc_t)nfs4_encode_void, + .pc_argsize = 256, + .pc_ressize = 256, + .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + } +}; + +struct svc_version nfs4_callback_version1 = { + .vs_vers = 1, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; + +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; diff --git a/fs/nfs/client.c b/fs/nfs/client.c new file mode 100644 index 00000000000..31778f74357 --- /dev/null +++ b/fs/nfs/client.c @@ -0,0 +1,2019 @@ +/* client.c: NFS client sharing and management code + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/metrics.h> +#include <linux/sunrpc/xprtsock.h> +#include <linux/sunrpc/xprtrdma.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/lockd/bind.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/nfs_idmap.h> +#include <linux/vfs.h> +#include <linux/inet.h> +#include <linux/in6.h> +#include <linux/slab.h> +#include <net/ipv6.h> +#include <linux/nfs_xdr.h> +#include <linux/sunrpc/bc_xprt.h> + +#include <asm/system.h> + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +static DEFINE_SPINLOCK(nfs_client_lock); +static LIST_HEAD(nfs_client_list); +static LIST_HEAD(nfs_volume_list); +static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +#ifdef CONFIG_NFS_V4 +static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */ + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; +retry: + if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) + return -ENOMEM; + spin_lock(&nfs_client_lock); + ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); + spin_unlock(&nfs_client_lock); + if (ret == -EAGAIN) + goto retry; + return ret; +} +#endif /* CONFIG_NFS_V4 */ + +/* + * Turn off NFSv4 uid/gid mapping when using AUTH_SYS + */ +static bool nfs4_disable_idmapping = true; + +/* + * RPC cruft for NFS + */ +static struct rpc_version *nfs_version[5] = { + [2] = &nfs_version2, +#ifdef CONFIG_NFS_V3 + [3] = &nfs_version3, +#endif +#ifdef CONFIG_NFS_V4 + [4] = &nfs_version4, +#endif +}; + +struct rpc_program nfs_program = { + .name = "nfs", + .number = NFS_PROGRAM, + .nrvers = ARRAY_SIZE(nfs_version), + .version = nfs_version, + .stats = &nfs_rpcstat, + .pipe_dir_name = NFS_PIPE_DIRNAME, +}; + +struct rpc_stat nfs_rpcstat = { + .program = &nfs_program +}; + + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static struct rpc_version * nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; +#endif /* CONFIG_NFS_V3_ACL */ + +struct nfs_client_initdata { + const char *hostname; + const struct sockaddr *addr; + size_t addrlen; + const struct nfs_rpc_ops *rpc_ops; + int proto; + u32 minorversion; +}; + +/* + * Allocate a shared client record + * + * Since these are allocated/deallocated very rarely, we don't + * bother putting them in a slab cache... + */ +static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +{ + struct nfs_client *clp; + struct rpc_cred *cred; + int err = -ENOMEM; + + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) + goto error_0; + + clp->rpc_ops = cl_init->rpc_ops; + + atomic_set(&clp->cl_count, 1); + clp->cl_cons_state = NFS_CS_INITING; + + memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); + clp->cl_addrlen = cl_init->addrlen; + + if (cl_init->hostname) { + err = -ENOMEM; + clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); + if (!clp->cl_hostname) + goto error_cleanup; + } + + INIT_LIST_HEAD(&clp->cl_superblocks); + clp->cl_rpcclient = ERR_PTR(-EINVAL); + + clp->cl_proto = cl_init->proto; + +#ifdef CONFIG_NFS_V4 + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error_cleanup; + + spin_lock_init(&clp->cl_lock); + INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; +#endif + cred = rpc_lookup_machine_cred("*"); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); + + return clp; + +error_cleanup: + kfree(clp); +error_0: + return ERR_PTR(err); +} + +#ifdef CONFIG_NFS_V4 +#ifdef CONFIG_NFS_V4_1 +static void nfs4_shutdown_session(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + nfs4_destroy_session(clp->cl_session); +} +#else /* CONFIG_NFS_V4_1 */ +static void nfs4_shutdown_session(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_mvops->minor_version); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + nfs4_shutdown_session(clp); + nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); +} + +/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ +void nfs_cleanup_cb_ident_idr(void) +{ + idr_destroy(&cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + if (clp->cl_cb_ident) + idr_remove(&cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs4_purge_state_owners(server); +} + +#else +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +} + +void nfs_cleanup_cb_ident_idr(void) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server) +{ +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Destroy a shared client record + */ +static void nfs_free_client(struct nfs_client *clp) +{ + dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); + + nfs4_shutdown_client(clp); + + nfs_fscache_release_client_cookie(clp); + + /* -EIO all pending I/O */ + if (!IS_ERR(clp->cl_rpcclient)) + rpc_shutdown_client(clp->cl_rpcclient); + + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + + nfs4_deviceid_purge_client(clp); + + kfree(clp->cl_hostname); + kfree(clp->server_scope); + kfree(clp); + + dprintk("<-- nfs_free_client()\n"); +} + +/* + * Release a reference to a shared client record + */ +void nfs_put_client(struct nfs_client *clp) +{ + if (!clp) + return; + + dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + + if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { + list_del(&clp->cl_share_link); + nfs_cb_idr_remove_locked(clp); + spin_unlock(&nfs_client_lock); + + BUG_ON(!list_empty(&clp->cl_superblocks)); + + nfs_free_client(clp); + } +} +EXPORT_SYMBOL_GPL(nfs_put_client); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; + + return 1; +} +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + return 0; +} +#endif + +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; + + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} + +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; + + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, excluding the port number. + */ +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6(sa1, sa2); + } + return 0; +} + +/* Common match routine for v4.0 and v4.1 callback services */ +bool +nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, + u32 minorversion) +{ + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; + + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; + + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; + + return true; +} + +/* + * Find an nfs_client on the list that matches the initialisation data + * that is supplied. + */ +static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) +{ + struct nfs_client *clp; + const struct sockaddr *sap = data->addr; + + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + /* Don't match clients that failed to initialise properly */ + if (clp->cl_cons_state < 0) + continue; + + /* Different NFS versions cannot share the same nfs_client */ + if (clp->rpc_ops != data->rpc_ops) + continue; + + if (clp->cl_proto != data->proto) + continue; + /* Match nfsv4 minorversion */ + if (clp->cl_minorversion != data->minorversion) + continue; + /* Match the full socket address */ + if (!nfs_sockaddr_cmp(sap, clap)) + continue; + + atomic_inc(&clp->cl_count); + return clp; + } + return NULL; +} + +/* + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist + */ +static struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) +{ + struct nfs_client *clp, *new = NULL; + int error; + + dprintk("--> nfs_get_client(%s,v%u)\n", + cl_init->hostname ?: "", cl_init->rpc_ops->version); + + /* see if the client already exists */ + do { + spin_lock(&nfs_client_lock); + + clp = nfs_match_client(cl_init); + if (clp) + goto found_client; + if (new) + goto install_client; + + spin_unlock(&nfs_client_lock); + + new = nfs_alloc_client(cl_init); + } while (!IS_ERR(new)); + + dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new)); + return new; + + /* install a new client and return with it unready */ +install_client: + clp = new; + list_add(&clp->cl_share_link, &nfs_client_list); + spin_unlock(&nfs_client_lock); + + error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, + authflavour, noresvport); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(error); + } + dprintk("--> nfs_get_client() = %p [new]\n", clp); + return clp; + + /* found an existing client + * - make sure it's ready before returning + */ +found_client: + spin_unlock(&nfs_client_lock); + + if (new) + nfs_free_client(new); + + error = wait_event_killable(nfs_client_active_wq, + clp->cl_cons_state < NFS_CS_INITING); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(-ERESTARTSYS); + } + + if (clp->cl_cons_state < NFS_CS_READY) { + error = clp->cl_cons_state; + nfs_put_client(clp); + return ERR_PTR(error); + } + + BUG_ON(clp->cl_cons_state != NFS_CS_READY); + + dprintk("--> nfs_get_client() = %p [share]\n", clp); + return clp; +} + +/* + * Mark a server as ready or failed + */ +void nfs_mark_client_ready(struct nfs_client *clp, int state) +{ + clp->cl_cons_state = state; + wake_up_all(&nfs_client_active_wq); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. + */ +int nfs4_check_client_ready(struct nfs_client *clp) +{ + if (!nfs4_has_session(clp)) + return 0; + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + return 0; +} + +/* + * Initialise the timeout values for a connection + */ +static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, + unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + + switch (proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; + if (to->to_initval == 0) + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) + to->to_maxval = NFS_MAX_TCP_TIMEOUT; + if (to->to_maxval < to->to_initval) + to->to_maxval = to->to_initval; + to->to_exponential = 0; + break; + case XPRT_TRANSPORT_UDP: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; + if (!to->to_initval) + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + default: + BUG(); + } +} + +/* + * Create an RPC client handle + */ +static int nfs_create_rpc_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + rpc_authflavor_t flavor, + int discrtry, int noresvport) +{ + struct rpc_clnt *clnt = NULL; + struct rpc_create_args args = { + .net = &init_net, + .protocol = clp->cl_proto, + .address = (struct sockaddr *)&clp->cl_addr, + .addrsize = clp->cl_addrlen, + .timeout = timeparms, + .servername = clp->cl_hostname, + .program = &nfs_program, + .version = clp->rpc_ops->version, + .authflavor = flavor, + }; + + if (discrtry) + args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + if (!IS_ERR(clp->cl_rpcclient)) + return 0; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. Error = %ld\n", + __func__, PTR_ERR(clnt)); + return PTR_ERR(clnt); + } + + clp->cl_rpcclient = clnt; + return 0; +} + +/* + * Version 2 or 3 client destruction + */ +static void nfs_destroy_server(struct nfs_server *server) +{ + if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || + !(server->flags & NFS_MOUNT_LOCAL_FCNTL)) + nlmclnt_done(server->nlm_host); +} + +/* + * Version 2 or 3 lockd setup + */ +static int nfs_start_lockd(struct nfs_server *server) +{ + struct nlm_host *host; + struct nfs_client *clp = server->nfs_client; + struct nlmclnt_initdata nlm_init = { + .hostname = clp->cl_hostname, + .address = (struct sockaddr *)&clp->cl_addr, + .addrlen = clp->cl_addrlen, + .nfs_version = clp->rpc_ops->version, + .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? + 1 : 0, + }; + + if (nlm_init.nfs_version > 3) + return 0; + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) + return 0; + + switch (clp->cl_proto) { + default: + nlm_init.protocol = IPPROTO_TCP; + break; + case XPRT_TRANSPORT_UDP: + nlm_init.protocol = IPPROTO_UDP; + } + + host = nlmclnt_init(&nlm_init); + if (IS_ERR(host)) + return PTR_ERR(host); + + server->nlm_host = host; + server->destroy = nfs_destroy_server; + return 0; +} + +/* + * Initialise an NFSv3 ACL client connection + */ +#ifdef CONFIG_NFS_V3_ACL +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->nfs_client->rpc_ops->version != 3) + goto out_noacl; + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +/* + * Create a general RPC client + */ +static int nfs_init_server_rpcclient(struct nfs_server *server, + const struct rpc_timeout *timeo, + rpc_authflavor_t pseudoflavour) +{ + struct nfs_client *clp = server->nfs_client; + + server->client = rpc_clone_client(clp->cl_rpcclient); + if (IS_ERR(server->client)) { + dprintk("%s: couldn't create rpc_client!\n", __func__); + return PTR_ERR(server->client); + } + + memcpy(&server->client->cl_timeout_default, + timeo, + sizeof(server->client->cl_timeout_default)); + server->client->cl_timeout = &server->client->cl_timeout_default; + + if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { + struct rpc_auth *auth; + + auth = rpcauth_create(pseudoflavour, server->client); + if (IS_ERR(auth)) { + dprintk("%s: couldn't create credcache!\n", __func__); + return PTR_ERR(auth); + } + } + server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + + return 0; +} + +/* + * Initialise an NFS2 or NFS3 client + */ +int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is already initialised */ + dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* + * Create a client RPC handle for doing FSSTAT with UNIX auth only + * - RFC 2623, sec 2.3.2 + */ + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, + 0, noresvport); + if (error < 0) + goto error; + nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs_init_client() = xerror %d\n", error); + return error; +} + +/* + * Create a version 2 or 3 client + */ +static int nfs_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct nfs_client_initdata cl_init = { + .hostname = data->nfs_server.hostname, + .addr = (const struct sockaddr *)&data->nfs_server.address, + .addrlen = data->nfs_server.addrlen, + .rpc_ops = &nfs_v2_clientops, + .proto = data->nfs_server.protocol, + }; + struct rpc_timeout timeparms; + struct nfs_client *clp; + int error; + + dprintk("--> nfs_init_server()\n"); + +#ifdef CONFIG_NFS_V3 + if (data->version == 3) + cl_init.rpc_ops = &nfs_v3_clientops; +#endif + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, + data->flags & NFS_MOUNT_NORESVPORT); + if (IS_ERR(clp)) { + dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + return PTR_ERR(clp); + } + + server->nfs_client = clp; + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + /* Start lockd here, before we might error out */ + error = nfs_start_lockd(server); + if (error < 0) + goto error; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + if (error < 0) + goto error; + + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + + server->namelen = data->namlen; + /* Create a client RPC handle for the NFSv3 ACL management interface */ + nfs_init_server_aclclient(server); + dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); + return 0; + +error: + server->nfs_client = NULL; + nfs_put_client(clp); + dprintk("<-- nfs_init_server() = xerror %d\n", error); + return error; +} + +/* + * Load up the server record from information gained in an fsinfo record + */ +static void nfs_server_set_fsinfo(struct nfs_server *server, + struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) +{ + unsigned long max_rpc_payload; + + /* Work out a lot of parameters */ + if (server->rsize == 0) + server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + if (server->wsize == 0) + server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + + if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) + server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) + server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + + max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); + if (server->rsize > max_rpc_payload) + server->rsize = max_rpc_payload; + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + server->backing_dev_info.name = "nfs"; + server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->pnfs_blksize = fsinfo->blksize; + set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); + + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + if (server->flags & NFS_MOUNT_NOAC) { + server->acregmin = server->acregmax = 0; + server->acdirmin = server->acdirmax = 0; + } + + server->maxfilesize = fsinfo->maxfilesize; + + server->time_delta = fsinfo->time_delta; + + /* We're airborne Set socket buffersize */ + rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); +} + +/* + * Probe filesystem information, including the FSID on v2/v3 + */ +static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +{ + struct nfs_fsinfo fsinfo; + struct nfs_client *clp = server->nfs_client; + int error; + + dprintk("--> nfs_probe_fsinfo()\n"); + + if (clp->rpc_ops->set_capabilities != NULL) { + error = clp->rpc_ops->set_capabilities(server, mntfh); + if (error < 0) + goto out_error; + } + + fsinfo.fattr = fattr; + fsinfo.layouttype = 0; + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; + + nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { + struct nfs_pathconf pathinfo; + + pathinfo.fattr = fattr; + nfs_fattr_init(fattr); + + if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) + server->namelen = pathinfo.max_namelen; + } + + dprintk("<-- nfs_probe_fsinfo() = 0\n"); + return 0; + +out_error: + dprintk("nfs_probe_fsinfo: error = %d\n", -error); + return error; +} + +/* + * Copy useful information when duplicating a server record + */ +static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +{ + target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; + target->acregmin = source->acregmin; + target->acregmax = source->acregmax; + target->acdirmin = source->acdirmin; + target->acdirmax = source->acdirmax; + target->caps = source->caps; + target->options = source->options; +} + +static void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + spin_unlock(&nfs_client_lock); + +} + +static void nfs_server_remove_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&nfs_client_lock); + list_del_rcu(&server->client_link); + if (clp && list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + list_del(&server->master_link); + spin_unlock(&nfs_client_lock); + + synchronize_rcu(); +} + +/* + * Allocate and initialise a server record + */ +static struct nfs_server *nfs_alloc_server(void) +{ + struct nfs_server *server; + + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + return NULL; + + server->client = server->client_acl = ERR_PTR(-EINVAL); + + /* Zero out the NFS state stuff */ + INIT_LIST_HEAD(&server->client_link); + INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); + INIT_LIST_HEAD(&server->layouts); + INIT_LIST_HEAD(&server->state_owners_lru); + + atomic_set(&server->active, 0); + + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + kfree(server); + return NULL; + } + + if (bdi_init(&server->backing_dev_info)) { + nfs_free_iostats(server->io_stats); + kfree(server); + return NULL; + } + + pnfs_init_server(server); + + return server; +} + +/* + * Free up a server record + */ +void nfs_free_server(struct nfs_server *server) +{ + dprintk("--> nfs_free_server()\n"); + + nfs_server_remove_lists(server); + unset_pnfs_layoutdriver(server); + + if (server->destroy != NULL) + server->destroy(server); + + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); + if (!IS_ERR(server->client)) + rpc_shutdown_client(server->client); + + nfs_put_client(server->nfs_client); + + nfs_free_iostats(server->io_stats); + bdi_destroy(&server->backing_dev_info); + kfree(server); + nfs_release_automount_timer(); + dprintk("<-- nfs_free_server()\n"); +} + +/* + * Create a version 2 or 3 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + struct nfs_fattr *fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + + /* Get a client representation */ + error = nfs_init_server(server, data); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto error; + if (server->nfs_client->rpc_ops->version == 3) { + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + + if (!(fattr->valid & NFS_ATTR_FATTR)) { + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + nfs_free_fattr(fattr); + return server; + +error: + nfs_free_fattr(fattr); + nfs_free_server(server); + return ERR_PTR(error); +} + +#ifdef CONFIG_NFS_V4 +/* + * NFSv4.0 callback thread helper + * + * Find a client by IP address, protocol version, and minorversion + * + * Called from the pg_authenticate method. The callback identifier + * is not used as it has not been decoded. + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_no_ident(const struct sockaddr *addr) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 0) == false) + continue; + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(int cb_ident) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = idr_find(&cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, 1) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nfs_client_lock); + return clp; + } + spin_unlock(&nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *addr, + struct nfs4_sessionid *sid) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ + int error; + + if (clp->rpc_ops->version == 4) { + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel( + clp->cl_rpcclient->cl_xprt, + NFS41_BC_MIN_CALLBACKS); + if (error < 0) + return error; + } + + error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + } + return 0; +} + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; + } +#endif /* CONFIG_NFS_V4_1 */ + + return nfs4_init_callback(clp); +} + +/* + * Initialise an NFS4 client record + */ +int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + error = nfs_create_rpc_client(clp, timeparms, authflavour, + 1, noresvport); + if (error < 0) + goto error; + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return error; +} + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion) +{ + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = addr, + .addrlen = addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = proto, + .minorversion = minorversion, + }; + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, + server->flags & NFS_MOUNT_NORESVPORT); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + }; + struct rpc_timeout ds_timeout = { + .to_initval = 15 * HZ, + .to_maxval = 15 * HZ, + .to_retries = 1, + .to_exponential = 1, + }; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. + */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server); + if (error < 0) + goto out; + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; +out: + nfs_free_fattr(fattr); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) +{ + struct rpc_timeout timeparms; + int error; + + dprintk("--> nfs4_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + server->options = data->options; + + /* Get a client record */ + error = nfs4_set_client(server, + data->nfs_server.hostname, + (const struct sockaddr *)&data->nfs_server.address, + data->nfs_server.addrlen, + data->client_address, + data->auth_flavors[0], + data->nfs_server.protocol, + &timeparms, + data->minorversion); + if (error < 0) + goto error; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + +error: + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* set up the general RPC client */ + error = nfs4_init_server(server, data); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *mntfh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; + + /* Get a client representation. + * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mntfh); + if (error < 0) + goto error; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Clone an NFS2, NFS3 or NFS4 server record + */ +struct nfs_server *nfs_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_fattr *fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", + (unsigned long long) fattr->fsid.major, + (unsigned long long) fattr->fsid.minor); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + error = -ENOMEM; + fattr_fsinfo = nfs_alloc_fattr(); + if (fattr_fsinfo == NULL) + goto out_free_server; + + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + server->destroy = source->destroy; + atomic_inc(&server->nfs_client->cl_count); + nfs_server_copy_userdata(server, source); + + server->fsid = fattr->fsid; + + error = nfs_init_server_rpcclient(server, + source->client->cl_timeout, + source->client->cl_auth->au_flavor); + if (error < 0) + goto out_free_server; + if (!IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ + error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + if (error < 0) + goto out_free_server; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + dprintk("Cloned FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + error = nfs_start_lockd(server); + if (error < 0) + goto out_free_server; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + + nfs_free_fattr(fattr_fsinfo); + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + +out_free_server: + nfs_free_fattr(fattr_fsinfo); + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_fs_nfs; + +static int nfs_server_list_open(struct inode *inode, struct file *file); +static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_server_list_stop(struct seq_file *p, void *v); +static int nfs_server_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_server_list_ops = { + .start = nfs_server_list_start, + .next = nfs_server_list_next, + .stop = nfs_server_list_stop, + .show = nfs_server_list_show, +}; + +static const struct file_operations nfs_server_list_fops = { + .open = nfs_server_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +static int nfs_volume_list_open(struct inode *inode, struct file *file); +static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_volume_list_stop(struct seq_file *p, void *v); +static int nfs_volume_list_show(struct seq_file *m, void *v); + +static const struct seq_operations nfs_volume_list_ops = { + .start = nfs_volume_list_start, + .next = nfs_volume_list_next, + .stop = nfs_volume_list_stop, + .show = nfs_volume_list_show, +}; + +static const struct file_operations nfs_volume_list_fops = { + .open = nfs_volume_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + +/* + * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which + * we're dealing + */ +static int nfs_server_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_server_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the server list and return the first item + */ +static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + return seq_list_start_head(&nfs_client_list, *_pos); +} + +/* + * move to next server + */ +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &nfs_client_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_server_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_server_list_show(struct seq_file *m, void *v) +{ + struct nfs_client *clp; + + /* display header on line 1 */ + if (v == &nfs_client_list) { + seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); + return 0; + } + + /* display one transport per line on subsequent lines */ + clp = list_entry(v, struct nfs_client, cl_share_link); + + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + + seq_printf(m, "v%u %s %s %3d %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + atomic_read(&clp->cl_count), + clp->cl_hostname); + + return 0; +} + +/* + * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes + */ +static int nfs_volume_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_volume_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the volume list and return the first item + */ +static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) +{ + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + return seq_list_start_head(&nfs_volume_list, *_pos); +} + +/* + * move to next volume + */ +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + return seq_list_next(v, &nfs_volume_list, pos); +} + +/* + * clean up after reading from the transports list + */ +static void nfs_volume_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_volume_list_show(struct seq_file *m, void *v) +{ + struct nfs_server *server; + struct nfs_client *clp; + char dev[8], fsid[17]; + + /* display header on line 1 */ + if (v == &nfs_volume_list) { + seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); + return 0; + } + /* display one transport per line on subsequent lines */ + server = list_entry(v, struct nfs_server, master_link); + clp = server->nfs_client; + + snprintf(dev, 8, "%u:%u", + MAJOR(server->s_dev), MINOR(server->s_dev)); + + snprintf(fsid, 17, "%llx:%llx", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + clp->rpc_ops->version, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), + dev, + fsid, + nfs_server_fscache_state(server)); + + return 0; +} + +/* + * initialise the /proc/fs/nfsfs/ directory + */ +int __init nfs_fs_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); + if (!proc_fs_nfs) + goto error_0; + + /* a file of servers with which we're dealing */ + p = proc_create("servers", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_server_list_fops); + if (!p) + goto error_1; + + /* a file of volumes that we have mounted */ + p = proc_create("volumes", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_volume_list_fops); + if (!p) + goto error_2; + return 0; + +error_2: + remove_proc_entry("servers", proc_fs_nfs); +error_1: + remove_proc_entry("fs/nfsfs", NULL); +error_0: + return -ENOMEM; +} + +/* + * clean up the /proc/fs/nfsfs/ directory + */ +void nfs_fs_proc_exit(void) +{ + remove_proc_entry("volumes", proc_fs_nfs); + remove_proc_entry("servers", proc_fs_nfs); + remove_proc_entry("fs/nfsfs", NULL); +} + +#endif /* CONFIG_PROC_FS */ + +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c new file mode 100644 index 00000000000..ac889af8ccf --- /dev/null +++ b/fs/nfs/delegation.c @@ -0,0 +1,716 @@ +/* + * linux/fs/nfs/delegation.c + * + * Copyright (C) 2004 Trond Myklebust + * + * NFS file delegation management + * + */ +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +static void nfs_free_delegation(struct nfs_delegation *delegation) +{ + if (delegation->cred) { + put_rpccred(delegation->cred); + delegation->cred = NULL; + } + kfree_rcu(delegation, rcu); +} + +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); +} + +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ +int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + struct nfs_delegation *delegation; + int ret = 0; + + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags) { + nfs_mark_delegation_referenced(delegation); + ret = 1; + } + rcu_read_unlock(); + return ret; +} + +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct file_lock *fl; + int status = 0; + + if (inode->i_flock == NULL) + goto out; + + /* Protect inode->i_flock using the file locks lock */ + lock_flocks(); + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) + continue; + if (nfs_file_open_context(fl->fl_file) != ctx) + continue; + unlock_flocks(); + status = nfs4_lock_delegation_recall(state, fl); + if (status < 0) + goto out; + lock_flocks(); + } + unlock_flocks(); +out: + return status; +} + +static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + int err; + +again: + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); + err = nfs4_open_delegation_recall(ctx, state, stateid); + if (err >= 0) + err = nfs_delegation_claim_locks(ctx, state); + put_nfs_open_context(ctx); + if (err != 0) + return err; + goto again; + } + spin_unlock(&inode->i_lock); + return 0; +} + +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * + */ +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) +{ + struct nfs_delegation *delegation; + struct rpc_cred *oldcred = NULL; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; + delegation->cred = get_rpccred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags); + NFS_I(inode)->delegation_state = delegation->type; + spin_unlock(&delegation->lock); + put_rpccred(oldcred); + rcu_read_unlock(); + } else { + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); + } + } else { + rcu_read_unlock(); + } +} + +static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + int res = 0; + + res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); + nfs_free_delegation(delegation); + return res; +} + +static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation) +{ + struct inode *inode = NULL; + + spin_lock(&delegation->lock); + if (delegation->inode != NULL) + inode = igrab(delegation->inode); + spin_unlock(&delegation->lock); + return inode; +} + +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_delegation *delegation = + rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&server->nfs_client->cl_lock)); + + if (delegation == NULL) + goto nomatch; + + spin_lock(&delegation->lock); + list_del_rcu(&delegation->super_list); + delegation->inode = NULL; + nfsi->delegation_state = 0; + rcu_assign_pointer(nfsi->delegation, NULL); + spin_unlock(&delegation->lock); + return delegation; +nomatch: + return NULL; +} + +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, server); + spin_unlock(&clp->cl_lock); + return delegation; +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. + */ +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation, *old_delegation; + struct nfs_delegation *freeme = NULL; + int status = 0; + + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = inode->i_version; + delegation->cred = get_rpccred(cred); + delegation->inode = inode; + delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + spin_lock_init(&delegation->lock); + + spin_lock(&clp->cl_lock); + old_delegation = rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + if (old_delegation != NULL) { + if (memcmp(&delegation->stateid, &old_delegation->stateid, + sizeof(old_delegation->stateid)) == 0 && + delegation->type == old_delegation->type) { + goto out; + } + /* + * Deal with broken servers that hand out two + * delegations for the same file. + */ + dfprintk(FILE, "%s: server %s handed out " + "a duplicate delegation!\n", + __func__, clp->cl_hostname); + if (delegation->type <= old_delegation->type) { + freeme = delegation; + delegation = NULL; + goto out; + } + freeme = nfs_detach_delegation_locked(nfsi, server); + } + list_add_rcu(&delegation->super_list, &server->delegations); + nfsi->delegation_state = delegation->type; + rcu_assign_pointer(nfsi->delegation, delegation); + delegation = NULL; + + /* Ensure we revalidate the attributes and page cache! */ + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + spin_unlock(&inode->i_lock); + +out: + spin_unlock(&clp->cl_lock); + if (delegation != NULL) + nfs_free_delegation(delegation); + if (freeme != NULL) + nfs_do_return_delegation(inode, freeme, 0); + return status; +} + +/* + * Basic procedure for returning a delegation to the server + */ +static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int err; + + /* + * Guard against new delegated open/lock/unlock calls and against + * state recovery + */ + down_write(&nfsi->rwsem); + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + up_write(&nfsi->rwsem); + if (err) + goto out; + + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Returns zero on success, or a negative errno value. + */ +int nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + int err = 0; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, + &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, + delegation, 0); + } + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; + } + } + rcu_read_unlock(); + return 0; +} + +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). + */ +void nfs_inode_return_delegation_noreclaim(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + + if (rcu_access_pointer(nfsi->delegation) != NULL) { + delegation = nfs_detach_delegation(nfsi, server); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); + } +} + +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * Returns zero on success, or a negative errno value. + */ +int nfs_inode_return_delegation(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int err = 0; + + if (rcu_access_pointer(nfsi->delegation) != NULL) { + delegation = nfs_detach_delegation(nfsi, server); + if (delegation != NULL) { + nfs_wb_all(inode); + err = __nfs_inode_return_delegation(inode, delegation, 1); + } + } + return err; +} + +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * + */ +void nfs_super_return_all_delegations(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + if (clp == NULL) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + } + rcu_read_unlock(); + + if (nfs_client_return_marked_delegations(clp) != 0) + nfs4_schedule_state_manager(clp); +} + +static void nfs_mark_return_all_delegation_types(struct nfs_server *server, + fmode_t flags) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) + continue; + if (delegation->type & flags) + nfs_mark_return_delegation(server, delegation); + } +} + +static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_all_delegation_types(server, flags); + rcu_read_unlock(); +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +void nfs_remove_bad_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } +} + +/** + * nfs_expire_all_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * + */ +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +{ + nfs_client_mark_return_all_delegation_types(clp, flags); + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + +/** + * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + */ +void nfs_handle_cb_pathdown(struct nfs_client *clp) +{ + if (clp == NULL) + return; + nfs_client_mark_return_all_delegations(clp); +} + +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) + continue; + nfs_mark_return_delegation(server, delegation); + } +} + +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ +void nfs_expire_unreferenced_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information from CB_RECALL arguments + * + * Returns zero on success, or a negative errno value. + */ +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } + nfs_mark_return_delegation(server, delegation); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); + return 0; +} + +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + struct nfs_delegation *delegation; + struct inode *res = NULL; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL && + nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { + res = igrab(delegation->inode); + } + spin_unlock(&delegation->lock); + if (res != NULL) + break; + } + return res; +} + +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. + */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) +{ + struct nfs_server *server; + struct inode *res = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) + break; + } + rcu_read_unlock(); + return res; +} + +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * + */ +void nfs_delegation_mark_reclaim(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); + rcu_read_unlock(); +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } + } + rcu_read_unlock(); +} + +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * + * Returns one and fills in "dst->data" * if inode had a delegation, + * otherwise zero is returned. + */ +int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + int ret = 0; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { + memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); + ret = 1; + } + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h new file mode 100644 index 00000000000..691a7960918 --- /dev/null +++ b/fs/nfs/delegation.h @@ -0,0 +1,80 @@ +/* + * linux/fs/nfs/delegation.h + * + * Copyright (c) Trond Myklebust + * + * Definitions pertaining to NFS delegated files + */ +#ifndef FS_NFS_DELEGATION_H +#define FS_NFS_DELEGATION_H + +#if defined(CONFIG_NFS_V4) +/* + * NFSv4 delegation + */ +struct nfs_delegation { + struct list_head super_list; + struct rpc_cred *cred; + struct inode *inode; + nfs4_stateid stateid; + fmode_t type; + loff_t maxsize; + __u64 change_attr; + unsigned long flags; + spinlock_t lock; + struct rcu_head rcu; +}; + +enum { + NFS_DELEGATION_NEED_RECLAIM = 0, + NFS_DELEGATION_RETURN, + NFS_DELEGATION_REFERENCED, +}; + +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_return_delegation(struct inode *inode); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_inode_return_delegation_noreclaim(struct inode *inode); + +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +void nfs_super_return_all_delegations(struct super_block *sb); +void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unreferenced_delegations(struct nfs_client *clp); +void nfs_handle_cb_pathdown(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); + +void nfs_delegation_mark_reclaim(struct nfs_client *clp); +void nfs_delegation_reap_unclaimed(struct nfs_client *clp); + +/* NFSv4 delegation-related procedures */ +int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); +int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); +int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); + +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); +int nfs_have_delegation(struct inode *inode, fmode_t flags); + +#else +static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static inline int nfs_inode_return_delegation(struct inode *inode) +{ + return 0; +} +#endif + +static inline int nfs_have_delegated_attributes(struct inode *inode) +{ + return nfs_have_delegation(inode, FMODE_READ) && + !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); +} + +#endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c new file mode 100644 index 00000000000..fd9a872fada --- /dev/null +++ b/fs/nfs/dir.c @@ -0,0 +1,2349 @@ +/* + * linux/fs/nfs/dir.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs directory handling functions + * + * 10 Apr 1996 Added silly rename for unlink --okir + * 28 Sep 1996 Improved directory cache --okir + * 23 Aug 1997 Claus Heine claus@momo.math.rwth-aachen.de + * Re-implemented silly rename for unlink, newly implemented + * silly rename for nfs_rename() following the suggestions + * of Olaf Kirch (okir) found in this file. + * Following Linus comments on my original hack, this version + * depends only on the dcache stuff and doesn't touch the inode + * layer (iput() and friends). + * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM + */ + +#include <linux/time.h> +#include <linux/errno.h> +#include <linux/stat.h> +#include <linux/fcntl.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/sched.h> +#include <linux/kmemleak.h> +#include <linux/xattr.h> + +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" + +/* #define NFS_DEBUG_VERBOSE 1 */ + +static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); +static int nfs_readdir(struct file *, void *, filldir_t); +static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); +static int nfs_mkdir(struct inode *, struct dentry *, umode_t); +static int nfs_rmdir(struct inode *, struct dentry *); +static int nfs_unlink(struct inode *, struct dentry *); +static int nfs_symlink(struct inode *, struct dentry *, const char *); +static int nfs_link(struct dentry *, struct inode *, struct dentry *); +static int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int nfs_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *); +static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); +static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static void nfs_readdir_clear_array(struct page*); + +const struct file_operations nfs_dir_operations = { + .llseek = nfs_llseek_dir, + .read = generic_read_dir, + .readdir = nfs_readdir, + .open = nfs_opendir, + .release = nfs_closedir, + .fsync = nfs_fsync_dir, +}; + +const struct inode_operations nfs_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, +}; + +#ifdef CONFIG_NFS_V3 +const struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_V3 */ + +#ifdef CONFIG_NFS_V4 + +static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); +const struct inode_operations nfs4_dir_inode_operations = { + .create = nfs_open_create, + .lookup = nfs_atomic_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, +}; + +#endif /* CONFIG_NFS_V4 */ + +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) +{ + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->attr_gencount = NFS_I(dir)->attr_gencount; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + return ctx; + } + return ERR_PTR(-ENOMEM); +} + +static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +{ + put_rpccred(ctx->cred); + kfree(ctx); +} + +/* + * Open file + */ +static int +nfs_opendir(struct inode *inode, struct file *filp) +{ + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; + + dfprintk(FILE, "NFS: open dir(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; + if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { + /* This is a mountpoint, so d_revalidate will never + * have been called, so we need to refresh the + * inode (for close-open consistency) ourselves. + */ + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } +out: + put_rpccred(cred); + return res; +} + +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->private_data); + return 0; +} + +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; + unsigned char d_type; +}; + +struct nfs_cache_array { + unsigned int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); +typedef struct { + struct file *file; + struct page *page; + unsigned long page_index; + u64 *dir_cookie; + u64 last_cookie; + loff_t current_index; + decode_dirent_t decode; + + unsigned long timestamp; + unsigned long gencount; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; +} nfs_readdir_descriptor_t; + +/* + * The caller is responsible for calling nfs_readdir_release_array(page) + */ +static +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) +{ + void *ptr; + if (page == NULL) + return ERR_PTR(-EIO); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; +} + +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +void nfs_readdir_clear_array(struct page *page) +{ + struct nfs_cache_array *array; + int i; + + array = kmap_atomic(page, KM_USER0); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + kunmap_atomic(array, KM_USER0); +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + if (string->name == NULL) + return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); + string->hash = full_name_hash(name, len); + return 0; +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + + if (IS_ERR(array)) + return PTR_ERR(array); + + cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; + array->last_cookie = entry->cookie; + array->size++; + if (entry->eof != 0) + array->eof_index = array->size; +out: + nfs_readdir_release_array(page); + return ret; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->file->f_pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index >= 0) + goto out_eof; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + loff_t new_pos; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); + struct nfs_open_dir_context *ctx = desc->file->private_data; + + new_pos = desc->current_index + i; + if (ctx->attr_gencount != nfsi->attr_gencount + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->file->f_pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop." + "Please contact your server vendor. " + "The file: %s has duplicate cookie %llu\n", + desc->file->f_dentry->d_parent->d_name.name, + desc->file->f_dentry->d_name.name, + array->array[i].string.name, + *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = -1; + } + desc->file->f_pos = new_pos; + desc->cache_entry_index = i; + return 0; + } + } + if (array->eof_index >= 0) { + status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; + } +out: + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->page_index++; + } + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) +{ + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; + unsigned long timestamp, gencount; + int error; + + again: + timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + NFS_SERVER(inode)->dtsize, desc->plus); + if (error < 0) { + /* We requested READDIRPLUS, but the server doesn't grok it */ + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + desc->plus = 0; + goto again; + } + goto error; + } + desc->timestamp = timestamp; + desc->gencount = gencount; +error: + return error; +} + +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) +{ + int error; + + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; + return 0; +} + +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) +{ + if (dentry->d_inode == NULL) + goto different; + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) + goto different; + return 1; +different: + return 0; +} + +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +{ + struct qstr filename = { + .len = entry->len, + .name = entry->name, + }; + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct inode *inode; + + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + + dentry = d_lookup(parent, &filename); + if (dentry != NULL) { + if (nfs_same_file(dentry, entry)) { + nfs_refresh_inode(dentry->d_inode, entry->fattr); + goto out; + } else { + d_drop(dentry); + dput(dentry); + } + } + + dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + if (IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + else if (alias) { + nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); +} + +/* Perform conversion from xdr to cache array */ +static +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + struct page **xdr_pages, struct page *page, unsigned int buflen) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct nfs_cache_array *array; + unsigned int count = 0; + int status; + + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) { + if (status == -EAGAIN) + status = 0; + break; + } + + count++; + + if (desc->plus != 0) + nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; + } while (!entry->eof); + + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { + array = nfs_readdir_get_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); + } + + put_page(scratch); + return status; +} + +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + nfs_readdir_free_pagearray(pages, npages); +} + +/* + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page + */ +static +int nfs_readdir_large_page(struct page **pages, unsigned int npages) +{ + unsigned int i; + + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } + return 0; + +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return -ENOMEM; +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = -ENOMEM; + unsigned int array_size = ARRAY_SIZE(pages); + + entry.prev_cookie = 0; + entry.cookie = desc->last_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; + + array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + + status = nfs_readdir_large_page(pages, array_size); + if (status < 0) + goto out_release_array; + do { + unsigned int pglen; + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + + if (status < 0) + break; + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); + + nfs_readdir_free_large_page(pages_ptr, pages, array_size); +out_release_array: + nfs_readdir_release_array(page); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); + return status; +} + +/* + * Now we cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler + */ +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +{ + struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; + + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + SetPageUptodate(page); + + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { + /* Should never happen */ + nfs_zap_mapping(inode, inode->i_mapping); + } + unlock_page(page); + return 0; + error: + unlock_page(page); + return ret; +} + +static +void cache_page_release(nfs_readdir_descriptor_t *desc) +{ + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); + page_cache_release(desc->page); + desc->page = NULL; +} + +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +{ + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); +} + +/* + * Returns 0 if desc->dir_cookie was found on page desc->page_index + */ +static +int find_cache_page(nfs_readdir_descriptor_t *desc) +{ + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); + + res = nfs_readdir_search_array(desc); + if (res != 0) + cache_page_release(desc); + return res; +} + +/* Search for desc->dir_cookie from the beginning of the page cache */ +static inline +int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +{ + int res; + + if (desc->page_index == 0) { + desc->current_index = 0; + desc->last_cookie = 0; + } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); + return res; +} + +/* + * Once we've found the start of the dirent within a page: fill 'er up... + */ +static +int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, + filldir_t filldir) +{ + struct file *file = desc->file; + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } + + for (i = desc->cache_entry_index; i < array->size; i++) { + struct nfs_cache_array_entry *ent; + + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { + desc->eof = 1; + break; + } + file->f_pos++; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; + } + if (array->eof_index >= 0) + desc->eof = 1; + + nfs_readdir_release_array(desc->page); +out: + cache_page_release(desc); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", + (unsigned long long)*desc->dir_cookie, res); + return res; +} + +/* + * If we cannot find a cookie in our cache, we suspect that this is + * because it points to a deleted file, so we ask the server to return + * whatever it thinks is the next entry. We then feed this to filldir. + * If all goes well, we should then be able to find our way round the + * cache on the next call to readdir_search_pagecache(); + * + * NOTE: we cannot add the anonymous page to the pagecache because + * the data it contains might not be page aligned. Besides, + * we should already have a complete representation of the + * directory in the page cache by the time we get here. + */ +static inline +int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, + filldir_t filldir) +{ + struct page *page = NULL; + int status; + struct inode *inode = desc->file->f_path.dentry->d_inode; + struct nfs_open_dir_context *ctx = desc->file->private_data; + + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + (unsigned long long)*desc->dir_cookie); + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + status = -ENOMEM; + goto out; + } + + desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; + desc->page = page; + ctx->duped = 0; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + + status = nfs_do_filldir(desc, dirent, filldir); + + out: + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", + __func__, status); + return status; + out_release: + cache_page_release(desc); + goto out; +} + +/* The file offset position represents the dirent entry number. A + last cookie cache takes care of the common case of reading the + whole directory. + */ +static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + nfs_readdir_descriptor_t my_desc, + *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = filp->private_data; + int res; + + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (long long)filp->f_pos); + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + + /* + * filp->f_pos points to the dirent entry number. + * *desc->dir_cookie has the cookie for the next entry. We have + * to either find the entry with the appropriate number or + * revalidate the cookie. + */ + memset(desc, 0, sizeof(*desc)); + + desc->file = filp; + desc->dir_cookie = &dir_ctx->dir_cookie; + desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + nfs_block_sillyrename(dentry); + res = nfs_revalidate_mapping(inode, filp->f_mapping); + if (res < 0) + goto out; + + do { + res = readdir_search_pagecache(desc); + + if (res == -EBADCOOKIE) { + res = 0; + /* This means either end of directory */ + if (*desc->dir_cookie && desc->eof == 0) { + /* Or that the server has 'lost' a cookie */ + res = uncached_readdir(desc, dirent, filldir); + if (res == 0) + continue; + } + break; + } + if (res == -ETOOSMALL && desc->plus) { + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + nfs_zap_caches(inode); + desc->page_index = 0; + desc->plus = 0; + desc->eof = 0; + continue; + } + if (res < 0) + break; + + res = nfs_do_filldir(desc, dirent, filldir); + if (res < 0) + break; + } while (!desc->eof); +out: + nfs_unblock_sillyrename(dentry); + if (res > 0) + res = 0; + dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + res); + return res; +} + +static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct nfs_open_dir_context *dir_ctx = filp->private_data; + + dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + offset, origin); + + mutex_lock(&inode->i_mutex); + switch (origin) { + case 1: + offset += filp->f_pos; + case 0: + if (offset >= 0) + break; + default: + offset = -EINVAL; + goto out; + } + if (offset != filp->f_pos) { + filp->f_pos = offset; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +/* + * All directory operations under NFS are synchronous, so fsync() + * is a dummy operation. + */ +static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + + mutex_lock(&inode->i_mutex); + nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); + mutex_unlock(&inode->i_mutex); + return 0; +} + +/** + * nfs_force_lookup_revalidate - Mark the directory as having changed + * @dir - pointer to directory inode + * + * This forces the revalidation code in nfs_lookup_revalidate() to do a + * full lookup on all child dentries of 'dir' whenever a change occurs + * on the server that might have invalidated our dcache. + * + * The caller should be holding dir->i_lock + */ +void nfs_force_lookup_revalidate(struct inode *dir) +{ + NFS_I(dir)->cache_change_attribute++; +} + +/* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. + */ +static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +{ + if (IS_ROOT(dentry)) + return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + /* Revalidate nfsi->cache_change_attribute before we declare a match */ + if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + return 0; + if (!nfs_verify_change_attribute(dir, dentry->d_time)) + return 0; + return 1; +} + +/* + * Return the intent data that applies to this particular path component + * + * Note that the current set of intents only apply to the very last + * component of the path and none of them is set before that last + * component. + */ +static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, + unsigned int mask) +{ + return nd->flags & mask; +} + +/* + * Use intent information to check whether or not we're going to do + * an O_EXCL create using this path component. + */ +static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) +{ + if (NFS_PROTO(dir)->version == 2) + return 0; + return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL); +} + +/* + * Inode and filehandle revalidation for lookups. + * + * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, + * or if the intent information indicates that we're about to open this + * particular file and the "nocto" mount flag is not set. + * + */ +static inline +int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) +{ + struct nfs_server *server = NFS_SERVER(inode); + + if (IS_AUTOMOUNT(inode)) + return 0; + if (nd != NULL) { + /* VFS wants an on-the-wire revalidation */ + if (nd->flags & LOOKUP_REVAL) + goto out_force; + /* This is an open(2) */ + if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && + !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || + S_ISDIR(inode->i_mode))) + goto out_force; + return 0; + } + return nfs_revalidate_inode(server, inode); +out_force: + return __nfs_revalidate_inode(server, inode); +} + +/* + * We judge how long we want to trust negative + * dentries by looking at the parent inode mtime. + * + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. + */ +static inline +int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + /* Don't revalidate a negative dentry if we're creating a new file */ + if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) + return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; + return !nfs_check_verifier(dir, dentry); +} + +/* + * This is called every time the dcache has a lookup hit, + * and we should check whether we can really trust that + * lookup. + * + * NOTE! The hit can be a negative hit too, don't assume + * we have an inode! + * + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. + */ +static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *dir; + struct inode *inode; + struct dentry *parent; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + int error; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + dir = parent->d_inode; + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); + inode = dentry->d_inode; + + if (!inode) { + if (nfs_neg_need_reval(dir, dentry, nd)) + goto out_bad; + goto out_valid; + } + + if (is_bad_inode(inode)) { + dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + goto out_bad; + } + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_set_verifier; + + /* Force a full look up iff the parent directory has changed */ + if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, nd)) + goto out_zap_parent; + goto out_valid; + } + + if (NFS_STALE(inode)) + goto out_bad; + + error = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out_error; + + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_bad; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out_bad; + if ((error = nfs_refresh_inode(inode, fattr)) != 0) + goto out_bad; + + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); +out_set_verifier: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + out_valid: + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 1; +out_zap_parent: + nfs_zap_caches(dir); + out_bad: + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ + nfs_zap_caches(inode); + /* If we have submounts, don't unhash ! */ + if (have_submounts(dentry)) + goto out_valid; + if (dentry->d_flags & DCACHE_DISCONNECTED) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name); + return 0; +out_error: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name, error); + return error; +} + +/* + * This is called from dput() when d_count is going to 0. + */ +static int nfs_dentry_delete(const struct dentry *dentry) +{ + dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + dentry->d_flags); + + /* Unhash any dentry with a stale inode */ + if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) + return 1; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + /* Unhash it, so that ->d_iput() would be called */ + return 1; + } + if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + /* Unhash it, so that ancestors of killed async unlink + * files will be cleaned up during umount */ + return 1; + } + return 0; + +} + +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Called when the dentry loses inode. + * We use it to clean up silly-renamed files. + */ +static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + /* drop any readdir cache as it could easily be old */ + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + drop_nlink(inode); + nfs_complete_unlink(dentry, inode); + } + iput(inode); +} + +static void nfs_d_release(struct dentry *dentry) +{ + /* free cached devname value, if it survived that far */ + if (unlikely(dentry->d_fsdata)) { + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + WARN_ON(1); + else + kfree(dentry->d_fsdata); + } +} + +const struct dentry_operations nfs_dentry_operations = { + .d_revalidate = nfs_lookup_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; + +static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *res; + struct dentry *parent; + struct inode *inode = NULL; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + int error; + + dfprintk(VFS, "NFS: lookup(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); + + res = ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + goto out; + + /* + * If we're doing an exclusive create, optimize away the lookup + * but don't hash the dentry. + */ + if (nfs_is_exclusive_create(dir, nd)) { + d_instantiate(dentry, NULL); + res = NULL; + goto out; + } + + res = ERR_PTR(-ENOMEM); + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out; + + parent = dentry->d_parent; + /* Protect against concurrent sillydeletes */ + nfs_block_sillyrename(parent); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unblock_sillyrename; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + res = ERR_CAST(inode); + if (IS_ERR(res)) + goto out_unblock_sillyrename; + +no_entry: + res = d_materialise_unique(dentry, inode); + if (res != NULL) { + if (IS_ERR(res)) + goto out_unblock_sillyrename; + dentry = res; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +out_unblock_sillyrename: + nfs_unblock_sillyrename(parent); +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + return res; +} + +#ifdef CONFIG_NFS_V4 +static int nfs_open_revalidate(struct dentry *, struct nameidata *); + +const struct dentry_operations nfs4_dentry_operations = { + .d_revalidate = nfs_open_revalidate, + .d_delete = nfs_dentry_delete, + .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, +}; + +/* + * Use intent information to determine whether we need to substitute + * the NFSv4-style stateful OPEN for the LOOKUP call + */ +static int is_atomic_open(struct nameidata *nd) +{ + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) + return 0; + /* NFS does not (yet) have a stateful open for directories */ + if (nd->flags & LOOKUP_DIRECTORY) + return 0; + /* Are we trying to write to a read only partition? */ + if (__mnt_is_readonly(nd->path.mnt) && + (nd->intent.open.flags & (O_CREAT|O_TRUNC|O_ACCMODE))) + return 0; + return 1; +} + +static fmode_t flags_to_mode(int flags) +{ + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} + +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) +{ + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); +} + +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +{ + struct file *filp; + int ret = 0; + + /* If the open_intent is for execute, we have an extra check to make */ + if (ctx->mode & FMODE_EXEC) { + ret = nfs_may_open(ctx->dentry->d_inode, + ctx->cred, + nd->intent.open.flags); + if (ret < 0) + goto out; + } + filp = lookup_instantiate_filp(nd, ctx->dentry, do_open); + if (IS_ERR(filp)) + ret = PTR_ERR(filp); + else + nfs_file_set_open_context(filp, ctx); +out: + put_nfs_open_context(ctx); + return ret; +} + +static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct nfs_open_context *ctx; + struct iattr attr; + struct dentry *res = NULL; + struct inode *inode; + int open_flags; + int err; + + dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + /* Check that we are indeed trying to open this file */ + if (!is_atomic_open(nd)) + goto no_open; + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { + res = ERR_PTR(-ENAMETOOLONG); + goto out; + } + + /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash + * the dentry. */ + if (nd->flags & LOOKUP_EXCL) { + d_instantiate(dentry, NULL); + goto out; + } + + open_flags = nd->intent.open.flags; + + ctx = create_nfs_open_context(dentry, open_flags); + res = ERR_CAST(ctx); + if (IS_ERR(ctx)) + goto out; + + if (nd->flags & LOOKUP_CREATE) { + attr.ia_mode = nd->intent.open.create_mode; + attr.ia_valid = ATTR_MODE; + attr.ia_mode &= ~current_umask(); + } else { + open_flags &= ~(O_EXCL | O_CREAT); + attr.ia_valid = 0; + } + + /* Open the file on the server */ + nfs_block_sillyrename(dentry->d_parent); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + if (IS_ERR(inode)) { + nfs_unblock_sillyrename(dentry->d_parent); + put_nfs_open_context(ctx); + switch (PTR_ERR(inode)) { + /* Make a negative dentry */ + case -ENOENT: + d_add(dentry, NULL); + res = NULL; + goto out; + /* This turned out not to be a regular file */ + case -EISDIR: + case -ENOTDIR: + goto no_open; + case -ELOOP: + if (!(nd->intent.open.flags & O_NOFOLLOW)) + goto no_open; + /* case -EINVAL: */ + default: + res = ERR_CAST(inode); + goto out; + } + } + res = d_add_unique(dentry, inode); + nfs_unblock_sillyrename(dentry->d_parent); + if (res != NULL) { + dput(ctx->dentry); + ctx->dentry = dget(res); + dentry = res; + } + err = nfs_intent_set_file(nd, ctx); + if (err < 0) { + if (res != NULL) + dput(res); + return ERR_PTR(err); + } +out: + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return res; +no_open: + return nfs_lookup(dir, dentry, nd); +} + +static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = NULL; + struct inode *inode; + struct inode *dir; + struct nfs_open_context *ctx; + int openflags, ret = 0; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!is_atomic_open(nd) || d_mountpoint(dentry)) + goto no_open; + + parent = dget_parent(dentry); + dir = parent->d_inode; + + /* We can't create new files in nfs_open_revalidate(), so we + * optimize away revalidation of negative dentries. + */ + if (inode == NULL) { + if (!nfs_neg_need_reval(dir, dentry, nd)) + ret = 1; + goto out; + } + + /* NFS only supports OPEN on regular files */ + if (!S_ISREG(inode->i_mode)) + goto no_open_dput; + openflags = nd->intent.open.flags; + /* We cannot do exclusive creation on a positive dentry */ + if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + goto no_open_dput; + /* We can't create new files, or truncate existing ones here */ + openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); + + ctx = create_nfs_open_context(dentry, openflags); + ret = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + /* + * Note: we're not holding inode->i_mutex and so may be racing with + * operations that change the directory. We therefore save the + * change attribute *before* we do the RPC call. + */ + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + switch (ret) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + ret = nfs_intent_set_file(nd, ctx); + if (ret >= 0) + ret = 1; +out: + dput(parent); + return ret; +out_drop: + d_drop(dentry); + ret = 0; +out_put_ctx: + put_nfs_open_context(ctx); + goto out; + +no_open_dput: + dput(parent); +no_open: + return nfs_lookup_revalidate(dentry, nd); +} + +static int nfs_open_create(struct inode *dir, struct dentry *dentry, + umode_t mode, struct nameidata *nd) +{ + struct nfs_open_context *ctx = NULL; + struct iattr attr; + int error; + int open_flags = O_CREAT|O_EXCL; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if (nd) + open_flags = nd->intent.open.flags; + + ctx = create_nfs_open_context(dentry, open_flags); + error = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out_err_drop; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); + if (error != 0) + goto out_put_ctx; + if (nd) { + error = nfs_intent_set_file(nd, ctx); + if (error < 0) + goto out_err; + } else { + put_nfs_open_context(ctx); + } + return 0; +out_put_ctx: + put_nfs_open_context(ctx); +out_err_drop: + d_drop(dentry); +out_err: + return error; +} + +#endif /* CONFIG_NFSV4 */ + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct inode *inode; + int error = -EACCES; + + d_drop(dentry); + + /* We may have been initialized further down */ + if (dentry->d_inode) + goto out; + if (fhandle->size == 0) { + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + if (error) + goto out_error; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (!(fattr->valid & NFS_ATTR_FATTR)) { + struct nfs_server *server = NFS_SB(dentry->d_sb); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); + if (error < 0) + goto out_error; + } + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + error = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_error; + d_add(dentry, inode); +out: + dput(parent); + return 0; +out_error: + nfs_mark_for_revalidate(dir); + dput(parent); + return error; +} + +/* + * Following a failed create operation, we drop the dentry rather + * than retain a negative dentry. This avoids a problem in the event + * that the operation succeeded on the server, but an error in the + * reply path made it appear to have failed. + */ +static int nfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, struct nameidata *nd) +{ + struct iattr attr; + int error; + int open_flags = O_CREAT|O_EXCL; + + dfprintk(VFS, "NFS: create(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + if (nd) + open_flags = nd->intent.open.flags; + + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +static int +nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct iattr attr; + int status; + + dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + attr.ia_mode = mode; + attr.ia_valid = ATTR_MODE; + + status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); + if (status != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return status; +} + +/* + * See comments for nfs_proc_create regarding failed operations. + */ +static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct iattr attr; + int error; + + dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + attr.ia_valid = ATTR_MODE; + attr.ia_mode = mode | S_IFDIR; + + error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + if (error != 0) + goto out_err; + return 0; +out_err: + d_drop(dentry); + return error; +} + +static void nfs_dentry_handle_enoent(struct dentry *dentry) +{ + if (dentry->d_inode != NULL && !d_unhashed(dentry)) + d_delete(dentry); +} + +static int nfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", + dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + /* Ensure the VFS deletes this inode */ + if (error == 0 && dentry->d_inode != NULL) + clear_nlink(dentry->d_inode); + else if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); + + return error; +} + +/* + * Remove a file after making sure there are no pending writes, + * and after checking that the file has only one user. + * + * We invalidate the attribute cache and free the inode prior to the operation + * to avoid possible races if the server reuses the inode. + */ +static int nfs_safe_remove(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = dentry->d_inode; + int error = -EBUSY; + + dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* If the dentry was sillyrenamed, we simply call d_delete() */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + error = 0; + goto out; + } + + if (inode != NULL) { + nfs_inode_return_delegation(inode); + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + /* The VFS may want to delete this inode */ + if (error == 0) + nfs_drop_nlink(inode); + nfs_mark_for_revalidate(inode); + } else + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + if (error == -ENOENT) + nfs_dentry_handle_enoent(dentry); +out: + return error; +} + +/* We do silly rename. In case sillyrename() returns -EBUSY, the inode + * belongs to an active ".nfs..." file and we return -EBUSY. + * + * If sillyrename() returns 0, we do nothing, otherwise we unlink. + */ +static int nfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + int need_rehash = 0; + + dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name); + + spin_lock(&dentry->d_lock); + if (dentry->d_count > 1) { + spin_unlock(&dentry->d_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(dentry->d_inode, 0); + error = nfs_sillyrename(dir, dentry); + return error; + } + if (!d_unhashed(dentry)) { + __d_drop(dentry); + need_rehash = 1; + } + spin_unlock(&dentry->d_lock); + error = nfs_safe_remove(dentry); + if (!error || error == -ENOENT) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + } else if (need_rehash) + d_rehash(dentry); + return error; +} + +/* + * To create a symbolic link, most file systems instantiate a new inode, + * add a page to it containing the path, then write it out to the disk + * using prepare_write/commit_write. + * + * Unfortunately the NFS client can't create the in-core inode first + * because it needs a file handle to create an in-core inode (see + * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the + * symlink request has completed on the server. + * + * So instead we allocate a raw page, copy the symname into it, then do + * the SYMLINK request with the page as the buffer. If it succeeds, we + * now have a new file handle and can instantiate an in-core NFS inode + * and move the raw page into its mapping. + */ +static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct pagevec lru_pvec; + struct page *page; + char *kaddr; + struct iattr attr; + unsigned int pathlen = strlen(symname); + int error; + + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name, symname); + + if (pathlen > PAGE_SIZE) + return -ENAMETOOLONG; + + attr.ia_mode = S_IFLNK | S_IRWXUGO; + attr.ia_valid = ATTR_MODE; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, pathlen); + if (pathlen < PAGE_SIZE) + memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); + kunmap_atomic(kaddr, KM_USER0); + + error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + if (error != 0) { + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", + dir->i_sb->s_id, dir->i_ino, + dentry->d_name.name, symname, error); + d_drop(dentry); + __free_page(page); + return error; + } + + /* + * No big deal if we can't add this page to the page cache here. + * READLINK will get the missing page from the server if needed. + */ + pagevec_init(&lru_pvec, 0); + if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + GFP_KERNEL)) { + pagevec_add(&lru_pvec, page); + pagevec_lru_add_file(&lru_pvec); + SetPageUptodate(page); + unlock_page(page); + } else + __free_page(page); + + return 0; +} + +static int +nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + int error; + + dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + dentry->d_parent->d_name.name, dentry->d_name.name); + + nfs_inode_return_delegation(inode); + + d_drop(dentry); + error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + if (error == 0) { + ihold(inode); + d_add(dentry, inode); + } + return error; +} + +/* + * RENAME + * FIXME: Some nfsds, like the Linux user space nfsd, may generate a + * different file handle for the same inode after a rename (e.g. when + * moving to a different directory). A fail-safe method to do so would + * be to look up old_dir/old_name, create a link to new_dir/new_name and + * rename the old file using the sillyrename stuff. This way, the original + * file in old_dir will go away when the last process iput()s the inode. + * + * FIXED. + * + * It actually works quite well. One needs to have the possibility for + * at least one ".nfs..." file in each directory the file ever gets + * moved or linked to which happens automagically with the new + * implementation that only depends on the dcache stuff instead of + * using the inode layer + * + * Unfortunately, things are a little more complicated than indicated + * above. For a cross-directory move, we want to make sure we can get + * rid of the old inode after the operation. This means there must be + * no pending writes (if it's a file), and the use count must be 1. + * If these conditions are met, we can drop the dentries before doing + * the rename. + */ +static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct dentry *dentry = NULL, *rehash = NULL; + int error = -EBUSY; + + dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", + old_dentry->d_parent->d_name.name, old_dentry->d_name.name, + new_dentry->d_parent->d_name.name, new_dentry->d_name.name, + new_dentry->d_count); + + /* + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. + */ + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } + + if (new_dentry->d_count > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; + + /* silly-rename the existing target ... */ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; + + new_dentry = dentry; + rehash = NULL; + new_inode = NULL; + } + } + + nfs_inode_return_delegation(old_inode); + if (new_inode != NULL) + nfs_inode_return_delegation(new_inode); + + error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, + new_dir, &new_dentry->d_name); + nfs_mark_for_revalidate(old_inode); +out: + if (rehash) + d_rehash(rehash); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + + /* new dentry created? */ + if (dentry) + dput(dentry); + return error; +} + +static DEFINE_SPINLOCK(nfs_access_lru_lock); +static LIST_HEAD(nfs_access_lru_list); +static atomic_long_t nfs_access_nr_entries; + +static void nfs_access_free_entry(struct nfs_access_entry *entry) +{ + put_rpccred(entry->cred); + kfree(entry); + smp_mb__before_atomic_dec(); + atomic_long_dec(&nfs_access_nr_entries); + smp_mb__after_atomic_dec(); +} + +static void nfs_access_free_list(struct list_head *head) +{ + struct nfs_access_entry *cache; + + while (!list_empty(head)) { + cache = list_entry(head->next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } +} + +int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(head); + struct nfs_inode *nfsi, *next; + struct nfs_access_entry *cache; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return (nr_to_scan == 0) ? 0 : -1; + + spin_lock(&nfs_access_lru_lock); + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { + struct inode *inode; + + if (nr_to_scan-- == 0) + break; + inode = &nfsi->vfs_inode; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; + cache = list_entry(nfsi->access_cache_entry_lru.next, + struct nfs_access_entry, lru); + list_move(&cache->lru, &head); + rb_erase(&cache->rb_node, &nfsi->access_cache); + if (!list_empty(&nfsi->access_cache_entry_lru)) + list_move_tail(&nfsi->access_cache_inode_lru, + &nfs_access_lru_list); + else { +remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + smp_mb__after_clear_bit(); + } + spin_unlock(&inode->i_lock); + } + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; +} + +static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) +{ + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node *n; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); + list_move(&entry->lru, head); + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; +} + +void nfs_access_zap_cache(struct inode *inode) +{ + LIST_HEAD(head); + + if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) + return; + /* Remove from global LRU init */ + spin_lock(&nfs_access_lru_lock); + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_del_init(&NFS_I(inode)->access_cache_inode_lru); + + spin_lock(&inode->i_lock); + __nfs_access_zap_cache(NFS_I(inode), &head); + spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); +} + +static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +{ + struct rb_node *n = NFS_I(inode)->access_cache.rb_node; + struct nfs_access_entry *entry; + + while (n != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + + if (cred < entry->cred) + n = n->rb_left; + else if (cred > entry->cred) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache; + int err = -ENOENT; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + if (cache == NULL) + goto out; + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + goto out_stale; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); + err = 0; +out: + spin_unlock(&inode->i_lock); + return err; +out_stale: + rb_erase(&cache->rb_node, &nfsi->access_cache); + list_del(&cache->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(cache); + return -ENOENT; +out_zap: + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + return -ENOENT; +} + +static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node **p = &root_node->rb_node; + struct rb_node *parent = NULL; + struct nfs_access_entry *entry; + + spin_lock(&inode->i_lock); + while (*p != NULL) { + parent = *p; + entry = rb_entry(parent, struct nfs_access_entry, rb_node); + + if (set->cred < entry->cred) + p = &parent->rb_left; + else if (set->cred > entry->cred) + p = &parent->rb_right; + else + goto found; + } + rb_link_node(&set->rb_node, parent, p); + rb_insert_color(&set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + spin_unlock(&inode->i_lock); + return; +found: + rb_replace_node(parent, &set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + list_del(&entry->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(entry); +} + +static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); + if (cache == NULL) + return; + RB_CLEAR_NODE(&cache->rb_node); + cache->jiffies = set->jiffies; + cache->cred = get_rpccred(set->cred); + cache->mask = set->mask; + + nfs_access_add_rbtree(inode, cache); + + /* Update accounting */ + smp_mb__before_atomic_inc(); + atomic_long_inc(&nfs_access_nr_entries); + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ + if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + spin_lock(&nfs_access_lru_lock); + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, + &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } +} + +static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) +{ + struct nfs_access_entry cache; + int status; + + status = nfs_access_get_cached(inode, cred, &cache); + if (status == 0) + goto out; + + /* Be clever: ask server to check for all possible rights */ + cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.cred = cred; + cache.jiffies = jiffies; + status = NFS_PROTO(inode)->access(inode, &cache); + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + return status; + } + nfs_access_add_cache(inode, &cache); +out: + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int nfs_open_permission_mask(int openflags) +{ + int mask = 0; + + if ((openflags & O_ACCMODE) != O_WRONLY) + mask |= MAY_READ; + if ((openflags & O_ACCMODE) != O_RDONLY) + mask |= MAY_WRITE; + if (openflags & __FMODE_EXEC) + mask |= MAY_EXEC; + return mask; +} + +int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) +{ + return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); +} + +int nfs_permission(struct inode *inode, int mask) +{ + struct rpc_cred *cred; + int res = 0; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + nfs_inc_stats(inode, NFSIOS_VFSACCESS); + + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + goto out; + /* Is this sys_access() ? */ + if (mask & (MAY_ACCESS | MAY_CHDIR)) + goto force_lookup; + + switch (inode->i_mode & S_IFMT) { + case S_IFLNK: + goto out; + case S_IFREG: + /* NFSv4 has atomic_open... */ + if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) + && (mask & MAY_OPEN) + && !(mask & MAY_EXEC)) + goto out; + break; + case S_IFDIR: + /* + * Optimize away all write operations, since the server + * will check permissions when we perform the op. + */ + if ((mask & MAY_WRITE) && !(mask & MAY_READ)) + goto out; + } + +force_lookup: + if (!NFS_PROTO(inode)->access) + goto out_notsup; + + cred = rpc_lookup_cred(); + if (!IS_ERR(cred)) { + res = nfs_do_access(inode, cred, mask); + put_rpccred(cred); + } else + res = PTR_ERR(cred); +out: + if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) + res = -EACCES; + + dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + inode->i_sb->s_id, inode->i_ino, mask, res); + return res; +out_notsup: + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (res == 0) + res = generic_permission(inode, mask); + goto out; +} + +/* + * Local variables: + * version-control: t + * kept-new-versions: 5 + * End: + */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c new file mode 100644 index 00000000000..1940f1a56a5 --- /dev/null +++ b/fs/nfs/direct.c @@ -0,0 +1,1039 @@ +/* + * linux/fs/nfs/direct.c + * + * Copyright (C) 2003 by Chuck Lever <cel@netapp.com> + * + * High-performance uncached I/O for the Linux NFS client + * + * There are important applications whose performance or correctness + * depends on uncached access to file data. Database clusters + * (multiple copies of the same instance running on separate hosts) + * implement their own cache coherency protocol that subsumes file + * system cache protocols. Applications that process datasets + * considerably larger than the client's memory do not always benefit + * from a local cache. A streaming video server, for instance, has no + * need to cache the contents of a file. + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with + * help from Andrew Morton. + * + * 18 Dec 2001 Initial implementation for 2.4 --cel + * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy + * 08 Jun 2003 Port to 2.5 APIs --cel + * 31 Mar 2004 Handle direct I/O without VFS support --cel + * 15 Sep 2004 Parallel async reads --cel + * 04 May 2005 support O_DIRECT with aio --cel + * + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/file.h> +#include <linux/pagemap.h> +#include <linux/kref.h> +#include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/sunrpc/clnt.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/atomic.h> + +#include "internal.h" +#include "iostat.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static struct kmem_cache *nfs_direct_cachep; + +/* + * This represents a set of asynchronous requests that we're waiting on + */ +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + ssize_t count, /* bytes actually processed */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct list_head rewrite_list; /* saved nfs_write_data structs */ + struct nfs_write_data * commit_data; /* special write_data for commits */ + int flags; +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + struct nfs_writeverf verf; /* unstable write verifier */ +}; + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static const struct rpc_call_ops nfs_write_direct_ops; + +static inline void get_dreq(struct nfs_direct_req *dreq) +{ + atomic_inc(&dreq->io_count); +} + +static inline int put_dreq(struct nfs_direct_req *dreq) +{ + return atomic_dec_and_test(&dreq->io_count); +} + +/** + * nfs_direct_IO - NFS address space operation for direct I/O + * @rw: direction (read or write) + * @iocb: target I/O control block + * @iov: array of vectors that define I/O buffer + * @pos: offset in file to begin the operation + * @nr_segs: size of iovec array + * + * The presence of this routine in the address space ops vector means + * the NFS client supports direct I/O. However, we shunt off direct + * read and write requests before the VFS gets them, so this method + * should never be called. + */ +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +{ + dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long) pos, nr_segs); + + return -EINVAL; +} + +static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) +{ + unsigned int npages; + unsigned int i; + + if (count == 0) + return; + pages += (pgbase >> PAGE_SHIFT); + npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + struct page *page = pages[i]; + if (!PageCompound(page)) + set_page_dirty(page); + } +} + +static void nfs_direct_release_pages(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i = 0; i < npages; i++) + page_cache_release(pages[i]); +} + +static inline struct nfs_direct_req *nfs_direct_req_alloc(void) +{ + struct nfs_direct_req *dreq; + + dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); + if (!dreq) + return NULL; + + kref_init(&dreq->kref); + kref_get(&dreq->kref); + init_completion(&dreq->completion); + INIT_LIST_HEAD(&dreq->rewrite_list); + dreq->iocb = NULL; + dreq->ctx = NULL; + dreq->l_ctx = NULL; + spin_lock_init(&dreq->lock); + atomic_set(&dreq->io_count, 0); + dreq->count = 0; + dreq->error = 0; + dreq->flags = 0; + + return dreq; +} + +static void nfs_direct_req_free(struct kref *kref) +{ + struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); + if (dreq->ctx != NULL) + put_nfs_open_context(dreq->ctx); + kmem_cache_free(nfs_direct_cachep, dreq); +} + +static void nfs_direct_req_release(struct nfs_direct_req *dreq) +{ + kref_put(&dreq->kref, nfs_direct_req_free); +} + +/* + * Collects and returns the final error value/byte-count. + */ +static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) +{ + ssize_t result = -EIOCBQUEUED; + + /* Async requests don't wait here */ + if (dreq->iocb) + goto out; + + result = wait_for_completion_killable(&dreq->completion); + + if (!result) + result = dreq->error; + if (!result) + result = dreq->count; + +out: + return (ssize_t) result; +} + +/* + * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust + * the iocb is still valid here if this is a synchronous request. + */ +static void nfs_direct_complete(struct nfs_direct_req *dreq) +{ + if (dreq->iocb) { + long res = (long) dreq->error; + if (!res) + res = (long) dreq->count; + aio_complete(dreq->iocb, res, 0); + } + complete_all(&dreq->completion); + + nfs_direct_req_release(dreq); +} + +/* + * We must hold a reference to all the pages in this direct read request + * until the RPCs complete. This could be long *after* we are woken up in + * nfs_direct_wait (for instance, if someone hits ^C on a slow server). + */ +static void nfs_direct_read_result(struct rpc_task *task, void *calldata) +{ + struct nfs_read_data *data = calldata; + + nfs_readpage_result(task, data); +} + +static void nfs_direct_read_release(void *calldata) +{ + + struct nfs_read_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + spin_lock(&dreq->lock); + if (unlikely(status < 0)) { + dreq->error = status; + spin_unlock(&dreq->lock); + } else { + dreq->count += data->res.count; + spin_unlock(&dreq->lock); + nfs_direct_dirty_pages(data->pagevec, + data->args.pgbase, + data->res.count); + } + nfs_direct_release_pages(data->pagevec, data->npages); + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + nfs_readdata_free(data); +} + +static const struct rpc_call_ops nfs_read_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_read_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_read_result, + .rpc_release = nfs_direct_read_release, +}; + +/* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. If nfs_readdata_alloc() or get_user_pages() fails, + * bail and stop sending more reads. Read length accounting is + * handled automatically by nfs_direct_read_result(). Otherwise, if + * no requests have been sent, just return an error. + */ +static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_read_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + unsigned int pgbase; + int result; + ssize_t started = 0; + + do { + struct nfs_read_data *data; + size_t bytes; + + pgbase = user_addr & ~PAGE_MASK; + bytes = min(rsize,count); + + result = -ENOMEM; + data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); + if (unlikely(!data)) + break; + + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 1, 0, data->pagevec, NULL); + up_read(¤t->mm->mmap_sem); + if (result < 0) { + nfs_readdata_free(data); + break; + } + if ((unsigned)result < data->npages) { + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_readdata_free(data); + break; + } + bytes -= pgbase; + data->npages = result; + } + + get_dreq(dreq); + + data->req = (struct nfs_page *) dreq; + data->inode = inode; + data->cred = msg.rpc_cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = data->pagevec; + data->args.count = bytes; + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; + nfs_fattr_init(&data->fattr); + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + NFS_PROTO(inode)->read_setup(data, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + break; + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct read call " + "(req %s/%Ld, %zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + started += bytes; + user_addr += bytes; + pos += bytes; + /* FIXME: Remove this unnecessary math from final patch */ + pgbase += bytes; + pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + + count -= bytes; + } while (count != 0); + + if (started) + return started; + return result < 0 ? (ssize_t) result : -EFAULT; +} + +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_read_schedule_segment(dreq, vec, pos); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_complete(dreq); + return 0; +} + +static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t result = -ENOMEM; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct nfs_direct_req *dreq; + + dreq = nfs_direct_req_alloc(); + if (dreq == NULL) + goto out; + + dreq->inode = inode; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); + if (!result) + result = nfs_direct_wait(dreq); +out_release: + nfs_direct_req_release(dreq); +out: + return result; +} + +static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) +{ + while (!list_empty(&dreq->rewrite_list)) { + struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_direct_release_pages(data->pagevec, data->npages); + nfs_writedata_free(data); + } +} + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) +{ + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + dreq->count = 0; + get_dreq(dreq); + + list_for_each(p, &dreq->rewrite_list) { + data = list_entry(p, struct nfs_write_data, pages); + + get_dreq(dreq); + + /* Use stable writes */ + data->args.stable = NFS_FILE_SYNC; + + /* + * Reset data->res. + */ + nfs_fattr_init(&data->fattr); + data->res.count = data->args.count; + memset(&data->verf, 0, sizeof(data->verf)); + + /* + * Reuse data->task; data->args should not have changed + * since the original request was sent. + */ + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, &msg); + + /* + * We're called via an RPC callback, so BKL is already held. + */ + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task(task); + + dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, inode); +} + +static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + /* Call the NFS version-specific code */ + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_direct_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + if (status < 0) { + dprintk("NFS: %5u commit failed with error %d.\n", + data->task.tk_pid, status); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); + nfs_direct_write_complete(dreq, data->inode); + nfs_commit_free(data); +} + +static const struct rpc_call_ops nfs_commit_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_commit_result, + .rpc_release = nfs_direct_commit_release, +}; + +static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) +{ + struct nfs_write_data *data = dreq->commit_data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = dreq->ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(dreq->inode), + .rpc_message = &msg, + .callback_ops = &nfs_commit_direct_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + + data->inode = dreq->inode; + data->cred = msg.rpc_cred; + + data->args.fh = NFS_FH(data->inode); + data->args.offset = 0; + data->args.count = 0; + data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; + data->res.count = 0; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ + dreq->commit_data = NULL; + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (!IS_ERR(task)) + rpc_put_task(task); +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + int flags = dreq->flags; + + dreq->flags = 0; + switch (flags) { + case NFS_ODIRECT_DO_COMMIT: + nfs_direct_commit_schedule(dreq); + break; + case NFS_ODIRECT_RESCHED_WRITES: + nfs_direct_write_reschedule(dreq); + break; + default: + if (dreq->commit_data != NULL) + nfs_commit_free(dreq->commit_data); + nfs_direct_free_writedata(dreq); + nfs_zap_mapping(inode, inode->i_mapping); + nfs_direct_complete(dreq); + } +} + +static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = nfs_commitdata_alloc(); + if (dreq->commit_data != NULL) + dreq->commit_data->req = (struct nfs_page *) dreq; +} +#else +static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +{ + dreq->commit_data = NULL; +} + +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +{ + nfs_direct_free_writedata(dreq); + nfs_zap_mapping(inode, inode->i_mapping); + nfs_direct_complete(dreq); +} +#endif + +static void nfs_direct_write_result(struct rpc_task *task, void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_writeback_done(task, data); +} + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +static void nfs_direct_write_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + spin_lock(&dreq->lock); + + if (unlikely(status < 0)) { + /* An error has occurred, so we should not commit */ + dreq->flags = 0; + dreq->error = status; + } + if (unlikely(dreq->error != 0)) + goto out_unlock; + + dreq->count += data->res.count; + + if (data->res.verf->committed != NFS_FILE_SYNC) { + switch (dreq->flags) { + case 0: + memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); + dreq->flags = NFS_ODIRECT_DO_COMMIT; + break; + case NFS_ODIRECT_DO_COMMIT: + if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { + dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + } + } + } +out_unlock: + spin_unlock(&dreq->lock); + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, data->inode); +} + +static const struct rpc_call_ops nfs_write_direct_ops = { +#if defined(CONFIG_NFS_V4_1) + .rpc_call_prepare = nfs_write_prepare, +#endif /* CONFIG_NFS_V4_1 */ + .rpc_call_done = nfs_direct_write_result, + .rpc_release = nfs_direct_write_release, +}; + +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. + */ +static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, + const struct iovec *iov, + loff_t pos, int sync) +{ + struct nfs_open_context *ctx = dreq->ctx; + struct inode *inode = ctx->dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, + .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + }; + size_t wsize = NFS_SERVER(inode)->wsize; + unsigned int pgbase; + int result; + ssize_t started = 0; + + do { + struct nfs_write_data *data; + size_t bytes; + + pgbase = user_addr & ~PAGE_MASK; + bytes = min(wsize,count); + + result = -ENOMEM; + data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); + if (unlikely(!data)) + break; + + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + data->npages, 0, 0, data->pagevec, NULL); + up_read(¤t->mm->mmap_sem); + if (result < 0) { + nfs_writedata_free(data); + break; + } + if ((unsigned)result < data->npages) { + bytes = result * PAGE_SIZE; + if (bytes <= pgbase) { + nfs_direct_release_pages(data->pagevec, result); + nfs_writedata_free(data); + break; + } + bytes -= pgbase; + data->npages = result; + } + + get_dreq(dreq); + + list_move_tail(&data->pages, &dreq->rewrite_list); + + data->req = (struct nfs_page *) dreq; + data->inode = inode; + data->cred = msg.rpc_cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; + data->args.offset = pos; + data->args.pgbase = pgbase; + data->args.pages = data->pagevec; + data->args.count = bytes; + data->args.stable = sync; + data->res.fattr = &data->fattr; + data->res.count = bytes; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + + task_setup_data.task = &data->task; + task_setup_data.callback_data = data; + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + break; + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct write call " + "(req %s/%Ld, %zu bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + started += bytes; + user_addr += bytes; + pos += bytes; + + /* FIXME: Remove this useless math from the final patch */ + pgbase += bytes; + pgbase &= ~PAGE_MASK; + BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + + count -= bytes; + } while (count != 0); + + if (started) + return started; + return result < 0 ? (ssize_t) result : -EFAULT; +} + +static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos, int sync) +{ + ssize_t result = 0; + size_t requested_bytes = 0; + unsigned long seg; + + get_dreq(dreq); + + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *vec = &iov[seg]; + result = nfs_direct_write_schedule_segment(dreq, vec, + pos, sync); + if (result < 0) + break; + requested_bytes += result; + if ((size_t)result < vec->iov_len) + break; + pos += vec->iov_len; + } + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); + return 0; +} + +static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + size_t count) +{ + ssize_t result = -ENOMEM; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct nfs_direct_req *dreq; + size_t wsize = NFS_SERVER(inode)->wsize; + int sync = NFS_UNSTABLE; + + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out; + nfs_alloc_commit_data(dreq); + + if (dreq->commit_data == NULL || count <= wsize) + sync = NFS_FILE_SYNC; + + dreq->inode = inode; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); + if (!result) + result = nfs_direct_wait(dreq); +out_release: + nfs_direct_req_release(dreq); +out: + return result; +} + +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers into which to read data + * @nr_segs: size of iov vector + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t retval = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + count, (long long) pos); + + retval = 0; + if (!count) + goto out; + + retval = nfs_sync_mapping(mapping); + if (retval) + goto out; + + task_io_account_read(count); + + retval = nfs_direct_read(iocb, iov, nr_segs, pos); + if (retval > 0) + iocb->ki_pos = pos + retval; + +out: + return retval; +} + +/** + * nfs_file_direct_write - file direct write operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers from which to write data + * @nr_segs: size of iov vector + * @pos: byte offset in file where writing starts + * + * We use this function for direct writes instead of calling + * generic_file_aio_write() in order to avoid taking the inode + * semaphore and updating the i_size. The NFS server will set + * the new i_size and this client must read the updated size + * back into its cache. We let the server do generic write + * parameter checking and report problems. + * + * We eliminate local atime updates, see direct read above. + * + * We avoid unnecessary page cache invalidations for normal cached + * readers of this file. + * + * Note that O_APPEND is not supported for NFS direct writes, as there + * is no atomic O_APPEND write facility in the NFS protocol. + */ +ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t retval = -EINVAL; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); + + dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + count, (long long) pos); + + retval = generic_write_checks(file, &pos, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + retval = nfs_sync_mapping(mapping); + if (retval) + goto out; + + task_io_account_write(count); + + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + + if (retval > 0) + iocb->ki_pos = pos + retval; + +out: + return retval; +} + +/** + * nfs_init_directcache - create a slab cache for nfs_direct_req structures + * + */ +int __init nfs_init_directcache(void) +{ + nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", + sizeof(struct nfs_direct_req), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + if (nfs_direct_cachep == NULL) + return -ENOMEM; + + return 0; +} + +/** + * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures + * + */ +void nfs_destroy_directcache(void) +{ + kmem_cache_destroy(nfs_direct_cachep); +} diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 00000000000..a6e711ad130 --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,372 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * Resolves DNS hostnames into valid ip addresses + */ + +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include <linux/sunrpc/clnt.h> +#include <linux/dns_resolver.h> + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + char *ip_addr = NULL; + int ip_len; + + ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); + if (ip_len > 0) + ret = rpc_pton(ip_addr, ip_len, sa, salen); + else + ret = -ESRCH; + kfree(ip_addr); + return ret; +} + +#else + +#include <linux/hash.h> +#include <linux/string.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/seq_file.h> +#include <linux/inet.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/svcauth.h> + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + nfs_dns_ent_update(cnew, ckey); + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = item->h.expiry_time - seconds_since_boot(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, "<none> "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + seconds_since_boot(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < seconds_since_boot() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + +#endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 00000000000..199bb5543a9 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,26 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + + +#ifdef CONFIG_NFS_USE_KERNEL_DNS +static inline int nfs_dns_resolver_init(void) +{ + return 0; +} + +static inline void nfs_dns_resolver_destroy(void) +{} +#else +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +#endif + +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c new file mode 100644 index 00000000000..c43a452f7da --- /dev/null +++ b/fs/nfs/file.c @@ -0,0 +1,899 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + * + * Changes Copyright (C) 1994 by Florian La Roche + * - Do not copy data too often around in the kernel. + * - In nfs_file_read the return value of kmalloc wasn't checked. + * - Put in a better version of read look-ahead buffering. Original idea + * and implementation by Wai S Kok elekokws@ee.nus.sg. + * + * Expire cache on write to a file by Wai S Kok (Oct 1994). + * + * Total rewrite of read side for new NFS buffer cache.. Linus. + * + * nfs regular file handling functions + */ + +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/aio.h> +#include <linux/gfp.h> +#include <linux/swap.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include "delegation.h" +#include "internal.h" +#include "iostat.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_FILE + +static const struct vm_operations_struct nfs_file_vm_ops; + +const struct inode_operations nfs_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +#ifdef CONFIG_NFS_V3 +const struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_v3 */ + +/* Hack for future NFS swap support */ +#ifndef IS_SWAPFILE +# define IS_SWAPFILE(inode) (0) +#endif + +static int nfs_check_flags(int flags) +{ + if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) + return -EINVAL; + + return 0; +} + +/* + * Open file + */ +static int +nfs_file_open(struct inode *inode, struct file *filp) +{ + int res; + + dprintk("NFS: open file(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); + res = nfs_check_flags(filp->f_flags); + if (res) + return res; + + res = nfs_open(inode, filp); + return res; +} + +static int +nfs_file_release(struct inode *inode, struct file *filp) +{ + dprintk("NFS: release(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); + return nfs_release(inode, filp); +} + +/** + * nfs_revalidate_size - Revalidate the file size + * @inode - pointer to inode struct + * @file - pointer to struct file + * + * Revalidates the file length. This is basically a wrapper around + * nfs_revalidate_inode() that takes into account the fact that we may + * have cached writes (in which case we don't care about the server's + * idea of what the file length is), or O_DIRECT (in which case we + * shouldn't trust the cache). + */ +static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfs_have_delegated_attributes(inode)) + goto out_noreval; + + if (filp->f_flags & O_DIRECT) + goto force_reval; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + goto force_reval; + if (nfs_attribute_timeout(inode)) + goto force_reval; +out_noreval: + return 0; +force_reval: + return __nfs_revalidate_inode(server, inode); +} + +static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +{ + dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + offset, origin); + + /* + * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * the cached file length + */ + if (origin != SEEK_SET && origin != SEEK_CUR) { + struct inode *inode = filp->f_mapping->host; + + int retval = nfs_revalidate_file_size(inode, filp); + if (retval < 0) + return (loff_t)retval; + } + + return generic_file_llseek(filp, offset, origin); +} + +/* + * Flush all dirty pages, and check for write errors. + */ +static int +nfs_file_flush(struct file *file, fl_owner_t id) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dprintk("NFS: flush(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} + +static ssize_t +nfs_file_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry * dentry = iocb->ki_filp->f_path.dentry; + struct inode * inode = dentry->d_inode; + ssize_t result; + + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_read(iocb, iov, nr_segs, pos); + + dprintk("NFS: read(%s/%s, %lu@%lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); + + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + if (!result) { + result = generic_file_aio_read(iocb, iov, nr_segs, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } + return result; +} + +static ssize_t +nfs_file_splice_read(struct file *filp, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + ssize_t res; + + dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long long) *ppos); + + res = nfs_revalidate_mapping(inode, filp->f_mapping); + if (!res) { + res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } + return res; +} + +static int +nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + int status; + + dprintk("NFS: mmap(%s/%s)\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); + if (!status) { + vma->vm_ops = &nfs_file_vm_ops; + status = nfs_revalidate_mapping(inode, file->f_mapping); + } + return status; +} + +/* + * Flush any dirty pages for this process, and check for write errors. + * The return status from this call provides a reliable indication of + * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occurred, and hence cause it to + * fall back to doing a synchronous write. + */ +static int +nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file->f_path.dentry; + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = dentry->d_inode; + int have_error, status; + int ret = 0; + + dprintk("NFS: fsync file(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + mutex_lock(&inode->i_mutex); + + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + if (status >= 0 && ret < 0) + status = ret; + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret && status < 0) + ret = status; + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* + * This does the "real" work of the write. We must allocate and lock the + * page to be sent back to the generic routine, which then copies the + * data from user space. + * + * If the writer ends up delaying the write, the writer needs to + * increment the page use counts until he is done with the page. + */ +static int nfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + +start: + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = nfs_flush_incompatible(file, page); + if (ret) { + unlock_page(page); + page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; + } + return ret; +} + +static int nfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + + status = nfs_updatepage(file, page, offset, copied); + + unlock_page(page); + page_cache_release(page); + + if (status < 0) + return status; + return copied; +} + +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ +static void nfs_invalidate_page(struct page *page, unsigned long offset) +{ + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + + if (offset != 0) + return; + /* Cancel any unstarted writes on this page */ + nfs_wb_page_cancel(page->mapping->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); +} + +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ +static int nfs_release_page(struct page *page, gfp_t gfp) +{ + struct address_space *mapping = page->mapping; + + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + + /* Only do I/O if gfp is a superset of GFP_KERNEL */ + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } + /* If PagePrivate() is set, then the page is not freeable */ + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); +} + +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ +static int nfs_launder_page(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + nfs_fscache_wait_on_page_write(nfsi, page); + return nfs_wb_page(inode, page); +} + +const struct address_space_operations nfs_file_aops = { + .readpage = nfs_readpage, + .readpages = nfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = nfs_writepage, + .writepages = nfs_writepages, + .write_begin = nfs_write_begin, + .write_end = nfs_write_end, + .invalidatepage = nfs_invalidate_page, + .releasepage = nfs_release_page, + .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, + .launder_page = nfs_launder_page, + .error_remove_page = generic_error_remove_page, +}; + +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct dentry *dentry = filp->f_path.dentry; + unsigned pagelen; + int ret = VM_FAULT_NOPAGE; + struct address_space *mapping; + + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) + goto out_unlock; + + pagelen = nfs_page_length(page); + if (pagelen == 0) + goto out_unlock; + + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; + + ret = VM_FAULT_SIGBUS; +out_unlock: + unlock_page(page); +out: + return ret; +} + +static const struct vm_operations_struct nfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nfs_vm_page_mkwrite, +}; + +static int nfs_need_sync_write(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx; + + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) + return 1; + ctx = nfs_file_open_context(filp); + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) + return 1; + return 0; +} + +static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct dentry * dentry = iocb->ki_filp->f_path.dentry; + struct inode * inode = dentry->d_inode; + unsigned long written = 0; + ssize_t result; + size_t count = iov_length(iov, nr_segs); + + if (iocb->ki_filp->f_flags & O_DIRECT) + return nfs_file_direct_write(iocb, iov, nr_segs, pos); + + dprintk("NFS: write(%s/%s, %lu@%Ld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (long long) pos); + + result = -EBUSY; + if (IS_SWAPFILE(inode)) + goto out_swapfile; + /* + * O_APPEND implies that we must revalidate the file length. + */ + if (iocb->ki_filp->f_flags & O_APPEND) { + result = nfs_revalidate_file_size(inode, iocb->ki_filp); + if (result) + goto out; + } + + result = count; + if (!count) + goto out; + + result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (result > 0) + written = result; + + /* Return error values for O_DSYNC and IS_SYNC() */ + if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { + int err = vfs_fsync(iocb->ki_filp, 0); + if (err < 0) + result = err; + } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); +out: + return result; + +out_swapfile: + printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); + goto out; +} + +static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t count, unsigned int flags) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + unsigned long written = 0; + ssize_t ret; + + dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + (unsigned long) count, (unsigned long long) *ppos); + + /* + * The combination of splice and an O_APPEND destination is disallowed. + */ + + ret = generic_file_splice_write(pipe, filp, ppos, count, flags); + if (ret > 0) + written = ret; + + if (ret >= 0 && nfs_need_sync_write(filp, inode)) { + int err = vfs_fsync(filp, 0); + if (err < 0) + ret = err; + } + if (ret > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + return ret; +} + +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status = 0; + unsigned int saved_type = fl->fl_type; + + /* Try local locking first */ + posix_test_lock(filp, fl); + if (fl->fl_type != F_UNLCK) { + /* found a conflict */ + goto out; + } + fl->fl_type = saved_type; + + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_noconflict; + + if (is_local) + goto out_noconflict; + + status = NFS_PROTO(inode)->lock(filp, cmd, fl); +out: + return status; +out_noconflict: + fl->fl_type = F_UNLCK; + goto out; +} + +static int do_vfs_lock(struct file *file, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_file_wait(file, fl); + break; + case FL_FLOCK: + res = flock_lock_file_wait(file, fl); + break; + default: + BUG(); + } + return res; +} + +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. + */ + nfs_sync_mapping(filp->f_mapping); + + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + return status; +} + +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) +{ + struct inode *inode = filp->f_mapping->host; + int status; + + /* + * Flush all pending writes before doing anything + * with locks.. + */ + status = nfs_sync_mapping(filp->f_mapping); + if (status != 0) + goto out; + + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = do_vfs_lock(filp, fl); + if (status < 0) + goto out; + + /* + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * + * This makes locking act as a cache coherency point. + */ + nfs_sync_mapping(filp->f_mapping); + if (!nfs_have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } +out: + return status; +} + +/* + * Lock a (portion of) a file + */ +static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int ret = -ENOLCK; + int is_local = 0; + + dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + fl->fl_type, fl->fl_flags, + (long long)fl->fl_start, (long long)fl->fl_end); + + nfs_inc_stats(inode, NFSIOS_VFSLOCK); + + /* No mandatory locks over NFS */ + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + goto out_err; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } + + if (IS_GETLK(cmd)) + ret = do_getlk(filp, cmd, fl, is_local); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl, is_local); + else + ret = do_setlk(filp, cmd, fl, is_local); +out_err: + return ret; +} + +/* + * Lock a (portion of) a file + */ +static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +{ + struct inode *inode = filp->f_mapping->host; + int is_local = 0; + + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + fl->fl_type, fl->fl_flags); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + + /* We're simulating flock() locks using posix locks on the server */ + fl->fl_owner = (fl_owner_t)filp; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + + if (fl->fl_type == F_UNLCK) + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); +} + +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ +static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) +{ + dprintk("NFS: setlease(%s/%s, arg=%ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, arg); + return -EINVAL; +} + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c new file mode 100644 index 00000000000..7cf2c4699b0 --- /dev/null +++ b/fs/nfs/fscache-index.c @@ -0,0 +1,337 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can than have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object. + */ +struct nfs_server_key { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + } addr[0]; +}; + +/* + * Generate a key to describe a server in the main NFS index + * - We return the length of the key, or 0 if we can't generate one + */ +static uint16_t nfs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_client *clp = cookie_netfs_data; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key *key = buffer; + uint16_t len = sizeof(struct nfs_server_key); + + key->nfsversion = clp->rpc_ops->version; + key->family = clp->cl_addr.ss_family; + + memset(key, 0, len); + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key->port = sin->sin_port; + key->addr[0].ipv4_addr = sin->sin_addr; + len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->port = sin6->sin6_port; + key->addr[0].ipv6_addr = sin6->sin6_addr; + len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + len = 0; + break; + } + + return len; +} + +/* + * Define the server object for FS-Cache. This is used to describe a server + * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and + * server address parameters. + */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of amount stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->vfs_inode.i_version; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries no state data + * - The netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->vfs_inode.i_version; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. + */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 00000000000..419119c371b --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,535 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + * + * The default uniquifier is just an empty string, but it may be overridden + * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent + * superblock across an automount point of some nature. + */ +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, + struct nfs_clone_mount *mntdata) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + int diff, ulen; + + if (uniq) { + ulen = strlen(uniq); + } else if (mntdata) { + struct nfs_server *mnt_s = NFS_SB(mntdata->sb); + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + } + } + + if (!uniq) { + uniq = ""; + ulen = 1; + } + + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss); + dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode_cookie(struct inode *inode) +{ + NFS_I(inode)->fscache = NULL; + if (S_ISREG(inode->i_mode)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); +} + +/* + * Get the per-inode cache cookie for an NFS inode. + */ +static void nfs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->fscache || !NFS_FSCACHE(inode)) + return; + + if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { + nfsi->fscache = fscache_acquire_cookie( + NFS_SB(sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", + sb, nfsi, nfsi->fscache); + } +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_release_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 0); + nfsi->fscache = NULL; +} + +/* + * Retire a per-inode cookie, destroying the data attached to it. + */ +void nfs_fscache_zap_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", + nfsi, nfsi->fscache); + + fscache_relinquish_cookie(nfsi->fscache, 1); + nfsi->fscache = NULL; +} + +/* + * Turn off the cache with regard to a per-inode cookie if opened for writing, + * invalidating all the pages in the page cache relating to the associated + * inode to clear the per-page caching. + */ +static void nfs_fscache_disable_inode_cookie(struct inode *inode) +{ + clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + + if (NFS_I(inode)->fscache) { + dfprintk(FSCACHE, + "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); + + /* Need to uncache any pages attached to this inode that + * fscache knows about before turning off the cache. + */ + fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); + nfs_fscache_zap_inode_cookie(inode); + } +} + +/* + * wait_on_bit() sleep function for uninterruptible waiting + */ +static int nfs_fscache_wait_bit(void *flags) +{ + schedule(); + return 0; +} + +/* + * Lock against someone else trying to also acquire or relinquish a cookie + */ +static inline void nfs_fscache_inode_lock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) + wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, + nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); +} + +/* + * Unlock cookie management lock + */ +static inline void nfs_fscache_inode_unlock(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + smp_mb__before_clear_bit(); + clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); +} + +/* + * Decide if we should enable or disable local caching for this inode. + * - For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * - May be invoked multiple times in parallel by parallel nfs_open() functions. + */ +void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if (NFS_FSCACHE(inode)) { + nfs_fscache_inode_lock(inode); + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + nfs_fscache_disable_inode_cookie(inode); + else + nfs_fscache_enable_inode_cookie(inode); + nfs_fscache_inode_unlock(inode); + } +} + +/* + * Replace a per-inode cookie due to revalidation detecting a file having + * changed on the server. + */ +void nfs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *nfss = NFS_SERVER(inode); + struct fscache_cookie *old = nfsi->fscache; + + nfs_fscache_inode_lock(inode); + if (nfsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(nfsi->fscache, 1); + + nfsi->fscache = fscache_acquire_cookie( + nfss->nfs_client->fscache, + &nfs_fscache_inode_object_def, + nfsi); + + dfprintk(FSCACHE, + "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", + nfss, nfsi, old, nfsi->fscache); + } + nfs_fscache_inode_unlock(inode); +} + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). + */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, nfsi); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. + */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + unsigned npages = *nr_pages; + int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + NFS_I(inode)->fscache, npages, inode); + + ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, + mapping, pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + NFS_I(inode)->fscache, page, page->index, page->flags, sync); + + ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(NFS_I(inode)->fscache, page); + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } else { + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h new file mode 100644 index 00000000000..b9c572d0679 --- /dev/null +++ b/fs/nfs/fscache.h @@ -0,0 +1,222 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/fscache.h> + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, + const char *, + struct nfs_clone_mount *); +extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode_cookie(struct inode *); +extern void nfs_fscache_release_inode_cookie(struct inode *); +extern void nfs_fscache_zap_inode_cookie(struct inode *); +extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void nfs_fscache_reset_inode_cookie(struct inode *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. + */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_get_super_cookie( + struct super_block *sb, + const char *uniq, + struct nfs_clone_mount *mntdata) +{ +} +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c new file mode 100644 index 00000000000..dcb61548887 --- /dev/null +++ b/fs/nfs/getroot.c @@ -0,0 +1,267 @@ +/* getroot.c: get the root dentry for an NFS mount + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> + +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/lockd/bind.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/nfs_idmap.h> +#include <linux/vfs.h> +#include <linux/namei.h> +#include <linux/security.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Set the superblock root dentry. + * Note that this function frees the inode in case of error. + */ +static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) +{ + /* The mntroot acts as the dummy root dentry for this superblock */ + if (sb->s_root == NULL) { + sb->s_root = d_alloc_root(inode); + if (sb->s_root == NULL) { + iput(inode); + return -ENOMEM; + } + ihold(inode); + /* + * Ensure that this dentry is invisible to d_find_alias(). + * Otherwise, it may be spliced into the tree by + * d_materialise_unique if a parent directory from the same + * filesystem gets mounted at a later time. + * This again causes shrink_dcache_for_umount_subtree() to + * Oops, since the test for IS_ROOT() will fail. + */ + spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&sb->s_root->d_lock); + list_del_init(&sb->s_root->d_alias); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&sb->s_root->d_inode->i_lock); + } + return 0; +} + +/* + * get an NFS2/NFS3 root dentry from the root filehandle + */ +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the actual root for this mount */ + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + if (name) + kfree(name); + nfs_free_fattr(fsinfo.fattr); + return ret; +} + +#ifdef CONFIG_NFS_V4 + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) +{ + struct nfs_fsinfo fsinfo; + int ret = -ENOMEM; + + dprintk("--> nfs4_get_rootfh()\n"); + + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; + + /* Start by getting the root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; + } + + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); + ret = -ENOTDIR; + goto out; + } + + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); + ret = -EREMOTE; + goto out; + } + + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; +} + +/* + * get an NFS4 root dentry from the root filehandle + */ +struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fattr *fattr = NULL; + struct dentry *ret; + struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); + int error; + + dprintk("--> nfs4_get_root()\n"); + + if (!name) + return ERR_PTR(-ENOMEM); + + /* get the info about the server and filesystem */ + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs_get_root: getcaps error = %d\n", + -error); + kfree(name); + return ERR_PTR(error); + } + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); + } + + /* get the actual root for this mount */ + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + ret = ERR_PTR(error); + goto out; + } + + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + inode = nfs_fhget(sb, mntfh, fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + ret = ERR_CAST(inode); + goto out; + } + + error = nfs_superblock_set_dummy_root(sb, inode); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { + dprintk("nfs_get_root: get root dentry failed\n"); + goto out; + } + + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + if (name) + kfree(name); + nfs_free_fattr(fattr); + dprintk("<-- nfs4_get_root()\n"); + return ret; +} + +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c new file mode 100644 index 00000000000..24228135c1a --- /dev/null +++ b/fs/nfs/idmap.c @@ -0,0 +1,847 @@ +/* + * fs/nfs/idmap.c + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> +#include <linux/cred.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs_sb.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <linux/rcupdate.h> +#include <linux/err.h> +#include <keys/user-type.h> + +/* include files needed by legacy idmapper */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/init.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/sched.h> +#include <linux/sunrpc/clnt.h> +#include <linux/workqueue.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" + +#define NFS_UINT_MAXLEN 11 +#define IDMAP_HASH_SZ 128 + +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600 * HZ; +const struct cred *id_resolver_cache; + + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) +{ + fattr->owner_name = owner_name; + fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + __u32 uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + __u32 gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. + */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (strict_strtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + +struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +int nfs_idmap_init(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void nfs_idmap_quit(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. + */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!*desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, void *data, size_t data_size) +{ + const struct cred *saved_cred; + struct key *rkey; + char *desc; + struct user_key_payload *payload; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + goto out; + + saved_cred = override_creds(id_resolver_cache); + rkey = request_key(&key_type_id_resolver, desc, ""); + revert_creds(saved_cred); + kfree(desc); + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW|KEY_USR_WRITE; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.data); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, + const char *type, __u32 *id) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = strict_strtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +/* idmap classic begins here */ +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); + +struct idmap_hashent { + unsigned long ih_expires; + __u32 ih_id; + size_t ih_namelen; + const char *ih_name; +}; + +struct idmap_hashtable { + __u8 h_type; + struct idmap_hashent *h_entries; +}; + +struct idmap { + struct dentry *idmap_dentry; + wait_queue_head_t idmap_wq; + struct idmap_msg idmap_im; + struct mutex idmap_lock; /* Serializes upcalls */ + struct mutex idmap_im_lock; /* Protects the hashtable */ + struct idmap_hashtable idmap_user_hash; + struct idmap_hashtable idmap_group_hash; +}; + +static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); +static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); + +static unsigned int fnvhash32(const void *, size_t); + +static const struct rpc_pipe_ops idmap_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = idmap_pipe_downcall, + .destroy_msg = idmap_pipe_destroy_msg, +}; + +int +nfs_idmap_new(struct nfs_client *clp) +{ + struct idmap *idmap; + int error; + + BUG_ON(clp->cl_idmap != NULL); + + idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); + if (idmap == NULL) + return -ENOMEM; + + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); + if (IS_ERR(idmap->idmap_dentry)) { + error = PTR_ERR(idmap->idmap_dentry); + kfree(idmap); + return error; + } + + mutex_init(&idmap->idmap_lock); + mutex_init(&idmap->idmap_im_lock); + init_waitqueue_head(&idmap->idmap_wq); + idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; + idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; + + clp->cl_idmap = idmap; + return 0; +} + +static void +idmap_alloc_hashtable(struct idmap_hashtable *h) +{ + if (h->h_entries != NULL) + return; + h->h_entries = kcalloc(IDMAP_HASH_SZ, + sizeof(*h->h_entries), + GFP_KERNEL); +} + +static void +idmap_free_hashtable(struct idmap_hashtable *h) +{ + int i; + + if (h->h_entries == NULL) + return; + for (i = 0; i < IDMAP_HASH_SZ; i++) + kfree(h->h_entries[i].ih_name); + kfree(h->h_entries); +} + +void +nfs_idmap_delete(struct nfs_client *clp) +{ + struct idmap *idmap = clp->cl_idmap; + + if (!idmap) + return; + rpc_unlink(idmap->idmap_dentry); + clp->cl_idmap = NULL; + idmap_free_hashtable(&idmap->idmap_user_hash); + idmap_free_hashtable(&idmap->idmap_group_hash); + kfree(idmap); +} + +/* + * Helper routines for manipulating the hashtable + */ +static inline struct idmap_hashent * +idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) +{ + if (h->h_entries == NULL) + return NULL; + return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; +} + +static struct idmap_hashent * +idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) +{ + struct idmap_hashent *he = idmap_name_hash(h, name, len); + + if (he == NULL) + return NULL; + if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; + return he; +} + +static inline struct idmap_hashent * +idmap_id_hash(struct idmap_hashtable* h, __u32 id) +{ + if (h->h_entries == NULL) + return NULL; + return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; +} + +static struct idmap_hashent * +idmap_lookup_id(struct idmap_hashtable *h, __u32 id) +{ + struct idmap_hashent *he = idmap_id_hash(h, id); + + if (he == NULL) + return NULL; + if (he->ih_id != id || he->ih_namelen == 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; + return he; +} + +/* + * Routines for allocating new entries in the hashtable. + * For now, we just have 1 entry per bucket, so it's all + * pretty trivial. + */ +static inline struct idmap_hashent * +idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) +{ + idmap_alloc_hashtable(h); + return idmap_name_hash(h, name, len); +} + +static inline struct idmap_hashent * +idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +{ + idmap_alloc_hashtable(h); + return idmap_id_hash(h, id); +} + +static void +idmap_update_entry(struct idmap_hashent *he, const char *name, + size_t namelen, __u32 id) +{ + char *str = kmalloc(namelen + 1, GFP_KERNEL); + if (str == NULL) + return; + kfree(he->ih_name); + he->ih_id = id; + memcpy(str, name, namelen); + str[namelen] = '\0'; + he->ih_name = str; + he->ih_namelen = namelen; + he->ih_expires = jiffies + nfs_idmap_cache_timeout; +} + +/* + * Name -> ID + */ +static int +nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, + const char *name, size_t namelen, __u32 *id) +{ + struct rpc_pipe_msg msg; + struct idmap_msg *im; + struct idmap_hashent *he; + DECLARE_WAITQUEUE(wq, current); + int ret = -EIO; + + im = &idmap->idmap_im; + + /* + * String sanity checks + * Note that the userland daemon expects NUL terminated strings + */ + for (;;) { + if (namelen == 0) + return -EINVAL; + if (name[namelen-1] != '\0') + break; + namelen--; + } + if (namelen >= IDMAP_NAMESZ) + return -EINVAL; + + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); + + he = idmap_lookup_name(h, name, namelen); + if (he != NULL) { + *id = he->ih_id; + ret = 0; + goto out; + } + + memset(im, 0, sizeof(*im)); + memcpy(im->im_name, name, namelen); + + im->im_type = h->h_type; + im->im_conv = IDMAP_CONV_NAMETOID; + + memset(&msg, 0, sizeof(msg)); + msg.data = im; + msg.len = sizeof(*im); + + add_wait_queue(&idmap->idmap_wq, &wq); + if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { + remove_wait_queue(&idmap->idmap_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&idmap->idmap_im_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&idmap->idmap_wq, &wq); + mutex_lock(&idmap->idmap_im_lock); + + if (im->im_status & IDMAP_STATUS_SUCCESS) { + *id = im->im_id; + ret = 0; + } + + out: + memset(im, 0, sizeof(*im)); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); + return ret; +} + +/* + * ID -> Name + */ +static int +nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, + __u32 id, char *name) +{ + struct rpc_pipe_msg msg; + struct idmap_msg *im; + struct idmap_hashent *he; + DECLARE_WAITQUEUE(wq, current); + int ret = -EIO; + unsigned int len; + + im = &idmap->idmap_im; + + mutex_lock(&idmap->idmap_lock); + mutex_lock(&idmap->idmap_im_lock); + + he = idmap_lookup_id(h, id); + if (he) { + memcpy(name, he->ih_name, he->ih_namelen); + ret = he->ih_namelen; + goto out; + } + + memset(im, 0, sizeof(*im)); + im->im_type = h->h_type; + im->im_conv = IDMAP_CONV_IDTONAME; + im->im_id = id; + + memset(&msg, 0, sizeof(msg)); + msg.data = im; + msg.len = sizeof(*im); + + add_wait_queue(&idmap->idmap_wq, &wq); + + if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { + remove_wait_queue(&idmap->idmap_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&idmap->idmap_im_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&idmap->idmap_wq, &wq); + mutex_lock(&idmap->idmap_im_lock); + + if (im->im_status & IDMAP_STATUS_SUCCESS) { + if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) + goto out; + memcpy(name, im->im_name, len); + ret = len; + } + + out: + memset(im, 0, sizeof(*im)); + mutex_unlock(&idmap->idmap_im_lock); + mutex_unlock(&idmap->idmap_lock); + return ret; +} + +static ssize_t +idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); + struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_msg im_in, *im = &idmap->idmap_im; + struct idmap_hashtable *h; + struct idmap_hashent *he = NULL; + size_t namelen_in; + int ret; + + if (mlen != sizeof(im_in)) + return -ENOSPC; + + if (copy_from_user(&im_in, src, mlen) != 0) + return -EFAULT; + + mutex_lock(&idmap->idmap_im_lock); + + ret = mlen; + im->im_status = im_in.im_status; + /* If we got an error, terminate now, and wake up pending upcalls */ + if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { + wake_up(&idmap->idmap_wq); + goto out; + } + + /* Sanity checking of strings */ + ret = -EINVAL; + namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) + goto out; + + switch (im_in.im_type) { + case IDMAP_TYPE_USER: + h = &idmap->idmap_user_hash; + break; + case IDMAP_TYPE_GROUP: + h = &idmap->idmap_group_hash; + break; + default: + goto out; + } + + switch (im_in.im_conv) { + case IDMAP_CONV_IDTONAME: + /* Did we match the current upcall? */ + if (im->im_conv == IDMAP_CONV_IDTONAME + && im->im_type == im_in.im_type + && im->im_id == im_in.im_id) { + /* Yes: copy string, including the terminating '\0' */ + memcpy(im->im_name, im_in.im_name, namelen_in); + im->im_name[namelen_in] = '\0'; + wake_up(&idmap->idmap_wq); + } + he = idmap_alloc_id(h, im_in.im_id); + break; + case IDMAP_CONV_NAMETOID: + /* Did we match the current upcall? */ + if (im->im_conv == IDMAP_CONV_NAMETOID + && im->im_type == im_in.im_type + && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in + && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { + im->im_id = im_in.im_id; + wake_up(&idmap->idmap_wq); + } + he = idmap_alloc_name(h, im_in.im_name, namelen_in); + break; + default: + goto out; + } + + /* If the entry is valid, also copy it to the cache */ + if (he != NULL) + idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); + ret = mlen; +out: + mutex_unlock(&idmap->idmap_im_lock); + return ret; +} + +static void +idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct idmap_msg *im = msg->data; + struct idmap *idmap = container_of(im, struct idmap, idmap_im); + + if (msg->errno >= 0) + return; + mutex_lock(&idmap->idmap_im_lock); + im->im_status = IDMAP_STATUS_LOOKUPFAIL; + wake_up(&idmap->idmap_wq); + mutex_unlock(&idmap->idmap_im_lock); +} + +/* + * Fowler/Noll/Vo hash + * http://www.isthe.com/chongo/tech/comp/fnv/ + */ + +#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ +#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */ + +static unsigned int fnvhash32(const void *buf, size_t buflen) +{ + const unsigned char *p, *end = (const unsigned char *)buf + buflen; + unsigned int hash = FNV_1_32; + + for (p = buf; p < end; p++) { + hash *= FNV_P_32; + hash ^= (unsigned int)*p; + } + + return hash; +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; + ret = nfs_idmap_lookup_id(name, namelen, "uid", uid); + if (ret < 0) + ret = nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); + return ret; +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (nfs_map_string_to_numeric(name, namelen, gid)) + return 0; + ret = nfs_idmap_lookup_id(name, namelen, "gid", gid); + if (ret < 0) + ret = nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, gid); + return ret; +} + +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) { + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + if (ret < 0) + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + } + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) { + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (ret < 0) + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, gid, buf); + } + if (ret < 0) + ret = nfs_map_numeric_to_string(gid, buf, buflen); + return ret; +} diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c new file mode 100644 index 00000000000..f649fba8c38 --- /dev/null +++ b/fs/nfs/inode.c @@ -0,0 +1,1660 @@ +/* + * linux/fs/nfs/inode.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs inode and superblock handling functions + * + * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. + * J.S.Peatfield@damtp.cam.ac.uk + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/metrics.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/lockd/bind.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/nfs_idmap.h> +#include <linux/vfs.h> +#include <linux/inet.h> +#include <linux/nfs_xdr.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/freezer.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" +#include "fscache.h" +#include "dns_resolve.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 + +/* Default is to see 64-bit inode numbers */ +static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; + +static void nfs_invalidate_inode(struct inode *); +static int nfs_update_inode(struct inode *, struct nfs_fattr *); + +static struct kmem_cache * nfs_inode_cachep; + +static inline unsigned long +nfs_fattr_to_ino_t(struct nfs_fattr *fattr) +{ + return nfs_fileid_to_ino_t(fattr->fileid); +} + +/** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule(); + return 0; +} + +/** + * nfs_compat_user_ino64 - returns the user-visible inode number + * @fileid: 64-bit fileid + * + * This function returns a 32-bit inode number if the boot parameter + * nfs.enable_ino64 is zero. + */ +u64 nfs_compat_user_ino64(u64 fileid) +{ +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif + + if (enable_ino64) + return fileid; + ino = fileid; + if (sizeof(ino) < sizeof(fileid)) + ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; + return ino; +} + +static void nfs_clear_inode(struct inode *inode) +{ + /* + * The following should never happen... + */ + BUG_ON(nfs_have_writebacks(inode)); + BUG_ON(!list_empty(&NFS_I(inode)->open_files)); + nfs_zap_acl_cache(inode); + nfs_access_zap_cache(inode); + nfs_fscache_release_inode_cookie(inode); +} + +void nfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + nfs_clear_inode(inode); +} + +/** + * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + */ +int nfs_sync_mapping(struct address_space *mapping) +{ + int ret = 0; + + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } + return ret; +} + +/* + * Invalidate the local caches + */ +static void nfs_zap_caches_locked(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int mode = inode->i_mode; + + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = jiffies; + + memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; +} + +void nfs_zap_caches(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_zap_caches_locked(inode); + spin_unlock(&inode->i_lock); +} + +void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) +{ + if (mapping->nrpages != 0) { + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + } +} + +void nfs_zap_acl_cache(struct inode *inode) +{ + void (*clear_acl_cache)(struct inode *); + + clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; + if (clear_acl_cache != NULL) + clear_acl_cache(inode); + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); +} + +void nfs_invalidate_atime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&inode->i_lock); +} + +/* + * Invalidate, but do not unhash, the inode. + * NB: must be called with inode->i_lock held! + */ +static void nfs_invalidate_inode(struct inode *inode) +{ + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_zap_caches_locked(inode); +} + +struct nfs_find_desc { + struct nfs_fh *fh; + struct nfs_fattr *fattr; +}; + +/* + * In NFSv3 we can have 64bit inode numbers. In order to support + * this, and re-exported directories (also seen in NFSv2) + * we are forced to allow 2 different inodes to have the same + * i_ino. + */ +static int +nfs_find_actor(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fh *fh = desc->fh; + struct nfs_fattr *fattr = desc->fattr; + + if (NFS_FILEID(inode) != fattr->fileid) + return 0; + if (nfs_compare_fh(NFS_FH(inode), fh)) + return 0; + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + return 1; +} + +static int +nfs_init_locked(struct inode *inode, void *opaque) +{ + struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_fattr *fattr = desc->fattr; + + set_nfs_fileid(inode, fattr->fileid); + nfs_copy_fh(NFS_FH(inode), desc->fh); + return 0; +} + +/* + * This is our front-end to iget that looks up inodes by file handle + * instead of inode number. + */ +struct inode * +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + struct nfs_find_desc desc = { + .fh = fh, + .fattr = fattr + }; + struct inode *inode = ERR_PTR(-ENOENT); + unsigned long hash; + + nfs_attr_check_mountpoint(sb, fattr); + + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) + goto out_no_inode; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) + goto out_no_inode; + + hash = nfs_fattr_to_ino_t(fattr); + + inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + goto out_no_inode; + } + + if (inode->i_state & I_NEW) { + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; + + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ + inode->i_ino = hash; + + /* We can't support update_atime(), since the server will reset it */ + inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; + inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + /* Deal with crossing mountpoints */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + inode->i_op = &nfs_referral_inode_operations; + else + inode->i_op = &nfs_mountpoint_inode_operations; + inode->i_fop = NULL; + inode->i_flags |= S_AUTOMOUNT; + } + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode->i_version = 0; + inode->i_size = 0; + clear_nlink(inode); + inode->i_uid = -2; + inode->i_gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + + nfsi->read_cache_jiffies = fattr->time_start; + nfsi->attr_gencount = fattr->gencount; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + inode->i_version = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + set_nlink(inode, fattr->nlink); + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->access_cache = RB_ROOT; + + nfs_fscache_init_inode_cookie(inode); + + unlock_new_inode(inode); + } else + nfs_refresh_inode(inode, fattr); + dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + atomic_read(&inode->i_count)); + +out: + return inode; + +out_no_inode: + dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); + goto out; +} + +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) + +int +nfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct nfs_fattr *fattr; + int error = -ENOMEM; + + nfs_inc_stats(inode, NFSIOS_VFSSETATTR); + + /* skip mode change if it's just for clearing setuid/setgid */ + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + attr->ia_valid &= ~ATTR_SIZE; + } + + /* Optimization: if the end result is no change, don't RPC */ + attr->ia_valid &= NFS_VALID_ATTRS; + if ((attr->ia_valid & ~ATTR_FILE) == 0) + return 0; + + /* Write all dirty data */ + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + /* + * Return any delegations if we're going to change ACLs + */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs_inode_return_delegation(inode); + error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); + if (error == 0) + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + return error; +} + +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. + */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + loff_t oldsize; + int err; + + err = inode_newsize_ok(inode, offset); + if (err) + goto out; + + spin_lock(&inode->i_lock); + oldsize = inode->i_size; + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, oldsize, offset); +out: + return err; +} + +/** + * nfs_setattr_update_inode - Update inode metadata after a setattr call. + * @inode: pointer to struct inode + * @attr: pointer to struct iattr + * + * Note: we do this in the *proc.c in order to ensure that + * it works for things like exclusive creates too. + */ +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +{ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); + if ((attr->ia_valid & ATTR_MODE) != 0) { + int mode = attr->ia_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; + } + if ((attr->ia_valid & ATTR_UID) != 0) + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); + } + if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); + nfs_vmtruncate(inode, attr->ia_size); + } +} + +int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + int err; + + /* Flush out writes to the server in order to update c/mtime. */ + if (S_ISREG(inode->i_mode)) { + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto out; + } + + /* + * We may force a getattr if the user cares about atime. + * + * Note that we only have to check the vfsmount flags here: + * - NFS always sets S_NOATIME by so checking it would give a + * bogus result + * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * no point in checking those. + */ + if ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) + need_atime = 0; + + if (need_atime) + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (!err) { + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + } +out: + return err; +} + +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} + +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) +{ + struct nfs_open_context *ctx; + struct rpc_cred *cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + put_rpccred(cred); + return ERR_PTR(-ENOMEM); + } + nfs_sb_active(dentry->d_sb); + ctx->dentry = dget(dentry); + ctx->cred = cred; + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); + return ctx; +} + +struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) +{ + if (ctx != NULL) + atomic_inc(&ctx->lock_context.count); + return ctx; +} + +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode = ctx->dentry->d_inode; + struct super_block *sb = ctx->dentry->d_sb; + + if (!list_empty(&ctx->list)) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + spin_unlock(&inode->i_lock); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + return; + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); + if (ctx->cred != NULL) + put_rpccred(ctx->cred); + dput(ctx->dentry); + nfs_sb_deactive(sb); + kfree(ctx); +} + +void put_nfs_open_context(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 0); +} + +/* + * Ensure that mmap has a recent RPC credential for use when writing out + * shared pages + */ +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + filp->private_data = get_nfs_open_context(ctx); + spin_lock(&inode->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&inode->i_lock); +} + +/* + * Given an inode, search for an open context with the desired characteristics + */ +struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *pos, *ctx = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &nfsi->open_files, list) { + if (cred != NULL && pos->cred != cred) + continue; + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; + } + spin_unlock(&inode->i_lock); + return ctx; +} + +static void nfs_file_clear_open_context(struct file *filp) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + + if (ctx) { + filp->private_data = NULL; + spin_lock(&inode->i_lock); + list_move_tail(&ctx->list, &NFS_I(inode)->open_files); + spin_unlock(&inode->i_lock); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + } +} + +/* + * These allocate and release file read/write context information. + */ +int nfs_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + nfs_file_set_open_context(filp, ctx); + put_nfs_open_context(ctx); + nfs_fscache_set_inode_cookie(inode, filp); + return 0; +} + +int nfs_release(struct inode *inode, struct file *filp) +{ + nfs_file_clear_open_context(filp); + return 0; +} + +/* + * This function is called whenever some part of NFS notices that + * the cached attributes have to be refreshed. + */ +int +__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + int status = -ESTALE; + struct nfs_fattr *fattr = NULL; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + + if (is_bad_inode(inode)) + goto out; + if (NFS_STALE(inode)) + goto out; + + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + if (status != 0) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + goto out; + } + + status = nfs_refresh_inode(inode, fattr); + if (status) { + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), status); + goto out; + } + + if (nfsi->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + + dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + + out: + nfs_free_fattr(fattr); + return status; +} + +int nfs_attribute_timeout(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +static int nfs_attribute_cache_expired(struct inode *inode) +{ + if (nfs_have_delegated_attributes(inode)) + return 0; + return nfs_attribute_timeout(inode); +} + +/** + * nfs_revalidate_inode - Revalidate the inode attributes + * @server - pointer to nfs_server struct + * @inode - pointer to inode struct + * + * Updates inode attribute information by retrieving the data from the server. + */ +int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +{ + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) + && !nfs_attribute_cache_expired(inode)) + return NFS_STALE(inode) ? -ESTALE : 0; + return __nfs_revalidate_inode(server, inode); +} + +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (mapping->nrpages != 0) { + int ret = invalidate_inode_pages2(mapping); + if (ret < 0) + return ret; + } + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + spin_unlock(&inode->i_lock); + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); + nfs_fscache_reset_inode_cookie(inode); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", + inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + return 0; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int ret = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_cache_expired(inode) + || NFS_STALE(inode)) { + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + goto out; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + ret = nfs_invalidate_mapping(inode, mapping); +out: + return ret; +} + +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long ret = 0; + + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; + } + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; + } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + return ret; +} + +/** + * nfs_check_inode_attributes - verify consistency of the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Verifies the attribute cache. If we have just changed the attributes, + * so that fattr carries weak cache consistency data, then it may + * also update the ctime/mtime/change_attribute. + */ +static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_size, new_isize; + unsigned long invalid = 0; + + + /* Has the inode gone and changed behind our back? */ + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + return -EIO; + + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } + + /* Have any file permissions changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + + /* Has the link count changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + invalid |= NFS_INO_INVALID_ATIME; + + if (invalid != 0) + nfsi->cache_validity |= invalid; + + nfsi->read_cache_jiffies = fattr->time_start; + return 0; +} + +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static atomic_long_t nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + return atomic_long_read(&nfs_attr_generation_counter); +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + return atomic_long_inc_return(&nfs_attr_generation_counter); +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + fattr->owner_name = NULL; + fattr->group_name = NULL; +} + +struct nfs_fattr *nfs_alloc_fattr(void) +{ + struct nfs_fattr *fattr; + + fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + if (fattr != NULL) + nfs_fattr_init(fattr); + return fattr; +} + +struct nfs_fh *nfs_alloc_fhandle(void) +{ + struct nfs_fh *fh; + + fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + if (fh != NULL) + fh->size = 0; + return fh; +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + +/** + * nfs_refresh_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * Check that an RPC call that returned attributes has not overlapped with + * other recent updates of the inode metadata, then decide whether it is + * safe to do a full update of the inode attributes, or whether just to + * call nfs_check_inode_attributes. + */ +int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + spin_lock(&inode->i_lock); + status = nfs_refresh_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + + return status; +} + +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + +/** + * nfs_post_op_update_inode - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. + * + * NB: if the server didn't return any post op attributes, this + * function will force the retrieval of attributes before the next + * NFS request. Thus it should be used only for operations that + * are expected to change one or more attributes, to avoid + * unnecessary NFS requests and trips through nfs_update_inode(). + */ +int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); + goto out_noforce; + } + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + fattr->pre_change_attr = inode->i_version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { + memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { + memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { + fattr->pre_size = i_size_read(inode); + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + } +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; +} + +/* + * Many nfs protocol calls return the new file attributes after + * an operation. Here we update the inode to reflect the state + * of the server's inode. + * + * This is a bit tricky because we have to make sure all dirty pages + * have been sent off to the server before calling invalidate_inode_pages. + * To make sure no other process adds more write requests while we try + * our best to flush them, we make them sleep during the attribute refresh. + * + * A very similar scenario holds for the dir cache. + */ +static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_isize, new_isize; + unsigned long invalid = 0; + unsigned long now = jiffies; + unsigned long save_cache_validity; + + dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, + atomic_read(&inode->i_count), fattr->valid); + + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + goto out_fileid; + + /* + * Make sure the inode's type hasn't changed. + */ + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + goto out_changed; + + server = NFS_SERVER(inode); + /* Update the fsid? */ + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && + !nfs_fsid_equal(&server->fsid, &fattr->fsid) && + !IS_AUTOMOUNT(inode)) + server->fsid = fattr->fsid; + + /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; + + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); + + /* Do atomic weak cache consistency updates */ + invalid |= nfs_wcc_update_inode(inode, fattr); + + /* More cache consistency checks */ + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (inode->i_version != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + inode->i_version = fattr->change_attr; + } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + /* NFSv2/v3: Check if the mtime agrees */ + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { + dprintk("NFS: mtime change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + /* If ctime has changed we should definitely clear access+acl caches */ + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + /* and probably clear data for a directory too as utimes can cause + * havoc with our cache. + */ + if (S_ISDIR(inode->i_mode)) { + invalid |= NFS_INO_INVALID_DATA; + nfs_force_lookup_revalidate(inode); + } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + /* Check if our cached file size is stale */ + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? */ + if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || + new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + dprintk("NFS: isize change on server for file %s/%ld " + "(%Ld to %Ld)\n", + inode->i_sb->s_id, + inode->i_ino, + (long long)cur_isize, + (long long)new_isize); + } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (inode->i_uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (inode->i_gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + set_nlink(inode, fattr->nlink); + } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + /* + * report the blocks in 512byte units + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + + /* Update attrtimeo value if we're out of the unstable period */ + if (invalid & NFS_INO_INVALID_ATTR) { + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + } else { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + } + } + invalid &= ~NFS_INO_INVALID_ATTR; + /* Don't invalidate the data if we were to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + invalid &= ~NFS_INO_INVALID_DATA; + if (!nfs_have_delegation(inode, FMODE_READ) || + (save_cache_validity & NFS_INO_REVAL_FORCED)) + nfsi->cache_validity |= invalid; + + return 0; + out_changed: + /* + * Big trouble! The inode has become a different object. + */ + printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", + __func__, inode->i_ino, inode->i_mode, fattr->mode); + out_err: + /* + * No need to worry about unhashing the dentry, as the + * lookup validation will know that the inode is bad. + * (But we fall through to invalidate the caches.) + */ + nfs_invalidate_inode(inode); + return -ESTALE; + + out_fileid: + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, + (long long)nfsi->fileid, (long long)fattr->fileid); + goto out_err; +} + + +#ifdef CONFIG_NFS_V4 + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +void nfs4_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); +} +#endif + +struct inode *nfs_alloc_inode(struct super_block *sb) +{ + struct nfs_inode *nfsi; + nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + if (!nfsi) + return NULL; + nfsi->flags = 0UL; + nfsi->cache_validity = 0UL; +#ifdef CONFIG_NFS_V3_ACL + nfsi->acl_access = ERR_PTR(-EAGAIN); + nfsi->acl_default = ERR_PTR(-EAGAIN); +#endif +#ifdef CONFIG_NFS_V4 + nfsi->nfs4_acl = NULL; +#endif /* CONFIG_NFS_V4 */ + return &nfsi->vfs_inode; +} + +static void nfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); +} + +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + +static inline void nfs4_init_once(struct nfs_inode *nfsi) +{ +#ifdef CONFIG_NFS_V4 + INIT_LIST_HEAD(&nfsi->open_states); + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; + atomic_set(&nfsi->commits_outstanding, 0); +#endif +} + +static void init_once(void *foo) +{ + struct nfs_inode *nfsi = (struct nfs_inode *) foo; + + inode_init_once(&nfsi->vfs_inode); + INIT_LIST_HEAD(&nfsi->open_files); + INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); + INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); + INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + nfsi->npages = 0; + nfsi->ncommit = 0; + atomic_set(&nfsi->silly_count, 1); + INIT_HLIST_HEAD(&nfsi->silly_list); + init_waitqueue_head(&nfsi->waitqueue); + nfs4_init_once(nfsi); +} + +static int __init nfs_init_inodecache(void) +{ + nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", + sizeof(struct nfs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (nfs_inode_cachep == NULL) + return -ENOMEM; + + return 0; +} + +static void nfs_destroy_inodecache(void) +{ + kmem_cache_destroy(nfs_inode_cachep); +} + +struct workqueue_struct *nfsiod_workqueue; + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + +/* + * Initialize NFS + */ +static int __init init_nfs_fs(void) +{ + int err; + + err = nfs_idmap_init(); + if (err < 0) + goto out9; + + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + + err = nfs_fscache_register(); + if (err < 0) + goto out7; + + err = nfsiod_start(); + if (err) + goto out6; + + err = nfs_fs_proc_init(); + if (err) + goto out5; + + err = nfs_init_nfspagecache(); + if (err) + goto out4; + + err = nfs_init_inodecache(); + if (err) + goto out3; + + err = nfs_init_readpagecache(); + if (err) + goto out2; + + err = nfs_init_writepagecache(); + if (err) + goto out1; + + err = nfs_init_directcache(); + if (err) + goto out0; + +#ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); +#endif + if ((err = register_nfs_fs()) != 0) + goto out; + return 0; +out: +#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); +#endif + nfs_destroy_directcache(); +out0: + nfs_destroy_writepagecache(); +out1: + nfs_destroy_readpagecache(); +out2: + nfs_destroy_inodecache(); +out3: + nfs_destroy_nfspagecache(); +out4: + nfs_fs_proc_exit(); +out5: + nfsiod_stop(); +out6: + nfs_fscache_unregister(); +out7: + nfs_dns_resolver_destroy(); +out8: + nfs_idmap_quit(); +out9: + return err; +} + +static void __exit exit_nfs_fs(void) +{ + nfs_destroy_directcache(); + nfs_destroy_writepagecache(); + nfs_destroy_readpagecache(); + nfs_destroy_inodecache(); + nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); + nfs_idmap_quit(); +#ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); +#endif + nfs_cleanup_cb_ident_idr(); + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +} + +/* Not quite true; I just maintain it */ +MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_LICENSE("GPL"); +module_param(enable_ino64, bool, 0644); + +module_init(init_nfs_fs) +module_exit(exit_nfs_fs) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h new file mode 100644 index 00000000000..8102db9b926 --- /dev/null +++ b/fs/nfs/internal.h @@ -0,0 +1,463 @@ +/* + * NFS internal definitions + */ + +#include "nfs4_fs.h" +#include <linux/mount.h> +#include <linux/security.h> + +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) + +struct nfs_string; + +/* Maximum number of readahead requests + * FIXME: this should really be a sysctl so that users may tune it to suit + * their needs. People that do NFS over a slow network, might for + * instance want to reduce it to something closer to 1 for improved + * interactive response. + */ +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (clp->cl_session) + return 1; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + +struct nfs_clone_mount { + const struct super_block *sb; + const struct dentry *dentry; + struct nfs_fh *fh; + struct nfs_fattr *fattr; + char *hostname; + char *mnt_path; + struct sockaddr *addr; + size_t addrlen; + rpc_authflavor_t authflavor; +}; + +/* + * Note: RFC 1813 doesn't limit the number of auth flavors that + * a server can return, so make something up. + */ +#define NFS_MAX_SECFLAVORS (12) + +/* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + +/* + * In-kernel mount arguments + */ +struct nfs_parsed_mount_data { + int flags; + int rsize, wsize; + int timeo, retrans; + int acregmin, acregmax, + acdirmin, acdirmax; + int namlen; + unsigned int options; + unsigned int bsize; + unsigned int auth_flavor_len; + rpc_authflavor_t auth_flavors[1]; + char *client_address; + unsigned int version; + unsigned int minorversion; + char *fscache_uniq; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + u32 version; + int port; + unsigned short protocol; + } mount_server; + + struct { + struct sockaddr_storage address; + size_t addrlen; + char *hostname; + char *export_path; + int port; + unsigned short protocol; + } nfs_server; + + struct security_mnt_opts lsm_opts; +}; + +/* mount_clnt.c */ +struct nfs_mount_request { + struct sockaddr *sap; + size_t salen; + char *hostname; + char *dirpath; + u32 version; + unsigned short protocol; + struct nfs_fh *fh; + int noresvport; + unsigned int *auth_flav_len; + rpc_authflavor_t *auth_flavs; +}; + +extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); + +/* client.c */ +extern struct rpc_program nfs_program; + +extern void nfs_cleanup_cb_ident_idr(void); +extern void nfs_put_client(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); +extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client * +nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); +extern struct nfs_server *nfs_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, + struct nfs_fh *); +extern void nfs_free_server(struct nfs_server *server); +extern struct nfs_server *nfs_clone_server(struct nfs_server *, + struct nfs_fh *, + struct nfs_fattr *); +extern void nfs_mark_client_ready(struct nfs_client *clp, int state); +extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto); +#ifdef CONFIG_PROC_FS +extern int __init nfs_fs_proc_init(void); +extern void nfs_fs_proc_exit(void); +#else +static inline int nfs_fs_proc_init(void) +{ + return 0; +} +static inline void nfs_fs_proc_exit(void) +{ +} +#endif + +/* nfs4namespace.c */ +#ifdef CONFIG_NFS_V4 +extern struct vfsmount *nfs_do_refmount(struct dentry *dentry); +#else +static inline +struct vfsmount *nfs_do_refmount(struct dentry *dentry) +{ + return ERR_PTR(-ENOENT); +} +#endif + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +/* pagelist.c */ +extern int __init nfs_init_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); +extern int __init nfs_init_readpagecache(void); +extern void nfs_destroy_readpagecache(void); +extern int __init nfs_init_writepagecache(void); +extern void nfs_destroy_writepagecache(void); + +extern int __init nfs_init_directcache(void); +extern void nfs_destroy_directcache(void); + +/* nfs2xdr.c */ +extern int nfs_stat_to_errno(enum nfs_stat); +extern struct rpc_procinfo nfs_procedures[]; +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs3xdr.c */ +extern struct rpc_procinfo nfs3_procedures[]; +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); + +/* nfs4xdr.c */ +#ifdef CONFIG_NFS_V4 +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); +#endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +#endif + +/* nfs4proc.c */ +#ifdef CONFIG_NFS_V4 +extern struct rpc_procinfo nfs4_procedures[]; +void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *); +#endif + +extern int nfs4_init_ds_session(struct nfs_client *clp); + +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern int nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport); + +/* dir.c */ +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc); + +/* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; +extern struct inode *nfs_alloc_inode(struct super_block *sb); +extern void nfs_destroy_inode(struct inode *); +extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern void nfs_evict_inode(struct inode *); +#ifdef CONFIG_NFS_V4 +extern void nfs4_evict_inode(struct inode *); +#endif +void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); + +/* super.c */ +extern struct file_system_type nfs_xdev_fs_type; +#ifdef CONFIG_NFS_V4 +extern struct file_system_type nfs4_xdev_fs_type; +extern struct file_system_type nfs4_referral_fs_type; +#endif + +extern struct rpc_stat nfs_rpcstat; + +extern int __init register_nfs_fs(void); +extern void __exit unregister_nfs_fs(void); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); + +/* namespace.c */ +extern char *nfs_path(char **p, struct dentry *dentry, + char *buffer, ssize_t buflen); +extern struct vfsmount *nfs_d_automount(struct path *path); +#ifdef CONFIG_NFS_V4 +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); +#endif + +/* getroot.c */ +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, + const char *); +#ifdef CONFIG_NFS_V4 +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, + const char *); + +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); +#endif + +struct nfs_pageio_descriptor; +/* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); +extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, + struct list_head *head); + +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode); +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_readdata_release(struct nfs_read_data *rdata); + +/* write.c */ +extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, + struct list_head *head); +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_writedata_release(struct nfs_write_data *wdata); +extern void nfs_commit_free(struct nfs_write_data *p); +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg); +void nfs_commit_clear_lock(struct nfs_inode *nfsi); +void nfs_commitdata_release(void *data); +void nfs_commit_release_pages(struct nfs_write_data *data); + +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *, enum migrate_mode); +#else +#define nfs_migrate_page NULL +#endif + +/* nfs4proc.c */ +extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); +extern int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport); +extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); +extern int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); +extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + +/* + * Determine the device name as a string + */ +static inline char *nfs_devname(struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + char *dummy; + return nfs_path(&dummy, dentry, buffer, buflen); +} + +/* + * Determine the actual block size (and log2 thereof) + */ +static inline +unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) +{ + /* make sure blocksize is a power of two */ + if ((bsize & (bsize - 1)) || nrbitsp) { + unsigned char nrbits; + + for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + ; + bsize = 1 << nrbits; + if (nrbitsp) + *nrbitsp = nrbits; + } + + return bsize; +} + +/* + * Calculate the number of 512byte blocks used. + */ +static inline blkcnt_t nfs_calc_block_size(u64 tsize) +{ + blkcnt_t used = (tsize + 511) >> 9; + return (used > ULONG_MAX) ? ULONG_MAX : used; +} + +/* + * Compute and set NFS server blocksize + */ +static inline +unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) +{ + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; + + return nfs_block_bits(bsize, nrbitsp); +} + +/* + * Determine the maximum file size for a superblock + */ +static inline +void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) +{ + sb->s_maxbytes = (loff_t)maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) + sb->s_maxbytes = MAX_LFS_FILESIZE; +} + +/* + * Determine the number of bytes of data the page contains + */ +static inline +unsigned int nfs_page_length(struct page *page) +{ + loff_t i_size = i_size_read(page->mapping->host); + + if (i_size > 0) { + pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (page->index < end_index) + return PAGE_CACHE_SIZE; + if (page->index == end_index) + return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; + } + return 0; +} + +/* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* + * Determine the number of pages in an array of length 'len' and + * with a base offset of 'base' + */ +static inline +unsigned int nfs_page_array_len(unsigned int base, size_t len) +{ + return ((unsigned long)len + (unsigned long)base + + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h new file mode 100644 index 00000000000..c5832487c45 --- /dev/null +++ b/fs/nfs/iostat.h @@ -0,0 +1,71 @@ +/* + * linux/fs/nfs/iostat.h + * + * Declarations for NFS client per-mount statistics + * + * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> + * + */ + +#ifndef _NFS_IOSTAT +#define _NFS_IOSTAT + +#include <linux/percpu.h> +#include <linux/cache.h> +#include <linux/nfs_iostat.h> + +struct nfs_iostats { + unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif + unsigned long events[__NFSIOS_COUNTSMAX]; +} ____cacheline_aligned; + +static inline void nfs_inc_server_stats(const struct nfs_server *server, + enum nfs_stat_eventcounters stat) +{ + this_cpu_inc(server->io_stats->events[stat]); +} + +static inline void nfs_inc_stats(const struct inode *inode, + enum nfs_stat_eventcounters stat) +{ + nfs_inc_server_stats(NFS_SERVER(inode), stat); +} + +static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, + long addend) +{ + this_cpu_add(server->io_stats->bytes[stat], addend); +} + +static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, + long addend) +{ + nfs_add_server_stats(NFS_SERVER(inode), stat, addend); +} + +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + long addend) +{ + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); +} +#endif + +static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) +{ + return alloc_percpu(struct nfs_iostats); +} + +static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) +{ + if (stats != NULL) + free_percpu(stats); +} + +#endif /* _NFS_IOSTAT */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c new file mode 100644 index 00000000000..d4c2d6b7507 --- /dev/null +++ b/fs/nfs/mount_clnt.c @@ -0,0 +1,518 @@ +/* + * In-kernel MOUNT protocol client + * + * Copyright (C) 1997, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/uio.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs_fs.h> +#include "internal.h" + +#ifdef RPC_DEBUG +# define NFSDBG_FACILITY NFSDBG_MOUNT +#endif + +/* + * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4 + */ +#define MNTPATHLEN (1024) + +/* + * XDR data type sizes + */ +#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) +#define MNT_status_sz (1) +#define MNT_fhs_status_sz (1) +#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE)) +#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) + +/* + * XDR argument and result sizes + */ +#define MNT_enc_dirpath_sz encode_dirpath_sz +#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ + MNT_authflav3_sz) + +/* + * Defined by RFC 1094, section A.5 + */ +enum { + MOUNTPROC_NULL = 0, + MOUNTPROC_MNT = 1, + MOUNTPROC_DUMP = 2, + MOUNTPROC_UMNT = 3, + MOUNTPROC_UMNTALL = 4, + MOUNTPROC_EXPORT = 5, +}; + +/* + * Defined by RFC 1813, section 5.2 + */ +enum { + MOUNTPROC3_NULL = 0, + MOUNTPROC3_MNT = 1, + MOUNTPROC3_DUMP = 2, + MOUNTPROC3_UMNT = 3, + MOUNTPROC3_UMNTALL = 4, + MOUNTPROC3_EXPORT = 5, +}; + +static struct rpc_program mnt_program; + +/* + * Defined by OpenGroup XNFS Version 3W, chapter 8 + */ +enum mountstat { + MNT_OK = 0, + MNT_EPERM = 1, + MNT_ENOENT = 2, + MNT_EACCES = 13, + MNT_EINVAL = 22, +}; + +static struct { + u32 status; + int errno; +} mnt_errtbl[] = { + { .status = MNT_OK, .errno = 0, }, + { .status = MNT_EPERM, .errno = -EPERM, }, + { .status = MNT_ENOENT, .errno = -ENOENT, }, + { .status = MNT_EACCES, .errno = -EACCES, }, + { .status = MNT_EINVAL, .errno = -EINVAL, }, +}; + +/* + * Defined by RFC 1813, section 5.1.5 + */ +enum mountstat3 { + MNT3_OK = 0, /* no error */ + MNT3ERR_PERM = 1, /* Not owner */ + MNT3ERR_NOENT = 2, /* No such file or directory */ + MNT3ERR_IO = 5, /* I/O error */ + MNT3ERR_ACCES = 13, /* Permission denied */ + MNT3ERR_NOTDIR = 20, /* Not a directory */ + MNT3ERR_INVAL = 22, /* Invalid argument */ + MNT3ERR_NAMETOOLONG = 63, /* Filename too long */ + MNT3ERR_NOTSUPP = 10004, /* Operation not supported */ + MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */ +}; + +static struct { + u32 status; + int errno; +} mnt3_errtbl[] = { + { .status = MNT3_OK, .errno = 0, }, + { .status = MNT3ERR_PERM, .errno = -EPERM, }, + { .status = MNT3ERR_NOENT, .errno = -ENOENT, }, + { .status = MNT3ERR_IO, .errno = -EIO, }, + { .status = MNT3ERR_ACCES, .errno = -EACCES, }, + { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, }, + { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, + { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, + { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, + { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, +}; + +struct mountres { + int errno; + struct nfs_fh *fh; + unsigned int *auth_count; + rpc_authflavor_t *auth_flavors; +}; + +struct mnt_fhstatus { + u32 status; + struct nfs_fh *fh; +}; + +/** + * nfs_mount - Obtain an NFS file handle for the given host and path + * @info: pointer to mount request arguments + * + * Uses default timeout parameters specified by underlying transport. + */ +int nfs_mount(struct nfs_mount_request *info) +{ + struct mountres result = { + .fh = info->fh, + .auth_count = info->auth_flav_len, + .auth_flavors = info->auth_flavs, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = info->protocol, + .address = info->sap, + .addrsize = info->salen, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + }; + struct rpc_clnt *mnt_clnt; + int status; + + dprintk("NFS: sending MNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), + info->dirpath); + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + mnt_clnt = rpc_create(&args); + if (IS_ERR(mnt_clnt)) + goto out_clnt_err; + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; + else + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; + + status = rpc_call_sync(mnt_clnt, &msg, 0); + rpc_shutdown_client(mnt_clnt); + + if (status < 0) + goto out_call_err; + if (result.errno != 0) + goto out_mnt_err; + + dprintk("NFS: MNT request succeeded\n"); + status = 0; + +out: + return status; + +out_clnt_err: + status = PTR_ERR(mnt_clnt); + dprintk("NFS: failed to create MNT RPC client, status=%d\n", status); + goto out; + +out_call_err: + dprintk("NFS: MNT request failed, status=%d\n", status); + goto out; + +out_mnt_err: + dprintk("NFS: MNT server returned result %d\n", result.errno); + status = result.errno; + goto out; +} + +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + +/* + * XDR encode/decode functions for MOUNT + */ + +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +{ + const u32 pathname_len = strlen(pathname); + __be32 *p; + + BUG_ON(pathname_len > MNTPATHLEN); + p = xdr_reserve_space(xdr, 4 + pathname_len); + xdr_encode_opaque(p, pathname, pathname_len); +} + +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) +{ + encode_mntdirpath(xdr, dirpath); +} + +/* + * RFC 1094: "A non-zero status indicates some sort of error. In this + * case, the status is a UNIX error number." This can be problematic + * if the server and client use different errno values for the same + * error. + * + * However, the OpenGroup XNFS spec provides a simple mapping that is + * independent of local errno values on the server and the client. + */ +static int decode_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { + if (mnt_errtbl[i].status == status) { + res->errno = mnt_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +} + +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + return decode_fhandle(xdr, res); +} + +static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { + if (mnt3_errtbl[i].status == status) { + res->errno = mnt3_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT3 status code: %u\n", status); + res->errno = -EACCES; + return 0; +} + +static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) +{ + struct nfs_fh *fh = res->fh; + u32 size; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + size = be32_to_cpup(p); + if (size > NFS3_FHSIZE || size == 0) + return -EIO; + + p = xdr_inline_decode(xdr, size); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = size; + memcpy(fh->data, p, size); + return 0; +} + +static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) +{ + rpc_authflavor_t *flavors = res->auth_flavors; + unsigned int *count = res->auth_count; + u32 entries, i; + __be32 *p; + + if (*count == 0) + return 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + entries = be32_to_cpup(p); + dprintk("NFS: received %u auth flavors\n", entries); + if (entries > NFS_MAX_SECFLAVORS) + entries = NFS_MAX_SECFLAVORS; + + p = xdr_inline_decode(xdr, 4 * entries); + if (unlikely(p == NULL)) + return -EIO; + + if (entries > *count) + entries = *count; + + for (i = 0; i < entries; i++) { + flavors[i] = be32_to_cpup(p++); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); + } + *count = i; + + return 0; +} + +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_fhs_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + status = decode_fhandle3(xdr, res); + if (unlikely(status != 0)) { + res->errno = -EBADHANDLE; + return 0; + } + return decode_auth_flavors(xdr, res); +} + +static struct rpc_procinfo mnt_procedures[] = { + [MOUNTPROC_MNT] = { + .p_proc = MOUNTPROC_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres_sz, + .p_statidx = MOUNTPROC_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, +}; + +static struct rpc_procinfo mnt3_procedures[] = { + [MOUNTPROC3_MNT] = { + .p_proc = MOUNTPROC3_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres3_sz, + .p_statidx = MOUNTPROC3_MNT, + .p_name = "MOUNT", + }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, +}; + + +static struct rpc_version mnt_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(mnt_procedures), + .procs = mnt_procedures, +}; + +static struct rpc_version mnt_version3 = { + .number = 3, + .nrprocs = ARRAY_SIZE(mnt3_procedures), + .procs = mnt3_procedures, +}; + +static struct rpc_version *mnt_version[] = { + NULL, + &mnt_version1, + NULL, + &mnt_version3, +}; + +static struct rpc_stat mnt_stats; + +static struct rpc_program mnt_program = { + .name = "mount", + .number = NFS_MNT_PROGRAM, + .nrvers = ARRAY_SIZE(mnt_version), + .version = mnt_version, + .stats = &mnt_stats, +}; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c new file mode 100644 index 00000000000..8102391bb37 --- /dev/null +++ b/fs/nfs/namespace.c @@ -0,0 +1,371 @@ +/* + * linux/fs/nfs/namespace.c + * + * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> + * - Modified by David Howells <dhowells@redhat.com> + * + * NFS namespace + */ + +#include <linux/dcache.h> +#include <linux/gfp.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/nfs_fs.h> +#include <linux/string.h> +#include <linux/sunrpc/clnt.h> +#include <linux/vfs.h> +#include <linux/sunrpc/gss_api.h> +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static void nfs_expire_automounts(struct work_struct *work); + +static LIST_HEAD(nfs_automount_list); +static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); +int nfs_mountpoint_expiry_timeout = 500 * HZ; + +static struct vfsmount *nfs_do_submount(struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor); + +/* + * nfs_path - reconstruct the path given an arbitrary dentry + * @base - used to return pointer to the end of devname part of path + * @dentry - pointer to dentry + * @buffer - result buffer + * @buflen - length of buffer + * + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. + * + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. + */ +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *end; + int namelen; + unsigned seq; + const char *base; + +rename_retry: + end = buffer+buflen; + *--end = '\0'; + buflen--; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + while (1) { + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) + break; + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong_unlock; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + spin_unlock(&dentry->d_lock); + dentry = dentry->d_parent; + } + if (read_seqretry(&rename_lock, seq)) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + if (*end != '/') { + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + *--end = '/'; + } + *p = end; + base = dentry->d_fsdata; + if (!base) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + WARN_ON(1); + return end; + } + namelen = strlen(base); + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + buflen -= namelen; + if (buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + end -= namelen; + memcpy(end, base, namelen); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + return end; +Elong_unlock: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +#ifdef CONFIG_NFS_V4 +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + +static int nfs_negotiate_security(const struct dentry *parent, + const struct dentry *dentry, + rpc_authflavor_t *flavor) +{ + struct page *page; + struct nfs4_secinfo_flavors *flavors; + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); + int ret = -EPERM; + + secinfo = NFS_PROTO(parent->d_inode)->secinfo; + if (secinfo != NULL) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + flavors = page_address(page); + ret = secinfo(parent->d_inode, &dentry->d_name, flavors); + *flavor = nfs_find_best_sec(flavors); + put_page(page); + } + +out: + return ret; +} + +static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, + struct dentry *dentry, struct path *path, + struct nfs_fh *fh, struct nfs_fattr *fattr, + rpc_authflavor_t *flavor) +{ + struct rpc_clnt *clone; + struct rpc_auth *auth; + int err; + + err = nfs_negotiate_security(parent, path->dentry, flavor); + if (err < 0) + goto out; + clone = rpc_clone_client(server->client); + auth = rpcauth_create(*flavor, clone); + if (!auth) { + err = -EIO; + goto out_shutdown; + } + err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, + &path->dentry->d_name, + fh, fattr); +out_shutdown: + rpc_shutdown_client(clone); +out: + return err; +} +#else /* CONFIG_NFS_V4 */ +static inline int nfs_lookup_with_sec(struct nfs_server *server, + struct dentry *parent, struct dentry *dentry, + struct path *path, struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t *flavor) +{ + return -EPERM; +} +#endif /* CONFIG_NFS_V4 */ + +/* + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint + * + * When we encounter a mountpoint on the server, we want to set up + * a mountpoint on the client too, to prevent inode numbers from + * colliding, and to allow "df" to work properly. + * On NFSv4, we also want to allow for the fact that different + * filesystems may be migrated to different servers in a failover + * situation, and that different filesystems may want to use + * different security flavours. + */ +struct vfsmount *nfs_d_automount(struct path *path) +{ + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct dentry *parent; + struct nfs_fh *fh = NULL; + struct nfs_fattr *fattr = NULL; + int err; + rpc_authflavor_t flavor = RPC_AUTH_UNIX; + + dprintk("--> nfs_d_automount()\n"); + + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; + + mnt = ERR_PTR(-ENOMEM); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fh == NULL || fattr == NULL) + goto out; + + dprintk("%s: enter\n", __func__); + + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); + err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, + &path->dentry->d_name, + fh, fattr); + if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL) + err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor); + dput(parent); + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } + + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(path->dentry); + else + mnt = nfs_do_submount(path->dentry, fh, fattr, flavor); + if (IS_ERR(mnt)) + goto out; + + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out_nofree: + dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + return mnt; +} + +const struct inode_operations nfs_mountpoint_inode_operations = { + .getattr = nfs_getattr, +}; + +const struct inode_operations nfs_referral_inode_operations = { +}; + +static void nfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &nfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +} + +void nfs_release_automount_timer(void) +{ + if (list_empty(&nfs_automount_list)) + cancel_delayed_work(&nfs_automount_task); +} + +/* + * Clone a mountpoint of the appropriate type + */ +static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, + const char *devname, + struct nfs_clone_mount *mountdata) +{ +#ifdef CONFIG_NFS_V4 + struct vfsmount *mnt = ERR_PTR(-EINVAL); + switch (server->nfs_client->rpc_ops->version) { + case 2: + case 3: + mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); + break; + case 4: + mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); + } + return mnt; +#else + return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); +#endif +} + +/** + * nfs_do_submount - set up mountpoint when crossing a filesystem boundary + * @dentry - parent directory + * @fh - filehandle for new root dentry + * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount + * + */ +static struct vfsmount *nfs_do_submount(struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor) +{ + struct nfs_clone_mount mountdata = { + .sb = dentry->d_sb, + .dentry = dentry, + .fh = fh, + .fattr = fattr, + .authflavor = authflavor, + }; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + char *page = (char *) __get_free_page(GFP_USER); + char *devname; + + dprintk("--> nfs_do_submount()\n"); + + dprintk("%s: submounting on %s/%s\n", __func__, + dentry->d_parent->d_name.name, + dentry->d_name.name); + if (page == NULL) + goto out; + devname = nfs_devname(dentry, page, PAGE_SIZE); + mnt = (struct vfsmount *)devname; + if (IS_ERR(devname)) + goto free_page; + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); +free_page: + free_page((unsigned long)page); +out: + dprintk("%s: done\n", __func__); + + dprintk("<-- nfs_do_submount() = %p\n", mnt); + return mnt; +} diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c new file mode 100644 index 00000000000..792cb13a430 --- /dev/null +++ b/fs/nfs/nfs2xdr.c @@ -0,0 +1,1157 @@ +/* + * linux/fs/nfs/nfs2xdr.c + * + * XDR functions to encode/decode NFS RPC arguments and results. + * + * Copyright (C) 1992, 1993, 1994 Rick Sladkey + * Copyright (C) 1996 Olaf Kirch + * 04 Aug 1998 Ion Badulescu <ionut@cs.columbia.edu> + * FIFO's need special handling in NFSv2 + */ + +#include <linux/param.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/pagemap.h> +#include <linux/proc_fs.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfs.h> +#include <linux/nfs2.h> +#include <linux/nfs_fs.h> +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_XDR + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +/* + * Declare the space requirements for NFS arguments and replies as + * number of 32bit-words + */ +#define NFS_fhandle_sz (8) +#define NFS_sattr_sz (8) +#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) +#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2)) +#define NFS_fattr_sz (17) +#define NFS_info_sz (5) +#define NFS_entry_sz (NFS_filename_sz+3) + +#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_removeargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz) +#define NFS_readlinkargs_sz (NFS_fhandle_sz) +#define NFS_readargs_sz (NFS_fhandle_sz+3) +#define NFS_writeargs_sz (NFS_fhandle_sz+4) +#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) +#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) +#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) +#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz) +#define NFS_readdirargs_sz (NFS_fhandle_sz+2) + +#define NFS_attrstat_sz (1+NFS_fattr_sz) +#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) +#define NFS_readlinkres_sz (2) +#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_writeres_sz (NFS_attrstat_sz) +#define NFS_stat_sz (1) +#define NFS_readdirres_sz (1) +#define NFS_statfsres_sz (1+NFS_info_sz) + + +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) +{ + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} + + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +{ + u32 recvd, count; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(count > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, count); + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) +{ + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; + return p; +} + +/* + * 2.3.3. fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) +{ + __be32 *p; + + BUG_ON(fh->size != NFS2_FHSIZE); + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); +} + +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; + */ +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); + return p; +} + +/* + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. + */ +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) +{ + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); + return p; +} + +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; +} + +/* + * 2.3.5. fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * + */ +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + u32 rdev, type; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = be32_to_cpup(p++); + fattr->gid = be32_to_cpup(p++); + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); + fattr->rdev = new_decode_dev(rdev); + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { + fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; + fattr->rdev = 0; + } + + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.6. sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; + */ + +#define NFS2_SATTR_NOT_SET (0xffffffff) + +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + return p; +} + +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); + + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(attr->ia_uid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(attr->ia_gid); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) + p = xdr_encode_time(p, &attr->ia_atime); + else if (attr->ia_valid & ATTR_ATIME) + p = xdr_encode_current_server_time(p, &attr->ia_atime); + else + p = xdr_time_not_set(p); + if (attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); +} + +/* + * 2.3.7. filename + * + * typedef string filename<MAXNAMLEN>; + */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); +} + +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.8. path + * + * typedef string path<MAXPATHLEN>; + */ +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) +{ + __be32 *p; + + BUG_ON(length > NFS2_MAXPATHLEN); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + size_t hdrlen; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) + goto out_size; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(length > recvd)) + goto out_cheating; + + xdr_read_pages(xdr, length); + xdr_terminate_string(xdr->buf, length); + return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * 2.3.9. attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; + */ +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; + */ +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) +{ + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); +} + +/* + * 2.3.11. diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; + */ +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_fhandle(xdr, fh); +} + +/* + * 2.2.3. sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} + +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); +} + +/* + * 2.2.7. readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; + */ +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); +} + +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readargs *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; +} + +/* + * 2.2.9. writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; + */ +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); +} + +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_writeargs *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; +} + +/* + * 2.2.10. createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); +} + +/* + * 2.2.12. renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) +{ + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; + + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); +} + +/* + * 2.2.13. linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; + */ +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) +{ + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); +} + +/* + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) +{ + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); +} + +/* + * 2.2.17. readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; + */ +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + __be32 *p; + + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} + +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); +} + +/* + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result); +} + +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); +} + +/* + * 2.2.6. readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.7. readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_readres *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_writeres *result) +{ + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr); +} + +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->ino = be32_to_cpup(p); + + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. + */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->cookie = be32_to_cpup(p); + + entry->d_type = DT_UNKNOWN; + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +} + +/* + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. + */ +static int decode_readdirok(struct xdr_stream *xdr) +{ + u32 recvd, pglen; + size_t hdrlen; + + pglen = xdr->buf->page_len; + hdrlen = (u8 *)xdr->p - (u8 *)xdr->iov->iov_base; + recvd = xdr->buf->len - hdrlen; + if (unlikely(pglen > recvd)) + goto out_cheating; +out: + xdr_read_pages(xdr, pglen); + return pglen; +out_cheating: + dprintk("NFS: server cheating in readdir result: " + "pglen %u > recvd %u\n", pglen, recvd); + pglen = recvd; + goto out; +} + +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +/* + * 2.2.18. statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; + */ +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. + */ +int nfs_stat_to_errno(enum nfs_stat status) +{ + int i; + + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; +} + +#define PROC(proc, argtype, restype, timer) \ +[NFSPROC_##proc] = { \ + .p_proc = NFSPROC_##proc, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ + .p_arglen = NFS_##argtype##_sz, \ + .p_replen = NFS_##restype##_sz, \ + .p_timer = timer, \ + .p_statidx = NFSPROC_##proc, \ + .p_name = #proc, \ + } +struct rpc_procinfo nfs_procedures[] = { + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), +}; + +struct rpc_version nfs_version2 = { + .number = 2, + .nrprocs = ARRAY_SIZE(nfs_procedures), + .procs = nfs_procedures +}; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c new file mode 100644 index 00000000000..7ef23979896 --- /dev/null +++ b/fs/nfs/nfs3acl.c @@ -0,0 +1,440 @@ +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/nfs.h> +#include <linux/nfs3.h> +#include <linux/nfs_fs.h> +#include <linux/posix_acl_xattr.h> +#include <linux/nfsacl.h> + +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int pos=0, len=0; + +# define output(s) do { \ + if (pos + sizeof(s) <= size) { \ + memcpy(buffer + pos, s, sizeof(s)); \ + pos += sizeof(s); \ + } \ + len += sizeof(s); \ + } while(0) + + acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_access"); + posix_acl_release(acl); + } + + if (S_ISDIR(inode->i_mode)) { + acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_default"); + posix_acl_release(acl); + } + } + +# undef output + + if (!buffer || len <= size) + return len; + return -ERANGE; +} + +ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error = 0; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + acl = nfs3_proc_getacl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + if (type == ACL_TYPE_ACCESS && acl->a_count == 0) + error = -ENODATA; + else + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + } else + error = -ENODATA; + + return error; +} + +int nfs3_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + error = nfs3_proc_setacl(inode, type, acl); + posix_acl_release(acl); + + return error; +} + +int nfs3_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int type; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + return nfs3_proc_setacl(inode, type, NULL); +} + +static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) +{ + if (!IS_ERR(nfsi->acl_access)) { + posix_acl_release(nfsi->acl_access); + nfsi->acl_access = ERR_PTR(-EAGAIN); + } + if (!IS_ERR(nfsi->acl_default)) { + posix_acl_release(nfsi->acl_default); + nfsi->acl_default = ERR_PTR(-EAGAIN); + } +} + +void nfs3_forget_cached_acls(struct inode *inode) +{ + dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, + inode->i_ino); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct posix_acl *acl = ERR_PTR(-EINVAL); + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = nfsi->acl_access; + break; + + case ACL_TYPE_DEFAULT: + acl = nfsi->acl_default; + break; + + default: + goto out; + } + if (IS_ERR(acl)) + acl = ERR_PTR(-EAGAIN); + else + acl = posix_acl_dup(acl); +out: + spin_unlock(&inode->i_lock); + dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, + inode->i_ino, type, acl); + return acl; +} + +static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, + inode->i_ino, acl, dfacl); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + if (!IS_ERR(acl)) + nfsi->acl_access = posix_acl_dup(acl); + if (!IS_ERR(dfacl)) + nfsi->acl_default = posix_acl_dup(dfacl); + spin_unlock(&inode->i_lock); +} + +struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), + /* The xdr layer may allocate pages here. */ + .pages = pages, + }; + struct nfs3_getaclres res = { + 0 + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct posix_acl *acl; + int status, count; + + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + return ERR_PTR(-EOPNOTSUPP); + + status = nfs_revalidate_inode(server, inode); + if (status < 0) + return ERR_PTR(status); + acl = nfs3_get_cached_acl(inode, type); + if (acl != ERR_PTR(-EAGAIN)) + return acl; + acl = NULL; + + /* + * Only get the access acl when explicitly requested: We don't + * need it for access decisions, and only some applications use + * it. Applications which request the access acl first are not + * penalized from this optimization. + */ + if (type == ACL_TYPE_ACCESS) + args.mask |= NFS_ACLCNT|NFS_ACL; + if (S_ISDIR(inode->i_mode)) + args.mask |= NFS_DFACLCNT|NFS_DFACL; + if (args.mask == 0) + return NULL; + + dprintk("NFS call getacl\n"); + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return ERR_PTR(-ENOMEM); + + status = rpc_call_sync(server->client_acl, &msg, 0); + dprintk("NFS reply getacl: %d\n", status); + + /* pages may have been allocated at the xdr layer. */ + for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) + __free_page(args.pages[count]); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, res.fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL extension not supported; disabling\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + default: + goto getout; + } + if ((args.mask & res.mask) != args.mask) { + status = -EIO; + goto getout; + } + + if (res.acl_access != NULL) { + if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + posix_acl_release(res.acl_access); + res.acl_access = NULL; + } + } + nfs3_cache_acls(inode, + (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), + (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); + + switch(type) { + case ACL_TYPE_ACCESS: + acl = res.acl_access; + res.acl_access = NULL; + break; + + case ACL_TYPE_DEFAULT: + acl = res.acl_default; + res.acl_default = NULL; + } + +getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); + nfs_free_fattr(res.fattr); + + if (status != 0) { + posix_acl_release(acl); + acl = ERR_PTR(status); + } + return acl; +} + +static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; + struct nfs3_setaclargs args = { + .inode = inode, + .mask = NFS_ACL, + .acl_access = acl, + .pages = pages, + }; + struct rpc_message msg = { + .rpc_argp = &args, + .rpc_resp = &fattr, + }; + int status; + + status = -EOPNOTSUPP; + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + goto out; + + /* We are doing this here because XDR marshalling does not + * return any results, it BUGs. */ + status = -ENOSPC; + if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (S_ISDIR(inode->i_mode)) { + args.mask |= NFS_DFACL; + args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); + } + + dprintk("NFS call setacl\n"); + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out_freepages; + + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + msg.rpc_resp = fattr; + status = rpc_call_sync(server->client_acl, &msg, 0); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); + dprintk("NFS reply setacl: %d\n", status); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, fattr); + nfs3_cache_acls(inode, acl, dfacl); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL SETACL RPC not supported" + "(will not retry)\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + } + nfs_free_fattr(fattr); +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } +out: + return status; +} + +int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *alloc = NULL, *dfacl = NULL; + int status; + + if (S_ISDIR(inode->i_mode)) { + switch(type) { + case ACL_TYPE_ACCESS: + alloc = dfacl = nfs3_proc_getacl(inode, + ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = nfs3_proc_getacl(inode, + ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; + + default: + return -EINVAL; + } + } else if (type != ACL_TYPE_ACCESS) + return -EINVAL; + + if (acl == NULL) { + alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(alloc)) + goto fail; + } + status = nfs3_proc_setacls(inode, acl, dfacl); + posix_acl_release(alloc); + return status; + +fail: + return PTR_ERR(alloc); +} + +int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, + umode_t mode) +{ + struct posix_acl *dfacl, *acl; + int error = 0; + + dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(dfacl)) { + error = PTR_ERR(dfacl); + return (error == -EOPNOTSUPP) ? 0 : error; + } + if (!dfacl) + return 0; + acl = posix_acl_dup(dfacl); + error = posix_acl_create(&acl, GFP_KERNEL, &mode); + if (error < 0) + goto out_release_dfacl; + error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? + dfacl : NULL); + posix_acl_release(acl); +out_release_dfacl: + posix_acl_release(dfacl); + return error; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c new file mode 100644 index 00000000000..91943953a37 --- /dev/null +++ b/fs/nfs/nfs3proc.c @@ -0,0 +1,891 @@ +/* + * linux/fs/nfs/nfs3proc.c + * + * Client-side NFSv3 procedures stubs. + * + * Copyright (C) 1997, Olaf Kirch + */ + +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/sunrpc/clnt.h> +#include <linux/slab.h> +#include <linux/nfs.h> +#include <linux/nfs3.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/lockd/bind.h> +#include <linux/nfs_mount.h> +#include <linux/freezer.h> + +#include "iostat.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PROC + +/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ +static int +nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EJUKEBOX && res != -EKEYEXPIRED) + break; + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs3_rpc_wrapper(clnt, msg, flags) + +static int +nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) +{ + if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) + return 0; + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +static int +do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_FSINFO], + .rpc_argp = fhandle, + .rpc_resp = info, + }; + int status; + + dprintk("%s: call fsinfo\n", __func__); + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply fsinfo: %d\n", __func__, status); + if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_resp = info->fattr; + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); + } + return status; +} + +/* + * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb + */ +static int +nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_get_root(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + +/* + * One function for each procedure in the NFS protocol. + */ +static int +nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], + .rpc_argp = fhandle, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call getattr\n"); + nfs_fattr_init(fattr); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply getattr: %d\n", status); + return status; +} + +static int +nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, + struct iattr *sattr) +{ + struct inode *inode = dentry->d_inode; + struct nfs3_sattrargs arg = { + .fh = NFS_FH(inode), + .sattr = sattr, + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_SETATTR], + .rpc_argp = &arg, + .rpc_resp = fattr, + }; + int status; + + dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + dprintk("NFS reply setattr: %d\n", status); + return status; +} + +static int +nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + struct nfs3_diropargs arg = { + .fh = NFS_FH(dir), + .name = name->name, + .len = name->len + }; + struct nfs3_diropres res = { + .fh = fhandle, + .fattr = fattr + }; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_LOOKUP], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + + dprintk("NFS call lookup %s\n", name->name); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + return -ENOMEM; + + nfs_fattr_init(fattr); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_refresh_inode(dir, res.dir_attr); + if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_argp = fhandle; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + } + nfs_free_fattr(res.dir_attr); + dprintk("NFS reply lookup: %d\n", status); + return status; +} + +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +{ + struct nfs3_accessargs arg = { + .fh = NFS_FH(inode), + }; + struct nfs3_accessres res; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = entry->cred, + }; + int mode = entry->mask; + int status = -ENOMEM; + + dprintk("NFS call access\n"); + + if (mode & MAY_READ) + arg.access |= NFS3_ACCESS_READ; + if (S_ISDIR(inode->i_mode)) { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_LOOKUP; + } else { + if (mode & MAY_WRITE) + arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; + if (mode & MAY_EXEC) + arg.access |= NFS3_ACCESS_EXECUTE; + } + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, res.fattr); + if (status == 0) { + entry->mask = 0; + if (res.access & NFS3_ACCESS_READ) + entry->mask |= MAY_READ; + if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; + } + nfs_free_fattr(res.fattr); +out: + dprintk("NFS reply access: %d\n", status); + return status; +} |