summaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
authorAnton Arapov <anton@redhat.com>2012-04-16 10:05:28 +0200
committerAnton Arapov <anton@redhat.com>2012-04-16 10:05:28 +0200
commitb4b6116a13633898cf868f2f103c96a90c4c20f8 (patch)
tree93d1b7e2cfcdf473d8d4ff3ad141fa864f8491f6 /Documentation/filesystems
parentedd4be777c953e5faafc80d091d3084b4343f5d3 (diff)
downloadkernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.gz
kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.xz
kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.zip
fedora kernel: d9aad82f3319f3cfd1aebc01234254ef0c37ad84v3.3.2-1
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/00-INDEX120
-rw-r--r--Documentation/filesystems/9p.txt166
-rw-r--r--Documentation/filesystems/Locking533
-rw-r--r--Documentation/filesystems/Makefile8
-rw-r--r--Documentation/filesystems/adfs.txt75
-rw-r--r--Documentation/filesystems/affs.txt219
-rw-r--r--Documentation/filesystems/afs.txt247
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt393
-rw-r--r--Documentation/filesystems/automount-support.txt118
-rw-r--r--Documentation/filesystems/befs.txt117
-rw-r--r--Documentation/filesystems/bfs.txt57
-rw-r--r--Documentation/filesystems/btrfs.txt91
-rw-r--r--Documentation/filesystems/caching/backend-api.txt658
-rw-r--r--Documentation/filesystems/caching/cachefiles.txt501
-rw-r--r--Documentation/filesystems/caching/fscache.txt443
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt813
-rw-r--r--Documentation/filesystems/caching/object.txt313
-rw-r--r--Documentation/filesystems/caching/operations.txt213
-rw-r--r--Documentation/filesystems/ceph.txt148
-rw-r--r--Documentation/filesystems/cifs.txt51
-rw-r--r--Documentation/filesystems/coda.txt1673
-rw-r--r--Documentation/filesystems/configfs/Makefile3
-rw-r--r--Documentation/filesystems/configfs/configfs.txt484
-rw-r--r--Documentation/filesystems/configfs/configfs_example_explicit.c483
-rw-r--r--Documentation/filesystems/configfs/configfs_example_macros.c446
-rw-r--r--Documentation/filesystems/cramfs.txt76
-rw-r--r--Documentation/filesystems/debugfs.txt188
-rw-r--r--Documentation/filesystems/devpts.txt132
-rw-r--r--Documentation/filesystems/directory-locking114
-rw-r--r--Documentation/filesystems/dlmfs.txt130
-rw-r--r--Documentation/filesystems/dnotify.txt70
-rw-r--r--Documentation/filesystems/dnotify_test.c34
-rw-r--r--Documentation/filesystems/ecryptfs.txt77
-rw-r--r--Documentation/filesystems/exofs.txt185
-rw-r--r--Documentation/filesystems/ext2.txt383
-rw-r--r--Documentation/filesystems/ext3.txt214
-rw-r--r--Documentation/filesystems/ext4.txt604
-rw-r--r--Documentation/filesystems/fiemap.txt228
-rw-r--r--Documentation/filesystems/files.txt123
-rw-r--r--Documentation/filesystems/fuse.txt423
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt114
-rw-r--r--Documentation/filesystems/gfs2-uevents.txt100
-rw-r--r--Documentation/filesystems/gfs2.txt46
-rw-r--r--Documentation/filesystems/hfs.txt82
-rw-r--r--Documentation/filesystems/hfsplus.txt59
-rw-r--r--Documentation/filesystems/hpfs.txt296
-rw-r--r--Documentation/filesystems/inotify.txt270
-rw-r--r--Documentation/filesystems/isofs.txt48
-rw-r--r--Documentation/filesystems/jfs.txt41
-rw-r--r--Documentation/filesystems/locks.txt68
-rw-r--r--Documentation/filesystems/logfs.txt241
-rw-r--r--Documentation/filesystems/mandatory-locking.txt171
-rw-r--r--Documentation/filesystems/ncpfs.txt12
-rw-r--r--Documentation/filesystems/nfs/00-INDEX22
-rw-r--r--Documentation/filesystems/nfs/Exporting154
-rw-r--r--Documentation/filesystems/nfs/fault_injection.txt69
-rw-r--r--Documentation/filesystems/nfs/idmapper.txt75
-rw-r--r--Documentation/filesystems/nfs/knfsd-stats.txt159
-rw-r--r--Documentation/filesystems/nfs/nfs-rdma.txt271
-rw-r--r--Documentation/filesystems/nfs/nfs.txt98
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt208
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt294
-rw-r--r--Documentation/filesystems/nfs/pnfs.txt55
-rw-r--r--Documentation/filesystems/nfs/rpc-cache.txt202
-rw-r--r--Documentation/filesystems/nilfs2.txt208
-rw-r--r--Documentation/filesystems/ntfs.txt721
-rw-r--r--Documentation/filesystems/ocfs2.txt102
-rw-r--r--Documentation/filesystems/omfs.txt106
-rw-r--r--Documentation/filesystems/path-lookup.txt382
-rw-r--r--Documentation/filesystems/pohmelfs/design_notes.txt72
-rw-r--r--Documentation/filesystems/pohmelfs/info.txt99
-rw-r--r--Documentation/filesystems/pohmelfs/network_protocol.txt227
-rw-r--r--Documentation/filesystems/porting431
-rw-r--r--Documentation/filesystems/proc.txt1586
-rw-r--r--Documentation/filesystems/quota.txt65
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt355
-rw-r--r--Documentation/filesystems/relay.txt494
-rw-r--r--Documentation/filesystems/romfs.txt186
-rw-r--r--Documentation/filesystems/seq_file.txt292
-rw-r--r--Documentation/filesystems/sharedsubtree.txt939
-rw-r--r--Documentation/filesystems/spufs.txt521
-rw-r--r--Documentation/filesystems/squashfs.txt259
-rw-r--r--Documentation/filesystems/sysfs-pci.txt120
-rw-r--r--Documentation/filesystems/sysfs-tagging.txt42
-rw-r--r--Documentation/filesystems/sysfs.txt380
-rw-r--r--Documentation/filesystems/sysv-fs.txt197
-rw-r--r--Documentation/filesystems/tmpfs.txt148
-rw-r--r--Documentation/filesystems/ubifs.txt119
-rw-r--r--Documentation/filesystems/udf.txt82
-rw-r--r--Documentation/filesystems/ufs.txt60
-rw-r--r--Documentation/filesystems/vfat.txt295
-rw-r--r--Documentation/filesystems/vfs.txt1116
-rw-r--r--Documentation/filesystems/xfs-delayed-logging-design.txt793
-rw-r--r--Documentation/filesystems/xfs.txt252
-rw-r--r--Documentation/filesystems/xip.txt68
95 files changed, 25226 insertions, 0 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
new file mode 100644
index 00000000000..8c624a18f67
--- /dev/null
+++ b/Documentation/filesystems/00-INDEX
@@ -0,0 +1,120 @@
+00-INDEX
+ - this file (info on some of the filesystems supported by linux).
+Locking
+ - info on locking rules as they pertain to Linux VFS.
+9p.txt
+ - 9p (v9fs) is an implementation of the Plan 9 remote fs protocol.
+adfs.txt
+ - info and mount options for the Acorn Advanced Disc Filing System.
+afs.txt
+ - info and examples for the distributed AFS (Andrew File System) fs.
+affs.txt
+ - info and mount options for the Amiga Fast File System.
+automount-support.txt
+ - information about filesystem automount support.
+befs.txt
+ - information about the BeOS filesystem for Linux.
+bfs.txt
+ - info for the SCO UnixWare Boot Filesystem (BFS).
+ceph.txt
+ - info for the Ceph Distributed File System
+cifs.txt
+ - description of the CIFS filesystem.
+coda.txt
+ - description of the CODA filesystem.
+configfs/
+ - directory containing configfs documentation and example code.
+cramfs.txt
+ - info on the cram filesystem for small storage (ROMs etc).
+dentry-locking.txt
+ - info on the RCU-based dcache locking model.
+directory-locking
+ - info about the locking scheme used for directory operations.
+dlmfs.txt
+ - info on the userspace interface to the OCFS2 DLM.
+dnotify.txt
+ - info about directory notification in Linux.
+dnotify_test.c
+ - example program for dnotify
+ecryptfs.txt
+ - docs on eCryptfs: stacked cryptographic filesystem for Linux.
+exofs.txt
+ - info, usage, mount options, design about EXOFS.
+ext2.txt
+ - info, mount options and specifications for the Ext2 filesystem.
+ext3.txt
+ - info, mount options and specifications for the Ext3 filesystem.
+ext4.txt
+ - info, mount options and specifications for the Ext4 filesystem.
+files.txt
+ - info on file management in the Linux kernel.
+fuse.txt
+ - info on the Filesystem in User SpacE including mount options.
+gfs2.txt
+ - info on the Global File System 2.
+hfs.txt
+ - info on the Macintosh HFS Filesystem for Linux.
+hfsplus.txt
+ - info on the Macintosh HFSPlus Filesystem for Linux.
+hpfs.txt
+ - info and mount options for the OS/2 HPFS.
+inotify.txt
+ - info on the powerful yet simple file change notification system.
+isofs.txt
+ - info and mount options for the ISO 9660 (CDROM) filesystem.
+jfs.txt
+ - info and mount options for the JFS filesystem.
+locks.txt
+ - info on file locking implementations, flock() vs. fcntl(), etc.
+logfs.txt
+ - info on the LogFS flash filesystem.
+mandatory-locking.txt
+ - info on the Linux implementation of Sys V mandatory file locking.
+ncpfs.txt
+ - info on Novell Netware(tm) filesystem using NCP protocol.
+nfs/
+ - nfs-related documentation.
+nilfs2.txt
+ - info and mount options for the NILFS2 filesystem.
+ntfs.txt
+ - info and mount options for the NTFS filesystem (Windows NT).
+ocfs2.txt
+ - info and mount options for the OCFS2 clustered filesystem.
+porting
+ - various information on filesystem porting.
+proc.txt
+ - info on Linux's /proc filesystem.
+ramfs-rootfs-initramfs.txt
+ - info on the 'in memory' filesystems ramfs, rootfs and initramfs.
+reiser4.txt
+ - info on the Reiser4 filesystem based on dancing tree algorithms.
+relay.txt
+ - info on relay, for efficient streaming from kernel to user space.
+romfs.txt
+ - description of the ROMFS filesystem.
+seq_file.txt
+ - how to use the seq_file API
+sharedsubtree.txt
+ - a description of shared subtrees for namespaces.
+spufs.txt
+ - info and mount options for the SPU filesystem used on Cell.
+sysfs-pci.txt
+ - info on accessing PCI device resources through sysfs.
+sysfs.txt
+ - info on sysfs, a ram-based filesystem for exporting kernel objects.
+sysv-fs.txt
+ - info on the SystemV/V7/Xenix/Coherent filesystem.
+tmpfs.txt
+ - info on tmpfs, a filesystem that holds all files in virtual memory.
+udf.txt
+ - info and mount options for the UDF filesystem.
+ufs.txt
+ - info on the ufs filesystem.
+vfat.txt
+ - info on using the VFAT filesystem used in Windows NT and Windows 95
+vfs.txt
+ - overview of the Virtual File System
+xfs.txt
+ - info and mount options for the XFS filesystem.
+xip.txt
+ - info on execute-in-place for file mappings.
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
new file mode 100644
index 00000000000..2c032144284
--- /dev/null
+++ b/Documentation/filesystems/9p.txt
@@ -0,0 +1,166 @@
+ v9fs: Plan 9 Resource Sharing for Linux
+ =======================================
+
+ABOUT
+=====
+
+v9fs is a Unix implementation of the Plan 9 9p remote filesystem protocol.
+
+This software was originally developed by Ron Minnich <rminnich@sandia.gov>
+and Maya Gokhale. Additional development by Greg Watson
+<gwatson@lanl.gov> and most recently Eric Van Hensbergen
+<ericvh@gmail.com>, Latchesar Ionkov <lucho@ionkov.net> and Russ Cox
+<rsc@swtch.com>.
+
+The best detailed explanation of the Linux implementation and applications of
+the 9p client is available in the form of a USENIX paper:
+ http://www.usenix.org/events/usenix05/tech/freenix/hensbergen.html
+
+Other applications are described in the following papers:
+ * XCPU & Clustering
+ http://xcpu.org/papers/xcpu-talk.pdf
+ * KVMFS: control file system for KVM
+ http://xcpu.org/papers/kvmfs.pdf
+ * CellFS: A New Programming Model for the Cell BE
+ http://xcpu.org/papers/cellfs-talk.pdf
+ * PROSE I/O: Using 9p to enable Application Partitions
+ http://plan9.escet.urjc.es/iwp9/cready/PROSE_iwp9_2006.pdf
+ * VirtFS: A Virtualization Aware File System pass-through
+ http://goo.gl/3WPDg
+
+USAGE
+=====
+
+For remote file server:
+
+ mount -t 9p 10.10.1.2 /mnt/9
+
+For Plan 9 From User Space applications (http://swtch.com/plan9)
+
+ mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
+
+For server running on QEMU host with virtio transport:
+
+ mount -t 9p -o trans=virtio <mount_tag> /mnt/9
+
+where mount_tag is the tag that the server associates with each of the exported
+mount points. Each 9P export is seen by the client as a virtio device with an
+associated "mount_tag" property. Available mount tags can be
+seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio<n>/mount_tag files.
+
+OPTIONS
+=======
+
+ trans=name select an alternative transport. Valid options are
+ currently:
+ unix - specifying a named pipe mount point
+ tcp - specifying a normal TCP/IP connection
+ fd - use passed file descriptors for connection
+ (see rfdno and wfdno)
+ virtio - connect to the next virtio channel available
+ (from QEMU with trans_virtio module)
+ rdma - connect to a specified RDMA channel
+
+ uname=name user name to attempt mount as on the remote server. The
+ server may override or ignore this value. Certain user
+ names may require authentication.
+
+ aname=name aname specifies the file tree to access when the server is
+ offering several exported file systems.
+
+ cache=mode specifies a caching policy. By default, no caches are used.
+ loose = no attempts are made at consistency,
+ intended for exclusive, read-only mounts
+ fscache = use FS-Cache for a persistent, read-only
+ cache backend.
+
+ debug=n specifies debug level. The debug level is a bitmask.
+ 0x01 = display verbose error messages
+ 0x02 = developer debug (DEBUG_CURRENT)
+ 0x04 = display 9p trace
+ 0x08 = display VFS trace
+ 0x10 = display Marshalling debug
+ 0x20 = display RPC debug
+ 0x40 = display transport debug
+ 0x80 = display allocation debug
+ 0x100 = display protocol message debug
+ 0x200 = display Fid debug
+ 0x400 = display packet debug
+ 0x800 = display fscache tracing debug
+
+ rfdno=n the file descriptor for reading with trans=fd
+
+ wfdno=n the file descriptor for writing with trans=fd
+
+ msize=n the number of bytes to use for 9p packet payload
+
+ port=n port to connect to on the remote server
+
+ noextend force legacy mode (no 9p2000.u or 9p2000.L semantics)
+
+ version=name Select 9P protocol version. Valid options are:
+ 9p2000 - Legacy mode (same as noextend)
+ 9p2000.u - Use 9P2000.u protocol
+ 9p2000.L - Use 9P2000.L protocol
+
+ dfltuid attempt to mount as a particular uid
+
+ dfltgid attempt to mount with a particular gid
+
+ afid security channel - used by Plan 9 authentication protocols
+
+ nodevmap do not map special files - represent them as normal files.
+ This can be used to share devices/named pipes/sockets between
+ hosts. This functionality will be expanded in later versions.
+
+ access there are four access modes.
+ user = if a user tries to access a file on v9fs
+ filesystem for the first time, v9fs sends an
+ attach command (Tattach) for that user.
+ This is the default mode.
+ <uid> = allows only user with uid=<uid> to access
+ the files on the mounted filesystem
+ any = v9fs does single attach and performs all
+ operations as one user
+ client = ACL based access check on the 9p client
+ side for access validation
+
+ cachetag cache tag to use the specified persistent cache.
+ cache tags for existing cache sessions can be listed at
+ /sys/fs/9p/caches. (applies only to cache=fscache)
+
+RESOURCES
+=========
+
+Protocol specifications are maintained on github:
+http://ericvh.github.com/9p-rfc/
+
+9p client and server implementations are listed on
+http://9p.cat-v.org/implementations
+
+A 9p2000.L server is being developed by LLNL and can be found
+at http://code.google.com/p/diod/
+
+There are user and developer mailing lists available through the v9fs project
+on sourceforge (http://sourceforge.net/projects/v9fs).
+
+News and other information is maintained on a Wiki.
+(http://sf.net/apps/mediawiki/v9fs/index.php).
+
+Bug reports may be issued through the kernel.org bugzilla
+(http://bugzilla.kernel.org)
+
+For more information on the Plan 9 Operating System check out
+http://plan9.bell-labs.com/plan9
+
+For information on Plan 9 from User Space (Plan 9 applications and libraries
+ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9
+
+
+STATUS
+======
+
+The 2.6 kernel support is working on PPC and x86.
+
+PLEASE USE THE KERNEL BUGZILLA TO REPORT PROBLEMS. (http://bugzilla.kernel.org)
+
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
new file mode 100644
index 00000000000..4fca82e5276
--- /dev/null
+++ b/Documentation/filesystems/Locking
@@ -0,0 +1,533 @@
+ The text below describes the locking rules for VFS-related methods.
+It is (believed to be) up-to-date. *Please*, if you change anything in
+prototypes or locking protocols - update this file. And update the relevant
+instances in the tree, don't leave that to maintainers of filesystems/devices/
+etc. At the very least, put the list of dubious cases in the end of this file.
+Don't turn it into a log - maintainers of out-of-the-tree code are supposed to
+be able to use diff(1).
+ Things currently missing here: socket operations. Alexey?
+
+--------------------------- dentry_operations --------------------------
+prototypes:
+ int (*d_revalidate)(struct dentry *, struct nameidata *);
+ int (*d_hash)(const struct dentry *, const struct inode *,
+ struct qstr *);
+ int (*d_compare)(const struct dentry *, const struct inode *,
+ const struct dentry *, const struct inode *,
+ unsigned int, const char *, const struct qstr *);
+ int (*d_delete)(struct dentry *);
+ void (*d_release)(struct dentry *);
+ void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
+ struct vfsmount *(*d_automount)(struct path *path);
+ int (*d_manage)(struct dentry *, bool);
+
+locking rules:
+ rename_lock ->d_lock may block rcu-walk
+d_revalidate: no no yes (ref-walk) maybe
+d_hash: no no no maybe
+d_compare: yes no no maybe
+d_delete: no yes no no
+d_release: no no yes no
+d_prune: no yes no no
+d_iput: no no yes no
+d_dname: no no no no
+d_automount: no no yes no
+d_manage: no no yes (ref-walk) maybe
+
+--------------------------- inode_operations ---------------------------
+prototypes:
+ int (*create) (struct inode *,struct dentry *,umode_t, struct nameidata *);
+ struct dentry * (*lookup) (struct inode *,struct dentry *,
+ struct nameidata *);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
+ int (*unlink) (struct inode *,struct dentry *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,umode_t);
+ int (*rmdir) (struct inode *,struct dentry *);
+ int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
+ int (*readlink) (struct dentry *, char __user *,int);
+ void * (*follow_link) (struct dentry *, struct nameidata *);
+ void (*put_link) (struct dentry *, struct nameidata *, void *);
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int, unsigned int);
+ int (*get_acl)(struct inode *, int);
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
+ int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ int (*removexattr) (struct dentry *, const char *);
+ void (*truncate_range)(struct inode *, loff_t, loff_t);
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+
+locking rules:
+ all may block
+ i_mutex(inode)
+lookup: yes
+create: yes
+link: yes (both)
+mknod: yes
+symlink: yes
+mkdir: yes
+unlink: yes (both)
+rmdir: yes (both) (see below)
+rename: yes (all) (see below)
+readlink: no
+follow_link: no
+put_link: no
+truncate: yes (see below)
+setattr: yes
+permission: no (may not block if called in rcu-walk mode)
+get_acl: no
+getattr: no
+setxattr: yes
+getxattr: no
+listxattr: no
+removexattr: yes
+truncate_range: yes
+fiemap: no
+ Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
+victim.
+ cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
+ ->truncate() is never called directly - it's a callback, not a
+method. It's called by vmtruncate() - deprecated library function used by
+->setattr(). Locking information above applies to that call (i.e. is
+inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
+passed).
+
+See Documentation/filesystems/directory-locking for more detailed discussion
+of the locking scheme for directory operations.
+
+--------------------------- super_operations ---------------------------
+prototypes:
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *);
+ void (*dirty_inode) (struct inode *, int flags);
+ int (*write_inode) (struct inode *, struct writeback_control *wbc);
+ int (*drop_inode) (struct inode *);
+ void (*evict_inode) (struct inode *);
+ void (*put_super) (struct super_block *);
+ void (*write_super) (struct super_block *);
+ int (*sync_fs)(struct super_block *sb, int wait);
+ int (*freeze_fs) (struct super_block *);
+ int (*unfreeze_fs) (struct super_block *);
+ int (*statfs) (struct dentry *, struct kstatfs *);
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*umount_begin) (struct super_block *);
+ int (*show_options)(struct seq_file *, struct dentry *);
+ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+ int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+
+locking rules:
+ All may block [not true, see below]
+ s_umount
+alloc_inode:
+destroy_inode:
+dirty_inode:
+write_inode:
+drop_inode: !!!inode->i_lock!!!
+evict_inode:
+put_super: write
+write_super: read
+sync_fs: read
+freeze_fs: read
+unfreeze_fs: read
+statfs: maybe(read) (see below)
+remount_fs: write
+umount_begin: no
+show_options: no (namespace_sem)
+quota_read: no (see below)
+quota_write: no (see below)
+bdev_try_to_free_page: no (see below)
+
+->statfs() has s_umount (shared) when called by ustat(2) (native or
+compat), but that's an accident of bad API; s_umount is used to pin
+the superblock down when we only have dev_t given us by userland to
+identify the superblock. Everything else (statfs(), fstatfs(), etc.)
+doesn't hold it when calling ->statfs() - superblock is pinned down
+by resolving the pathname passed to syscall.
+->quota_read() and ->quota_write() functions are both guaranteed to
+be the only ones operating on the quota file by the quota code (via
+dqio_sem) (unless an admin really wants to screw up something and
+writes to quota files with quotas on). For other details about locking
+see also dquot_operations section.
+->bdev_try_to_free_page is called from the ->releasepage handler of
+the block device inode. See there for more details.
+
+--------------------------- file_system_type ---------------------------
+prototypes:
+ int (*get_sb) (struct file_system_type *, int,
+ const char *, void *, struct vfsmount *);
+ struct dentry *(*mount) (struct file_system_type *, int,
+ const char *, void *);
+ void (*kill_sb) (struct super_block *);
+locking rules:
+ may block
+mount yes
+kill_sb yes
+
+->mount() returns ERR_PTR or the root dentry; its superblock should be locked
+on return.
+->kill_sb() takes a write-locked superblock, does all shutdown work on it,
+unlocks and drops the reference.
+
+--------------------------- address_space_operations --------------------------
+prototypes:
+ int (*writepage)(struct page *page, struct writeback_control *wbc);
+ int (*readpage)(struct file *, struct page *);
+ int (*sync_page)(struct page *);
+ int (*writepages)(struct address_space *, struct writeback_control *);
+ int (*set_page_dirty)(struct page *page);
+ int (*readpages)(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+ sector_t (*bmap)(struct address_space *, sector_t);
+ int (*invalidatepage) (struct page *, unsigned long);
+ int (*releasepage) (struct page *, int);
+ void (*freepage)(struct page *);
+ int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs);
+ int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
+ unsigned long *);
+ int (*migratepage)(struct address_space *, struct page *, struct page *);
+ int (*launder_page)(struct page *);
+ int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
+ int (*error_remove_page)(struct address_space *, struct page *);
+
+locking rules:
+ All except set_page_dirty and freepage may block
+
+ PageLocked(page) i_mutex
+writepage: yes, unlocks (see below)
+readpage: yes, unlocks
+sync_page: maybe
+writepages:
+set_page_dirty: no
+readpages:
+write_begin: locks the page yes
+write_end: yes, unlocks yes
+bmap:
+invalidatepage: yes
+releasepage: yes
+freepage: yes
+direct_IO:
+get_xip_mem: maybe
+migratepage: yes (both)
+launder_page: yes
+is_partially_uptodate: yes
+error_remove_page: yes
+
+ ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
+may be called from the request handler (/dev/loop).
+
+ ->readpage() unlocks the page, either synchronously or via I/O
+completion.
+
+ ->readpages() populates the pagecache with the passed pages and starts
+I/O against them. They come unlocked upon I/O completion.
+
+ ->writepage() is used for two purposes: for "memory cleansing" and for
+"sync". These are quite different operations and the behaviour may differ
+depending upon the mode.
+
+If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then
+it *must* start I/O against the page, even if that would involve
+blocking on in-progress I/O.
+
+If writepage is called for memory cleansing (sync_mode ==
+WBC_SYNC_NONE) then its role is to get as much writeout underway as
+possible. So writepage should try to avoid blocking against
+currently-in-progress I/O.
+
+If the filesystem is not called for "sync" and it determines that it
+would need to block against in-progress I/O to be able to start new I/O
+against the page, the filesystem should redirty the page with
+redirty_page_for_writepage(), then unlock the page and return zero.
+This may also be done to avoid internal deadlocks, but rarely.
+
+If the filesystem is called for sync then it must wait on any
+in-progress I/O and then start new I/O.
+
+The filesystem should unlock the page synchronously, before returning to the
+caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE
+value. WRITEPAGE_ACTIVATE means that page cannot really be written out
+currently, and VM should stop calling ->writepage() on this page for some
+time. VM does this by moving page to the head of the active list, hence the
+name.
+
+Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
+and return zero, writepage *must* run set_page_writeback() against the page,
+followed by unlocking it. Once set_page_writeback() has been run against the
+page, write I/O can be submitted and the write I/O completion handler must run
+end_page_writeback() once the I/O is complete. If no I/O is submitted, the
+filesystem must run end_page_writeback() against the page before returning from
+writepage.
+
+That is: after 2.5.12, pages which are under writeout are *not* locked. Note,
+if the filesystem needs the page to be locked during writeout, that is ok, too,
+the page is allowed to be unlocked at any point in time between the calls to
+set_page_writeback() and end_page_writeback().
+
+Note, failure to run either redirty_page_for_writepage() or the combination of
+set_page_writeback()/end_page_writeback() on a page submitted to writepage
+will leave the page itself marked clean but it will be tagged as dirty in the
+radix tree. This incoherency can lead to all sorts of hard-to-debug problems
+in the filesystem like having dirty inodes at umount and losing written data.
+
+ ->sync_page() locking rules are not well-defined - usually it is called
+with lock on page, but that is not guaranteed. Considering the currently
+existing instances of this method ->sync_page() itself doesn't look
+well-defined...
+
+ ->writepages() is used for periodic writeback and for syscall-initiated
+sync operations. The address_space should start I/O against at least
+*nr_to_write pages. *nr_to_write must be decremented for each page which is
+written. The address_space implementation may write more (or fewer) pages
+than *nr_to_write asks for, but it should try to be reasonably close. If
+nr_to_write is NULL, all dirty pages must be written.
+
+writepages should _only_ write pages which are present on
+mapping->io_pages.
+
+ ->set_page_dirty() is called from various places in the kernel
+when the target page is marked as needing writeback. It may be called
+under spinlock (it cannot block) and is sometimes called with the page
+not locked.
+
+ ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
+filesystems and by the swapper. The latter will eventually go away. Please,
+keep it that way and don't breed new callers.
+
+ ->invalidatepage() is called when the filesystem must attempt to drop
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
+block_invalidatepage() instead.
+
+ ->releasepage() is called when the kernel is about to try to drop the
+buffers from the page in preparation for freeing it. It returns zero to
+indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
+the kernel assumes that the fs has no private interest in the buffers.
+
+ ->freepage() is called when the kernel is done dropping the page
+from the page cache.
+
+ ->launder_page() may be called prior to releasing a page if
+it is still found to be dirty. It returns zero if the page was successfully
+cleaned, or an error value if not. Note that in order to prevent the page
+getting mapped back in and redirtied, it needs to be kept locked
+across the entire operation.
+
+----------------------- file_lock_operations ------------------------------
+prototypes:
+ void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
+ void (*fl_release_private)(struct file_lock *);
+
+
+locking rules:
+ file_lock_lock may block
+fl_copy_lock: yes no
+fl_release_private: maybe no
+
+----------------------- lock_manager_operations ---------------------------
+prototypes:
+ int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+ void (*lm_notify)(struct file_lock *); /* unblock callback */
+ int (*lm_grant)(struct file_lock *, struct file_lock *, int);
+ void (*lm_release_private)(struct file_lock *);
+ void (*lm_break)(struct file_lock *); /* break_lease callback */
+ int (*lm_change)(struct file_lock **, int);
+
+locking rules:
+ file_lock_lock may block
+lm_compare_owner: yes no
+lm_notify: yes no
+lm_grant: no no
+lm_release_private: maybe no
+lm_break: yes no
+lm_change: yes no
+
+--------------------------- buffer_head -----------------------------------
+prototypes:
+ void (*b_end_io)(struct buffer_head *bh, int uptodate);
+
+locking rules:
+ called from interrupts. In other words, extreme care is needed here.
+bh is locked, but that is the only guarantee we have here. Currently only RAID1,
+highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices
+call this method upon the IO completion.
+
+--------------------------- block_device_operations -----------------------
+prototypes:
+ int (*open) (struct block_device *, fmode_t);
+ int (*release) (struct gendisk *, fmode_t);
+ int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+ int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+ int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
+ int (*media_changed) (struct gendisk *);
+ void (*unlock_native_capacity) (struct gendisk *);
+ int (*revalidate_disk) (struct gendisk *);
+ int (*getgeo)(struct block_device *, struct hd_geometry *);
+ void (*swap_slot_free_notify) (struct block_device *, unsigned long);
+
+locking rules:
+ bd_mutex
+open: yes
+release: yes
+ioctl: no
+compat_ioctl: no
+direct_access: no
+media_changed: no
+unlock_native_capacity: no
+revalidate_disk: no
+getgeo: no
+swap_slot_free_notify: no (see below)
+
+media_changed, unlock_native_capacity and revalidate_disk are called only from
+check_disk_change().
+
+swap_slot_free_notify is called with swap_lock and sometimes the page lock
+held.
+
+
+--------------------------- file_operations -------------------------------
+prototypes:
+ loff_t (*llseek) (struct file *, loff_t, int);
+ ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
+ ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
+ ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ int (*readdir) (struct file *, void *, filldir_t);
+ unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
+ long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap) (struct file *, struct vm_area_struct *);
+ int (*open) (struct inode *, struct file *);
+ int (*flush) (struct file *);
+ int (*release) (struct inode *, struct file *);
+ int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
+ int (*aio_fsync) (struct kiocb *, int datasync);
+ int (*fasync) (int, struct file *, int);
+ int (*lock) (struct file *, int, struct file_lock *);
+ ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
+ loff_t *);
+ ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
+ loff_t *);
+ ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
+ void __user *);
+ ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
+ loff_t *, int);
+ unsigned long (*get_unmapped_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+ int (*check_flags)(int);
+ int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
+ size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
+ size_t, unsigned int);
+ int (*setlease)(struct file *, long, struct file_lock **);
+ long (*fallocate)(struct file *, int, loff_t, loff_t);
+};
+
+locking rules:
+ All may block except for ->setlease.
+ No VFS locks held on entry except for ->setlease.
+
+->setlease has the file_list_lock held and must not sleep.
+
+->llseek() locking has moved from llseek to the individual llseek
+implementations. If your fs is not using generic_file_llseek, you
+need to acquire and release the appropriate locks in your ->llseek().
+For many filesystems, it is probably safe to acquire the inode
+mutex or just to use i_size_read() instead.
+Note: this does not protect the file->f_pos against concurrent modifications
+since this is something the userspace has to take care about.
+
+->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
+Most instances call fasync_helper(), which does that maintenance, so it's
+not normally something one needs to worry about. Return values > 0 will be
+mapped to zero in the VFS layer.
+
+->readdir() and ->ioctl() on directories must be changed. Ideally we would
+move ->readdir() to inode_operations and use a separate method for directory
+->ioctl() or kill the latter completely. One of the problems is that for
+anything that resembles union-mount we won't have a struct file for all
+components. And there are other reasons why the current interface is a mess...
+
+->read on directories probably must go away - we should just enforce -EISDIR
+in sys_read() and friends.
+
+--------------------------- dquot_operations -------------------------------
+prototypes:
+ int (*write_dquot) (struct dquot *);
+ int (*acquire_dquot) (struct dquot *);
+ int (*release_dquot) (struct dquot *);
+ int (*mark_dirty) (struct dquot *);
+ int (*write_info) (struct super_block *, int);
+
+These operations are intended to be more or less wrapping functions that ensure
+a proper locking wrt the filesystem and call the generic quota operations.
+
+What filesystem should expect from the generic quota functions:
+
+ FS recursion Held locks when called
+write_dquot: yes dqonoff_sem or dqptr_sem
+acquire_dquot: yes dqonoff_sem or dqptr_sem
+release_dquot: yes dqonoff_sem or dqptr_sem
+mark_dirty: no -
+write_info: yes dqonoff_sem
+
+FS recursion means calling ->quota_read() and ->quota_write() from superblock
+operations.
+
+More details about quota locking can be found in fs/quota/dquot.c.
+
+--------------------------- vm_operations_struct -----------------------------
+prototypes:
+ void (*open)(struct vm_area_struct*);
+ void (*close)(struct vm_area_struct*);
+ int (*fault)(struct vm_area_struct*, struct vm_fault *);
+ int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
+ int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
+
+locking rules:
+ mmap_sem PageLocked(page)
+open: yes
+close: yes
+fault: yes can return with page locked
+page_mkwrite: yes can return with page locked
+access: yes
+
+ ->fault() is called when a previously not present pte is about
+to be faulted in. The filesystem must find and return the page associated
+with the passed in "pgoff" in the vm_fault structure. If it is possible that
+the page may be truncated and/or invalidated, then the filesystem must lock
+the page, then ensure it is not already truncated (the page lock will block
+subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
+locked. The VM will unlock the page.
+
+ ->page_mkwrite() is called when a previously read-only pte is
+about to become writeable. The filesystem again must ensure that there are
+no truncate/invalidate races, and then return with the page locked. If
+the page has been truncated, the filesystem should not look up a new page
+like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
+will cause the VM to retry the fault.
+
+ ->access() is called when get_user_pages() fails in
+access_process_vm(), typically used to debug a process through
+/proc/pid/mem or ptrace. This function is needed only for
+VM_IO | VM_PFNMAP VMAs.
+
+================================================================================
+ Dubious stuff
+
+(if you break something or notice that it is broken and do not fix it yourself
+- at least put it here)
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile
new file mode 100644
index 00000000000..a5dd114da14
--- /dev/null
+++ b/Documentation/filesystems/Makefile
@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := dnotify_test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
diff --git a/Documentation/filesystems/adfs.txt b/Documentation/filesystems/adfs.txt
new file mode 100644
index 00000000000..5949766353f
--- /dev/null
+++ b/Documentation/filesystems/adfs.txt
@@ -0,0 +1,75 @@
+Mount options for ADFS
+----------------------
+
+ uid=nnn All files in the partition will be owned by
+ user id nnn. Default 0 (root).
+ gid=nnn All files in the partition will be in group
+ nnn. Default 0 (root).
+ ownmask=nnn The permission mask for ADFS 'owner' permissions
+ will be nnn. Default 0700.
+ othmask=nnn The permission mask for ADFS 'other' permissions
+ will be nnn. Default 0077.
+ ftsuffix=n When ftsuffix=0, no file type suffix will be applied.
+ When ftsuffix=1, a hexadecimal suffix corresponding to
+ the RISC OS file type will be added. Default 0.
+
+Mapping of ADFS permissions to Linux permissions
+------------------------------------------------
+
+ ADFS permissions consist of the following:
+
+ Owner read
+ Owner write
+ Other read
+ Other write
+
+ (In older versions, an 'execute' permission did exist, but this
+ does not hold the same meaning as the Linux 'execute' permission
+ and is now obsolete).
+
+ The mapping is performed as follows:
+
+ Owner read -> -r--r--r--
+ Owner write -> --w--w--w-
+ Owner read and filetype UnixExec -> ---x--x--x
+ These are then masked by ownmask, eg 700 -> -rwx------
+ Possible owner mode permissions -> -rwx------
+
+ Other read -> -r--r--r--
+ Other write -> --w--w--w-
+ Other read and filetype UnixExec -> ---x--x--x
+ These are then masked by othmask, eg 077 -> ----rwxrwx
+ Possible other mode permissions -> ----rwxrwx
+
+ Hence, with the default masks, if a file is owner read/write, and
+ not a UnixExec filetype, then the permissions will be:
+
+ -rw-------
+
+ However, if the masks were ownmask=0770,othmask=0007, then this would
+ be modified to:
+ -rw-rw----
+
+ There is no restriction on what you can do with these masks. You may
+ wish that either read bits give read access to the file for all, but
+ keep the default write protection (ownmask=0755,othmask=0577):
+
+ -rw-r--r--
+
+ You can therefore tailor the permission translation to whatever you
+ desire the permissions should be under Linux.
+
+RISC OS file type suffix
+------------------------
+
+ RISC OS file types are stored in bits 19..8 of the file load address.
+
+ To enable non-RISC OS systems to be used to store files without losing
+ file type information, a file naming convention was devised (initially
+ for use with NFS) such that a hexadecimal suffix of the form ,xyz
+ denoted the file type: e.g. BasicFile,ffb is a BASIC (0xffb) file. This
+ naming convention is now also used by RISC OS emulators such as RPCEmu.
+
+ Mounting an ADFS disc with option ftsuffix=1 will cause appropriate file
+ type suffixes to be appended to file names read from a directory. If the
+ ftsuffix option is zero or omitted, no file type suffixes will be added.
diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt
new file mode 100644
index 00000000000..81ac488e375
--- /dev/null
+++ b/Documentation/filesystems/affs.txt
@@ -0,0 +1,219 @@
+Overview of Amiga Filesystems
+=============================
+
+Not all varieties of the Amiga filesystems are supported for reading and
+writing. The Amiga currently knows six different filesystems:
+
+DOS\0 The old or original filesystem, not really suited for
+ hard disks and normally not used on them, either.
+ Supported read/write.
+
+DOS\1 The original Fast File System. Supported read/write.
+
+DOS\2 The old "international" filesystem. International means that
+ a bug has been fixed so that accented ("international") letters
+ in file names are case-insensitive, as they ought to be.
+ Supported read/write.
+
+DOS\3 The "international" Fast File System. Supported read/write.
+
+DOS\4 The original filesystem with directory cache. The directory
+ cache speeds up directory accesses on floppies considerably,
+ but slows down file creation/deletion. Doesn't make much
+ sense on hard disks. Supported read only.
+
+DOS\5 The Fast File System with directory cache. Supported read only.
+
+All of the above filesystems allow block sizes from 512 to 32K bytes.
+Supported block sizes are: 512, 1024, 2048 and 4096 bytes. Larger blocks
+speed up almost everything at the expense of wasted disk space. The speed
+gain above 4K seems not really worth the price, so you don't lose too
+much here, either.
+
+The muFS (multi user File System) equivalents of the above file systems
+are supported, too.
+
+Mount options for the AFFS
+==========================
+
+protect If this option is set, the protection bits cannot be altered.
+
+setuid[=uid] This sets the owner of all files and directories in the file
+ system to uid or the uid of the current user, respectively.
+
+setgid[=gid] Same as above, but for gid.
+
+mode=mode Sets the mode flags to the given (octal) value, regardless
+ of the original permissions. Directories will get an x
+ permission if the corresponding r bit is set.
+ This is useful since most of the plain AmigaOS files
+ will map to 600.
+
+reserved=num Sets the number of reserved blocks at the start of the
+ partition to num. You should never need this option.
+ Default is 2.
+
+root=block Sets the block number of the root block. This should never
+ be necessary.
+
+bs=blksize Sets the blocksize to blksize. Valid block sizes are 512,
+ 1024, 2048 and 4096. Like the root option, this should
+ never be necessary, as the affs can figure it out itself.
+
+quiet The file system will not return an error for disallowed
+ mode changes.
+
+verbose The volume name, file system type and block size will
+ be written to the syslog when the filesystem is mounted.
+
+mufs The filesystem is really a muFS, although it doesn't
+ identify itself as one. This option is necessary if
+ the filesystem wasn't formatted as muFS, but is used
+ as one.
+
+prefix=path Path will be prefixed to every absolute path name of
+ symbolic links on an AFFS partition. Default = "/".
+ (See below.)
+
+volume=name When symbolic links with an absolute path are created
+ on an AFFS partition, name will be prepended as the
+ volume name. Default = "" (empty string).
+ (See below.)
+
+Handling of the Users/Groups and protection flags
+=================================================
+
+Amiga -> Linux:
+
+The Amiga protection flags RWEDRWEDHSPARWED are handled as follows:
+
+ - R maps to r for user, group and others. On directories, R implies x.
+
+ - If both W and D are allowed, w will be set.
+
+ - E maps to x.
+
+ - H and P are always retained and ignored under Linux.
+
+ - A is always reset when a file is written to.
+
+User id and group id will be used unless set[gu]id are given as mount
+options. Since most of the Amiga file systems are single user systems
+they will be owned by root. The root directory (the mount point) of the
+Amiga filesystem will be owned by the user who actually mounts the
+filesystem (the root directory doesn't have uid/gid fields).
+
+Linux -> Amiga:
+
+The Linux rwxrwxrwx file mode is handled as follows:
+
+ - r permission will set R for user, group and others.
+
+ - w permission will set W and D for user, group and others.
+
+ - x permission of the user will set E for plain files.
+
+ - All other flags (suid, sgid, ...) are ignored and will
+ not be retained.
+
+Newly created files and directories will get the user and group ID
+of the current user and a mode according to the umask.
+
+Symbolic links
+==============
+
+Although the Amiga and Linux file systems resemble each other, there
+are some, not always subtle, differences. One of them becomes apparent
+with symbolic links. While Linux has a file system with exactly one
+root directory, the Amiga has a separate root directory for each
+file system (for example, partition, floppy disk, ...). With the Amiga,
+these entities are called "volumes". They have symbolic names which
+can be used to access them. Thus, symbolic links can point to a
+different volume. AFFS turns the volume name into a directory name
+and prepends the prefix path (see prefix option) to it.
+
+Example:
+You mount all your Amiga partitions under /amiga/<volume> (where
+<volume> is the name of the volume), and you give the option
+"prefix=/amiga/" when mounting all your AFFS partitions. (They
+might be "User", "WB" and "Graphics", the mount points /amiga/User,
+/amiga/WB and /amiga/Graphics). A symbolic link referring to
+"User:sc/include/dos/dos.h" will be followed to
+"/amiga/User/sc/include/dos/dos.h".
+
+Examples
+========
+
+Command line:
+ mount Archive/Amiga/Workbench3.1.adf /mnt -t affs -o loop,verbose
+ mount /dev/sda3 /Amiga -t affs
+
+/etc/fstab entry:
+ /dev/sdb5 /amiga/Workbench affs noauto,user,exec,verbose 0 0
+
+IMPORTANT NOTE
+==============
+
+If you boot Windows 95 (don't know about 3.x, 98 and NT) while you
+have an Amiga harddisk connected to your PC, it will overwrite
+the bytes 0x00dc..0x00df of block 0 with garbage, thus invalidating
+the Rigid Disk Block. Sheer luck has it that this is an unused
+area of the RDB, so only the checksum doesn't match anymore.
+Linux will ignore this garbage and recognize the RDB anyway, but
+before you connect that drive to your Amiga again, you must
+restore or repair your RDB. So please do make a backup copy of it
+before booting Windows!
+
+If the damage is already done, the following should fix the RDB
+(where <disk> is the device name).
+DO AT YOUR OWN RISK:
+
+ dd if=/dev/<disk> of=rdb.tmp count=1
+ cp rdb.tmp rdb.fixed
+ dd if=/dev/zero of=rdb.fixed bs=1 seek=220 count=4
+ dd if=rdb.fixed of=/dev/<disk>
+
+Bugs, Restrictions, Caveats
+===========================
+
+Quite a few things may not work as advertised. Not everything is
+tested, though several hundred MB have been read and written using
+this fs. For the most up-to-date list of bugs please consult
+fs/affs/Changes.
+
+Filenames are truncated to 30 characters without warning (this
+can be changed by setting the compile-time option AFFS_NO_TRUNCATE
+in include/linux/amigaffs.h).
+
+Case is ignored by the affs in filename matching, but Linux shells
+do care about the case. Example (with /wb being an affs mounted fs):
+ rm /wb/WRONGCASE
+will remove /wb/wrongcase, but
+ rm /wb/WR*
+will not since the names are matched by the shell.
+
+The block allocation is designed for hard disk partitions. If more
+than 1 process writes to a (small) diskette, the blocks are allocated
+in an ugly way (but the real AFFS doesn't do much better). This
+is also true when space gets tight.
+
+You cannot execute programs on an OFS (Old File System), since the
+program files cannot be memory mapped due to the 488 byte blocks.
+For the same reason you cannot mount an image on such a filesystem
+via the loopback device.
+
+The bitmap valid flag in the root block may not be accurate when the
+system crashes while an affs partition is mounted. There's currently
+no way to fix a garbled filesystem without an Amiga (disk validator)
+or manually (who would do this?). Maybe later.
+
+If you mount affs partitions on system startup, you may want to tell
+fsck that the fs should not be checked (place a '0' in the sixth field
+of /etc/fstab).
+
+It's not possible to read floppy disks with a normal PC or workstation
+due to an incompatibility with the Amiga floppy controller.
+
+If you are interested in an Amiga Emulator for Linux, look at
+
+http://web.archive.org/web/*/http://www.freiburg.linux.de/~uae/
diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt
new file mode 100644
index 00000000000..ffef91c4e0d
--- /dev/null
+++ b/Documentation/filesystems/afs.txt
@@ -0,0 +1,247 @@
+ ====================
+ kAFS: AFS FILESYSTEM
+ ====================
+
+Contents:
+
+ - Overview.
+ - Usage.
+ - Mountpoints.
+ - Proc filesystem.
+ - The cell database.
+ - Security.
+ - Examples.
+
+
+========
+OVERVIEW
+========
+
+This filesystem provides a fairly simple secure AFS filesystem driver. It is
+under development and does not yet provide the full feature set. The features
+it does support include:
+
+ (*) Security (currently only AFS kaserver and KerberosIV tickets).
+
+ (*) File reading and writing.
+
+ (*) Automounting.
+
+ (*) Local caching (via fscache).
+
+It does not yet support the following AFS features:
+
+ (*) pioctl() system call.
+
+
+===========
+COMPILATION
+===========
+
+The filesystem should be enabled by turning on the kernel configuration
+options:
+
+ CONFIG_AF_RXRPC - The RxRPC protocol transport
+ CONFIG_RXKAD - The RxRPC Kerberos security handler
+ CONFIG_AFS - The AFS filesystem
+
+Additionally, the following can be turned on to aid debugging:
+
+ CONFIG_AF_RXRPC_DEBUG - Permit AF_RXRPC debugging to be enabled
+ CONFIG_AFS_DEBUG - Permit AFS debugging to be enabled
+
+They permit the debugging messages to be turned on dynamically by manipulating
+the masks in the following files:
+
+ /sys/module/af_rxrpc/parameters/debug
+ /sys/module/kafs/parameters/debug
+
+
+=====
+USAGE
+=====
+
+When inserting the driver modules the root cell must be specified along with a
+list of volume location server IP addresses:
+
+ modprobe af_rxrpc
+ modprobe rxkad
+ modprobe kafs rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91
+
+The first module is the AF_RXRPC network protocol driver. This provides the
+RxRPC remote operation protocol and may also be accessed from userspace. See:
+
+ Documentation/networking/rxrpc.txt
+
+The second module is the kerberos RxRPC security driver, and the third module
+is the actual filesystem driver for the AFS filesystem.
+
+Once the module has been loaded, more cells can be added by the following
+procedure:
+
+ echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 >/proc/fs/afs/cells
+
+Where the parameters to the "add" command are the name of a cell and a list of
+volume location servers within that cell, with the latter separated by colons.
+
+Filesystems can be mounted anywhere by commands similar to the following:
+
+ mount -t afs "%cambridge.redhat.com:root.afs." /afs
+ mount -t afs "#cambridge.redhat.com:root.cell." /afs/cambridge
+ mount -t afs "#root.afs." /afs
+ mount -t afs "#root.cell." /afs/cambridge
+
+Where the initial character is either a hash or a percent symbol depending on
+whether you definitely want a R/W volume (percent) or whether you'd prefer a R/O
+volume, but are willing to use a R/W volume instead (hash).
+
+The name of the volume can be suffixed with ".backup" or ".readonly" to
+specify connection to only volumes of those types.
+
+The name of the cell is optional, and if not given during a mount, then the
+named volume will be looked up in the cell specified during modprobe.
+
+Additional cells can be added through /proc (see later section).
+
+
+===========
+MOUNTPOINTS
+===========
+
+AFS has a concept of mountpoints. In AFS terms, these are specially formatted
+symbolic links (of the same form as the "device name" passed to mount). kAFS
+presents these to the user as directories that have a follow-link capability
+(ie: symbolic link semantics). If anyone attempts to access them, they will
+automatically cause the target volume to be mounted (if possible) on that site.
+
+Automatically mounted filesystems will be automatically unmounted approximately
+twenty minutes after they were last used. Alternatively they can be unmounted
+directly with the umount() system call.
+
+Manually unmounting an AFS volume will cause any idle submounts upon it to be
+culled first. If all are culled, then the requested volume will also be
+unmounted, otherwise error EBUSY will be returned.
+
+This can be used by the administrator to attempt to unmount the whole AFS tree
+mounted on /afs in one go by doing:
+
+ umount /afs
+
+
+===============
+PROC FILESYSTEM
+===============
+
+The AFS module creates a "/proc/fs/afs/" directory and populates it:
+
+ (*) A "cells" file that lists cells currently known to the afs module and
+ their usage counts:
+
+ [root@andromeda ~]# cat /proc/fs/afs/cells
+ USE NAME
+ 3 cambridge.redhat.com
+
+ (*) A directory per cell that contains files that list volume location
+ servers, volumes, and active servers known within that cell.
+
+ [root@andromeda ~]# cat /proc/fs/afs/cambridge.redhat.com/servers
+ USE ADDR STATE
+ 4 172.16.18.91 0
+ [root@andromeda ~]# cat /proc/fs/afs/cambridge.redhat.com/vlservers
+ ADDRESS
+ 172.16.18.91
+ [root@andromeda ~]# cat /proc/fs/afs/cambridge.redhat.com/volumes
+ USE STT VLID[0] VLID[1] VLID[2] NAME
+ 1 Val 20000000 20000001 20000002 root.afs
+
+
+=================
+THE CELL DATABASE
+=================
+
+The filesystem maintains an internal database of all the cells it knows and the
+IP addresses of the volume location servers for those cells. The cell to which
+the system belongs is added to the database when modprobe is performed by the
+"rootcell=" argument or, if compiled in, using a "kafs.rootcell=" argument on
+the kernel command line.
+
+Further cells can be added by commands similar to the following:
+
+ echo add CELLNAME VLADDR[:VLADDR][:VLADDR]... >/proc/fs/afs/cells
+ echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 >/proc/fs/afs/cells
+
+No other cell database operations are available at this time.
+
+
+========
+SECURITY
+========
+
+Secure operations are initiated by acquiring a key using the klog program. A
+very primitive klog program is available at:
+
+ http://people.redhat.com/~dhowells/rxrpc/klog.c
+
+This should be compiled by:
+
+ make klog LDLIBS="-lcrypto -lcrypt -lkrb4 -lkeyutils"
+
+And then run as:
+
+ ./klog
+
+Assuming it's successful, this adds a key of type RxRPC, named for the service
+and cell, eg: "afs@<cellname>". This can be viewed with the keyctl program or
+by cat'ing /proc/keys:
+
+ [root@andromeda ~]# keyctl show
+ Session Keyring
+ -3 --alswrv 0 0 keyring: _ses.3268
+ 2 --alswrv 0 0 \_ keyring: _uid.0
+ 111416553 --als--v 0 0 \_ rxrpc: afs@CAMBRIDGE.REDHAT.COM
+
+Currently the username, realm, password and proposed ticket lifetime are
+compiled in to the program.
+
+It is not required to acquire a key before using AFS facilities, but if one is
+not acquired then all operations will be governed by the anonymous user parts
+of the ACLs.
+
+If a key is acquired, then all AFS operations, including mounts and automounts,
+made by a possessor of that key will be secured with that key.
+
+If a file is opened with a particular key and then the file descriptor is
+passed to a process that doesn't have that key (perhaps over an AF_UNIX
+socket), then the operations on the file will be made with the key that was used to
+open the file.
+
+
+========
+EXAMPLES
+========
+
+Here's what I use to test this. Some of the names and IP addresses are local
+to my internal DNS. My "root.afs" partition has a mount point within it for
+some public volumes.
+
+insmod /tmp/rxrpc.o
+insmod /tmp/rxkad.o
+insmod /tmp/kafs.o rootcell=cambridge.redhat.com:172.16.18.91
+
+mount -t afs \%root.afs. /afs
+mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/
+
+echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 > /proc/fs/afs/cells
+mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/
+mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive
+mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib
+mount -t afs "#grand.central.org:root.doc." /afs/grand.central.org/doc
+mount -t afs "#grand.central.org:root.project." /afs/grand.central.org/project
+mount -t afs "#grand.central.org:root.service." /afs/grand.central.org/service
+mount -t afs "#grand.central.org:root.software." /afs/grand.central.org/software
+mount -t afs "#grand.central.org:root.user." /afs/grand.central.org/user
+
+umount /afs
+rmmod kafs
+rmmod rxkad
+rmmod rxrpc
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
new file mode 100644
index 00000000000..4c95935cbcf
--- /dev/null
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -0,0 +1,393 @@
+
+Miscellaneous Device control operations for the autofs4 kernel module
+====================================================================
+
+The problem
+===========
+
+There is a problem with active restarts in autofs (that is to say
+restarting autofs when there are busy mounts).
+
+During normal operation autofs uses a file descriptor opened on the
+directory that is being managed in order to be able to issue control
+operations. Using a file descriptor gives ioctl operations access to
+autofs specific information stored in the super block. The operations
+are things such as setting an autofs mount catatonic, setting the
+expire timeout and requesting expire checks. As is explained below,
+certain types of autofs triggered mounts can end up covering an autofs
+mount itself which prevents us being able to use open(2) to obtain a
+file descriptor for these operations if we don't already have one open.
+
+Currently autofs uses "umount -l" (lazy umount) to clear active mounts
+at restart. While using lazy umount works for most cases, anything that
+needs to walk back up the mount tree to construct a path, such as
+getcwd(2) and the proc file system /proc/<pid>/cwd, no longer works
+because the point from which the path is constructed has been detached
+from the mount tree.
+
+The actual problem with autofs is that it can't reconnect to existing
+mounts. Immediately one thinks of just adding the ability to remount
+autofs file systems would solve it, but alas, that can't work. This is
+because autofs direct mounts and the implementation of "on demand mount
+and expire" of nested mount trees have the file system mounted directly
+on top of the mount trigger directory dentry.
+
+For example, there are two types of automount maps, direct (in the kernel
+module source you will see a third type called an offset, which is just
+a direct mount in disguise) and indirect.
+
+Here is a master map with direct and indirect map entries:
+
+/- /etc/auto.direct
+/test /etc/auto.indirect
+
+and the corresponding map files:
+
+/etc/auto.direct:
+
+/automount/dparse/g6 budgie:/autofs/export1
+/automount/dparse/g1 shark:/autofs/export1
+and so on.
+
+/etc/auto.indirect:
+
+g1 shark:/autofs/export1
+g6 budgie:/autofs/export1
+and so on.
+
+For the above indirect map an autofs file system is mounted on /test and
+mounts are triggered for each sub-directory key by the inode lookup
+operation. So we see a mount of shark:/autofs/export1 on /test/g1, for
+example.
+
+The way that direct mounts are handled is by making an autofs mount on
+each full path, such as /automount/dparse/g1, and using it as a mount
+trigger. So when we walk on the path we mount shark:/autofs/export1 "on
+top of this mount point". Since these are always directories we can
+use the follow_link inode operation to trigger the mount.
+
+But, each entry in direct and indirect maps can have offsets (making
+them multi-mount map entries).
+
+For example, an indirect mount map entry could also be:
+
+g1 \
+ / shark:/autofs/export5/testing/test \
+ /s1 shark:/autofs/export/testing/test/s1 \
+ /s2 shark:/autofs/export5/testing/test/s2 \
+ /s1/ss1 shark:/autofs/export1 \
+ /s2/ss2 shark:/autofs/export2
+
+and similarly a direct mount map entry could also be:
+
+/automount/dparse/g1 \
+ / shark:/autofs/export5/testing/test \
+ /s1 shark:/autofs/export/testing/test/s1 \
+ /s2 shark:/autofs/export5/testing/test/s2 \
+ /s1/ss1 shark:/autofs/export2 \
+ /s2/ss2 shark:/autofs/export2
+
+One of the issues with version 4 of autofs was that, when mounting an
+entry with a large number of offsets, possibly with nesting, we needed
+to mount and umount all of the offsets as a single unit. Not really a
+problem, except for people with a large number of offsets in map entries.
+This mechanism is used for the well known "hosts" map and we have seen
+cases (in 2.4) where the available number of mounts are exhausted or
+where the number of privileged ports available is exhausted.
+
+In version 5 we mount only as we go down the tree of offsets and
+similarly for expiring them which resolves the above problem. There is
+somewhat more detail to the implementation but it isn't needed for the
+sake of the problem explanation. The one important detail is that these
+offsets are implemented using the same mechanism as the direct mounts
+above and so the mount points can be covered by a mount.
+
+The current autofs implementation uses an ioctl file descriptor opened
+on the mount point for control operations. The references held by the
+descriptor are accounted for in checks made to determine if a mount is
+in use and is also used to access autofs file system information held
+in the mount super block. So the use of a file handle needs to be
+retained.
+
+
+The Solution
+============
+
+To be able to restart autofs leaving existing direct, indirect and
+offset mounts in place we need to be able to obtain a file handle
+for these potentially covered autofs mount points. Rather than just
+implement an isolated operation it was decided to re-implement the
+existing ioctl interface and add new operations to provide this
+functionality.
+
+In addition, to be able to reconstruct a mount tree that has busy mounts,
+the uid and gid of the last user that triggered the mount needs to be
+available because these can be used as macro substitution variables in
+autofs maps. They are recorded at mount request time and an operation
+has been added to retrieve them.
+
+Since we're re-implementing the control interface, a couple of other
+problems with the existing interface have been addressed. First, when
+a mount or expire operation completes a status is returned to the
+kernel by either a "send ready" or a "send fail" operation. The
+"send fail" operation of the ioctl interface could only ever send
+ENOENT so the re-implementation allows user space to send an actual
+status. Another expensive operation in user space, for those using
+very large maps, is discovering if a mount is present. Usually this
+involves scanning /proc/mounts and since it needs to be done quite
+often it can introduce significant overhead when there are many entries
+in the mount table. An operation to lookup the mount status of a mount
+point dentry (covered or not) has also been added.
+
+Current kernel development policy recommends avoiding the use of the
+ioctl mechanism in favor of systems such as Netlink. An implementation
+using this system was attempted to evaluate its suitability and it was
+found to be inadequate, in this case. The Generic Netlink system was
+used for this as raw Netlink would lead to a significant increase in
+complexity. There's no question that the Generic Netlink system is an
+elegant solution for common case ioctl functions but it's not a complete
+replacement probably because its primary purpose in life is to be a
+message bus implementation rather than specifically an ioctl replacement.
+While it would be possible to work around this there is one concern
+that led to the decision to not use it. This is that the autofs
+expire in the daemon has become far too complex because umount
+candidates are enumerated, almost for no other reason than to "count"
+the number of times to call the expire ioctl. This involves scanning
+the mount table which has proved to be a big overhead for users with
+large maps. The best way to improve this is to try and get back to the
+way the expire was done long ago. That is, when an expire request is
+issued for a mount (file handle) we should continually call back to
+the daemon until we can't umount any more mounts, then return the
+appropriate status to the daemon. At the moment we just expire one
+mount at a time. A Generic Netlink implementation would exclude this
+possibility for future development due to the requirements of the
+message bus architecture.
+
+
+autofs4 Miscellaneous Device mount control interface
+====================================================
+
+The control interface is accessed by opening a device node, typically /dev/autofs.
+
+All the ioctls use a common structure to pass the needed parameter
+information and return operation results:
+
+struct autofs_dev_ioctl {
+ __u32 ver_major;
+ __u32 ver_minor;
+ __u32 size; /* total size of data passed in
+ * including this struct */
+ __s32 ioctlfd; /* automount command fd */
+
+ __u32 arg1; /* Command parameters */
+ __u32 arg2;
+
+ char path[0];
+};
+
+The ioctlfd field is a mount point file descriptor of an autofs mount
+point. It is returned by the open call and is used by all calls except
+the check for whether a given path is a mount point, where it may
+optionally be used to check a specific mount corresponding to a given
+mount point file descriptor, and when requesting the uid and gid of the
+last successful mount on a directory within the autofs file system.
+
+The fields arg1 and arg2 are used to communicate parameters and results of
+calls made as described below.
+
+The path field is used to pass a path where it is needed and the size field
+is used to account for the increased structure length when translating the
+structure sent from user space.
+
+This structure can be initialized before setting specific fields by using
+the void function call init_autofs_dev_ioctl(struct autofs_dev_ioctl *).
+
+All of the ioctls perform a copy of this structure from user space to
+kernel space and return -EINVAL if the size parameter is smaller than
+the structure size itself, -ENOMEM if the kernel memory allocation fails
+or -EFAULT if the copy itself fails. Other checks include a version check
+of the compiled in user space version against the module version and a
+mismatch results in a -EINVAL return. If the size field is greater than
+the structure size then a path is assumed to be present and is checked to
+ensure it begins with a "/" and is NULL terminated, otherwise -EINVAL is
+returned. Following these checks, for all ioctl commands except
+AUTOFS_DEV_IOCTL_VERSION_CMD, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD and
+AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD the ioctlfd is validated and if it is
+not a valid descriptor or doesn't correspond to an autofs mount point
+an error of -EBADF, -ENOTTY or -EINVAL (not an autofs descriptor) is
+returned.
+
+
+The ioctls
+==========
+
+An example of an implementation which uses this interface can be seen
+in autofs version 5.0.4 and later in file lib/dev-ioctl-lib.c of the
+distribution tar available for download from kernel.org in directory
+/pub/linux/daemons/autofs/v5.
+
+The device node ioctl operations implemented by this interface are:
+
+
+AUTOFS_DEV_IOCTL_VERSION
+------------------------
+
+Get the major and minor version of the autofs4 device ioctl kernel module
+implementation. It requires an initialized struct autofs_dev_ioctl as an
+input parameter and sets the version information in the passed in structure.
+It returns 0 on success or the error -EINVAL if a version mismatch is
+detected.
+
+
+AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD
+------------------------------------------------------------------
+
+Get the major and minor version of the autofs4 protocol version understood
+by the loaded module. This call requires an initialized struct autofs_dev_ioctl
+with the ioctlfd field set to a valid autofs mount point descriptor
+and sets the requested version number in structure field arg1. These
+commands return 0 on success or one of the negative error codes if
+validation fails.
+
+
+AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
+----------------------------------------------------------
+
+Obtain and release a file descriptor for an autofs managed mount point
+path. The open call requires an initialized struct autofs_dev_ioctl with
+the path field set and the size field adjusted appropriately as well
+as the arg1 field set to the device number of the autofs mount. The
+device number can be obtained from the mount options shown in
+/proc/mounts. The close call requires an initialized struct
+autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained
+from the open call. The release of the file descriptor can also be done
+with close(2) so any open descriptors will also be closed at process exit.
+The close call is included in the implemented operations largely for
+completeness and to provide for a consistent user space implementation.
+
+
+AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD
+--------------------------------------------------------
+
+Return mount and expire result status from user space to the kernel.
+Both of these calls require an initialized struct autofs_dev_ioctl
+with the ioctlfd field set to the descriptor obtained from the open
+call and the arg1 field set to the wait queue token number, received
+by user space in the foregoing mount or expire request. The arg2 field
+is set to the status to be returned. For the ready call this is always
+0 and for the fail call it is set to the errno of the operation.
+
+
+AUTOFS_DEV_IOCTL_SETPIPEFD_CMD
+------------------------------
+
+Set the pipe file descriptor used for kernel communication to the daemon.
+Normally this is set at mount time using an option but when reconnecting
+to an existing mount we need to use this to tell the autofs mount about
+the new kernel pipe descriptor. In order to protect mounts against
+incorrectly setting the pipe descriptor we also require that the autofs
+mount be catatonic (see next call).
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call and
+the arg1 field set to descriptor of the pipe. On success the call
+also sets the process group id used to identify the controlling process
+(eg. the owning automount(8) daemon) to the process group of the caller.
+
+
+AUTOFS_DEV_IOCTL_CATATONIC_CMD
+------------------------------
+
+Make the autofs mount point catatonic. The autofs mount will no longer
+issue mount requests, the kernel communication pipe descriptor is released
+and any remaining waits in the queue are released.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call.
+
+
+AUTOFS_DEV_IOCTL_TIMEOUT_CMD
+----------------------------
+
+Set the expire timeout for mounts within an autofs mount point.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call.
+
+
+AUTOFS_DEV_IOCTL_REQUESTER_CMD
+------------------------------
+
+Return the uid and gid of the last process to successfully trigger a
+mount on the given path dentry.
+
+The call requires an initialized struct autofs_dev_ioctl with the path
+field set to the mount point in question and the size field adjusted
+appropriately as well as the arg1 field set to the device number of the
+containing autofs mount. Upon return the struct field arg1 contains the
+uid and arg2 the gid.
+
+When reconstructing an autofs mount tree with active mounts we need to
+re-connect to mounts that may have used the original process uid and
+gid (or string variations of them) for mount lookups within the map entry.
+This call provides the ability to obtain this uid and gid so they may be
+used by user space for the mount map lookups.
+
+
+AUTOFS_DEV_IOCTL_EXPIRE_CMD
+---------------------------
+
+Issue an expire request to the kernel for an autofs mount. Typically
+this ioctl is called until no further expire candidates are found.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call. In
+addition an immediate expire, independent of the mount timeout, can be
+requested by setting the arg1 field to 1. If no expire candidates can
+be found the ioctl returns -1 with errno set to EAGAIN.
+
+This call causes the kernel module to check the mount corresponding
+to the given ioctlfd for mounts that can be expired, issues an expire
+request back to the daemon and waits for completion.
+
+AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD
+------------------------------
+
+Checks if an autofs mount point is in use.
+
+The call requires an initialized struct autofs_dev_ioctl with the
+ioctlfd field set to the descriptor obtained from the open call and
+it returns the result in the arg1 field, 1 for busy and 0 otherwise.
+
+
+AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
+---------------------------------
+
+Check if the given path is a mountpoint.
+
+The call requires an initialized struct autofs_dev_ioctl. There are two
+possible variations. Both use the path field set to the path of the mount
+point to check and the size field adjusted appropriately. One uses the
+ioctlfd field to identify a specific mount point to check while the other
+variation uses the path and optionally arg1 set to an autofs mount type.
+The call returns 1 if this is a mount point and sets arg1 to the device
+number of the mount and field arg2 to the relevant super block magic
+number (described below) or 0 if it isn't a mountpoint. In both cases
+the device number (as returned by new_encode_dev()) is returned
+in field arg1.
+
+If supplied with a file descriptor we're looking for a specific mount,
+not necessarily at the top of the mounted stack. In this case the path
+the descriptor corresponds to is considered a mountpoint if it is itself
+a mountpoint or contains a mount, such as a multi-mount without a root
+mount. In this case we return 1 if the descriptor corresponds to a mount
+point and also return the super magic of the covering mount if there
+is one or 0 if it isn't a mountpoint.
+
+If a path is supplied (and the ioctlfd field is set to -1) then the path
+is looked up and is checked to see if it is the root of a mount. If a
+type is also given we are looking for a particular autofs mount and if
+a match isn't found a fail is returned. If the located path is the
+root of a mount 1 is returned along with the super magic of the mount
+or 0 otherwise.
+
diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt
new file mode 100644
index 00000000000..7cac200e2a8
--- /dev/null
+++ b/Documentation/filesystems/automount-support.txt
@@ -0,0 +1,118 @@
+Support is available for filesystems that wish to do automounting support (such
+as kAFS which can be found in fs/afs/). This facility includes allowing
+in-kernel mounts to be performed and mountpoint degradation to be
+requested. The latter can also be requested by userspace.
+
+
+======================
+IN-KERNEL AUTOMOUNTING
+======================
+
+A filesystem can now mount another filesystem on one of its directories by the
+following procedure:
+
+ (1) Give the directory a follow_link() operation.
+
+ When the directory is accessed, the follow_link op will be called, and
+ it will be provided with the location of the mountpoint in the nameidata
+ structure (vfsmount and dentry).
+
+ (2) Have the follow_link() op do the following steps:
+
+ (a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
+ superblock and gain a vfsmount structure representing it.
+
+ (b) Copy the nameidata provided as an argument and substitute the dentry
+ argument into the copy.
+
+ (c) Call do_add_mount() to install the new vfsmount into the namespace's
+ mountpoint tree, thus making it accessible to userspace. Use the
+ nameidata set up in (b) as the destination.
+
+ If the mountpoint will be automatically expired, then do_add_mount()
+ should also be given the location of an expiration list (see further
+ down).
+
+ (d) Release the path in the nameidata argument and substitute in the new
+ vfsmount and its root dentry. The ref counts on these will need
+ incrementing.
+
+Then from userspace, you can just do something like:
+
+ [root@andromeda root]# mount -t afs \#root.afs. /afs
+ [root@andromeda root]# ls /afs
+ asd cambridge cambridge.redhat.com grand.central.org
+ [root@andromeda root]# ls /afs/cambridge
+ afsdoc
+ [root@andromeda root]# ls /afs/cambridge/afsdoc/
+ ChangeLog html LICENSE pdf RELNOTES-1.2.2
+
+And then if you look in the mountpoint catalogue, you'll see something like:
+
+ [root@andromeda root]# cat /proc/mounts
+ ...
+ #root.afs. /afs afs rw 0 0
+ #root.cell. /afs/cambridge.redhat.com afs rw 0 0
+ #afsdoc. /afs/cambridge.redhat.com/afsdoc afs rw 0 0
+
+
+===========================
+AUTOMATIC MOUNTPOINT EXPIRY
+===========================
+
+Automatic expiration of mountpoints is easy, provided you've mounted the
+mountpoint to be expired in the automounting procedure outlined above.
+
+To do expiration, you need to follow these steps:
+
+ (3) Create at least one list off which the vfsmounts to be expired can be
+ hung. Access to this list will be governed by the vfsmount_lock.
+
+ (4) In step (2c) above, the call to do_add_mount() should be provided with a
+ pointer to this list. It will hang the vfsmount off of it if it succeeds.
+
+ (5) When you want mountpoints to be expired, call mark_mounts_for_expiry()
+ with a pointer to this list. This will process the list, marking every
+ vfsmount thereon for potential expiry on the next call.
+
+ If a vfsmount was already flagged for expiry, and if its usage count is 1
+ (it's only referenced by its parent vfsmount), then it will be deleted
+ from the namespace and thrown away (effectively unmounted).
+
+ It may prove simplest to simply call this at regular intervals, using
+ some sort of timed event to drive it.
+
+The expiration flag is cleared by calls to mntput. This means that expiration
+will only happen on the second expiration request after the last time the
+mountpoint was accessed.
+
+If a mountpoint is moved, it gets removed from the expiration list. If a bind
+mount is made on an expirable mount, the new vfsmount will not be on the
+expiration list and will not expire.
+
+If a namespace is copied, all mountpoints contained therein will be copied,
+and the copies of those that are on an expiration list will be added to the
+same expiration list.
+
+
+=======================
+USERSPACE DRIVEN EXPIRY
+=======================
+
+As an alternative, it is possible for userspace to request expiry of any
+mountpoint (though some will be rejected - the current process's idea of the
+rootfs for example). It does this by passing the MNT_EXPIRE flag to
+umount(). This flag is considered incompatible with MNT_FORCE and MNT_DETACH.
+
+If the mountpoint in question is referenced by something other than
+umount() or its parent mountpoint, an EBUSY error will be returned and the
+mountpoint will not be marked for expiration or unmounted.
+
+If the mountpoint was not already marked for expiry at that time, an EAGAIN
+error will be given and it won't be unmounted.
+
+Otherwise if it was already marked and it wasn't referenced, unmounting will
+take place as usual.
+
+Again, the expiration flag is cleared every time anything other than umount()
+looks at a mountpoint.
diff --git a/Documentation/filesystems/befs.txt b/Documentation/filesystems/befs.txt
new file mode 100644
index 00000000000..da45e6c842b
--- /dev/null
+++ b/Documentation/filesystems/befs.txt
@@ -0,0 +1,117 @@
+BeOS filesystem for Linux
+
+Document last updated: Dec 6, 2001
+
+WARNING
+=======
+Make sure you understand that this is alpha software. This means that the
+implementation is neither complete nor well-tested.
+
+I DISCLAIM ALL RESPONSIBILITY FOR ANY POSSIBLE BAD EFFECTS OF THIS CODE!
+
+LICENSE
+=====
+This software is covered by the GNU General Public License.
+See the file COPYING for the complete text of the license.
+Or the GNU website: <http://www.gnu.org/licenses/licenses.html>
+
+AUTHOR
+=====
+The largest part of the code was written by Will Dyson <will_dyson@pobox.com>
+He has been working on the code since Aug 13, 2001. See the changelog for
+details.
+
+Original Author: Makoto Kato <m_kato@ga2.so-net.ne.jp>
+His original code can still be found at:
+<http://hp.vector.co.jp/authors/VA008030/bfs/>
+Does anyone know of a more current email address for Makoto? He doesn't
+respond to the address given above...
+
+This filesystem doesn't have a maintainer.
+
+WHAT IS THIS DRIVER?
+==================
+This module implements the native filesystem of BeOS http://www.beincorporated.com/
+for the linux 2.4.1 and later kernels. Currently it is a read-only
+implementation.
+
+Which is it, BFS or BEFS?
+================
+Be, Inc said, "BeOS Filesystem is officially called BFS, not BeFS".
+But Unixware Boot Filesystem is called bfs, too. And they are already in
+the kernel. Because of this naming conflict, on Linux the BeOS
+filesystem is called befs.
+
+HOW TO INSTALL
+==============
+step 1. Install the BeFS patch into the source code tree of linux.
+
+Apply the patchfile to your kernel source tree.
+Assuming that your kernel source is in /foo/bar/linux and the patchfile
+is called patch-befs-xxx, you would do the following:
+
+ cd /foo/bar/linux
+ patch -p1 < /path/to/patch-befs-xxx
+
+if the patching step fails (i.e. there are rejected hunks), you can try to
+figure it out yourself (it shouldn't be hard), or mail the maintainer
+(Will Dyson <will_dyson@pobox.com>) for help.
+
+step 2. Configuration & make kernel
+
+The linux kernel has many compile-time options. Most of them are beyond the
+scope of this document. I suggest the Kernel-HOWTO document as a good general
+reference on this topic. http://www.linuxdocs.org/HOWTOs/Kernel-HOWTO-4.html
+
+However, to use the BeFS module, you must enable it at configure time.
+
+ cd /foo/bar/linux
+ make menuconfig (or xconfig)
+
+The BeFS module is not a standard part of the linux kernel, so you must first
+enable support for experimental code under the "Code maturity level" menu.
+
+Then, under the "Filesystems" menu will be an option called "BeFS
+filesystem (experimental)", or something like that. Enable that option
+(it is fine to make it a module).
+
+Save your kernel configuration and then build your kernel.
+
+step 3. Install
+
+See the kernel howto <http://www.linux.com/howto/Kernel-HOWTO.html> for
+instructions on this critical step.
+
+USING BFS
+=========
+To use the BeOS filesystem, use filesystem type 'befs'.
+
+ex)
+ mount -t befs /dev/fd0 /beos
+
+MOUNT OPTIONS
+=============
+uid=nnn All files in the partition will be owned by user id nnn.
+gid=nnn All files in the partition will be in group nnn.
+iocharset=xxx Use xxx as the name of the NLS translation table.
+debug The driver will output debugging information to the syslog.
+
+HOW TO GET LATEST VERSION
+=========================
+
+The latest version is currently available at:
+<http://befs-driver.sourceforge.net/>
+
+ANY KNOWN BUGS?
+===========
+As of Jan 20, 2002:
+
+ None
+
+SPECIAL THANKS
+==============
+Dominic Giampaolo ... Writing "Practical file system design with Be filesystem"
+Hiroyuki Yamada ... Testing LinuxPPC.
+
+
+
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt
new file mode 100644
index 00000000000..78043d5a8fc
--- /dev/null
+++ b/Documentation/filesystems/bfs.txt
@@ -0,0 +1,57 @@
+BFS FILESYSTEM FOR LINUX
+========================
+
+The BFS filesystem is used by SCO UnixWare OS for the /stand slice, which
+usually contains the kernel image and a few other files required for the
+boot process.
+
+In order to access /stand partition under Linux you obviously need to
+know the partition number and the kernel must support UnixWare disk slices
+(CONFIG_UNIXWARE_DISKLABEL config option). However BFS support does not
+depend on having UnixWare disklabel support because one can also mount
+BFS filesystem via loopback:
+
+# losetup /dev/loop0 stand.img
+# mount -t bfs /dev/loop0 /mnt/stand
+
+where stand.img is a file containing the image of BFS filesystem.
+When you have finished using it and umounted you need to also deallocate
+/dev/loop0 device by:
+
+# losetup -d /dev/loop0
+
+You can simplify mounting by just typing:
+
+# mount -t bfs -o loop stand.img /mnt/stand
+
+this will allocate the first available loopback device (and load loop.o
+kernel module if necessary) automatically. If the loopback driver is not
+loaded automatically, make sure that you have compiled the module and
+that modprobe is functioning. Beware that umount will not deallocate
+/dev/loopN device if /etc/mtab file on your system is a symbolic link to
+/proc/mounts. You will need to do it manually using "-d" switch of
+losetup(8). Read losetup(8) manpage for more info.
+
+To create the BFS image under UnixWare you need to find out first which
+slice contains it. The command prtvtoc(1M) is your friend:
+
+# prtvtoc /dev/rdsk/c0b0t0d0s0
+
+(assuming your root disk is on target=0, lun=0, bus=0, controller=0). Then you
+look for the slice with tag "STAND", which is usually slice 10. With this
+information you can use dd(1) to create the BFS image:
+
+# umount /stand
+# dd if=/dev/rdsk/c0b0t0d0sa of=stand.img bs=512
+
+Just in case, you can verify that you have done the right thing by checking
+the magic number:
+
+# od -Ad -tx4 stand.img | more
+
+The first 4 bytes should be 0x1badface.
+
+If you have any patches, questions or suggestions regarding this BFS
+implementation please contact the author:
+
+Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
new file mode 100644
index 00000000000..7671352216f
--- /dev/null
+++ b/Documentation/filesystems/btrfs.txt
@@ -0,0 +1,91 @@
+
+ BTRFS
+ =====
+
+Btrfs is a new copy on write filesystem for Linux aimed at
+implementing advanced features while focusing on fault tolerance,
+repair and easy administration. Initially developed by Oracle, Btrfs
+is licensed under the GPL and open for contribution from anyone.
+
+Linux has a wealth of filesystems to choose from, but we are facing a
+number of challenges with scaling to the large storage subsystems that
+are becoming common in today's data centers. Filesystems need to scale
+in their ability to address and manage large storage, and also in
+their ability to detect, repair and tolerate errors in the data stored
+on disk. Btrfs is under heavy development, and is not suitable for
+any uses other than benchmarking and review. The Btrfs disk format is
+not yet finalized.
+
+The main Btrfs features include:
+
+ * Extent based file storage (2^64 max file size)
+ * Space efficient packing of small files
+ * Space efficient indexed directories
+ * Dynamic inode allocation
+ * Writable snapshots
+ * Subvolumes (separate internal filesystem roots)
+ * Object level mirroring and striping
+ * Checksums on data and metadata (multiple algorithms available)
+ * Compression
+ * Integrated multiple device support, with several raid algorithms
+ * Online filesystem check (not yet implemented)
+ * Very fast offline filesystem check
+ * Efficient incremental backup and FS mirroring (not yet implemented)
+ * Online filesystem defragmentation
+
+
+
+ MAILING LIST
+ ============
+
+There is a Btrfs mailing list hosted on vger.kernel.org. You can
+find details on how to subscribe here:
+
+http://vger.kernel.org/vger-lists.html#linux-btrfs
+
+Mailing list archives are available from gmane:
+
+http://dir.gmane.org/gmane.comp.file-systems.btrfs
+
+
+
+ IRC
+ ===
+
+Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
+IRC network.
+
+
+
+ UTILITIES
+ =========
+
+Userspace tools for creating and manipulating Btrfs file systems are
+available from the git repository at the following location:
+
+ http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs.git
+ git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs.git
+
+These include the following tools:
+
+mkfs.btrfs: create a filesystem
+
+btrfsctl: control program to create snapshots and subvolumes:
+
+ mount /dev/sda2 /mnt
+ btrfsctl -s new_subvol_name /mnt
+ btrfsctl -s snapshot_of_default /mnt/default
+ btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
+ btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
+ ls /mnt
+ default snapshot_of_a_snapshot snapshot_of_new_subvol
+ new_subvol_name snapshot_of_default
+
+ Snapshots and subvolumes cannot be deleted right now, but you can
+ rm -rf all the files and directories inside them.
+
+btrfsck: do a limited check of the FS extent trees.
+
+btrfs-debug-tree: print all of the FS metadata in text form. Example:
+
+ btrfs-debug-tree /dev/sda2 >& big_output_file
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
new file mode 100644
index 00000000000..382d52cdaf2
--- /dev/null
+++ b/Documentation/filesystems/caching/backend-api.txt
@@ -0,0 +1,658 @@
+ ==========================
+ FS-CACHE CACHE BACKEND API
+ ==========================
+
+The FS-Cache system provides an API by which actual caches can be supplied to
+FS-Cache for it to then serve out to network filesystems and other interested
+parties.
+
+This API is declared in <linux/fscache-cache.h>.
+
+
+====================================
+INITIALISING AND REGISTERING A CACHE
+====================================
+
+To start off, a cache definition must be initialised and registered for each
+cache the backend wants to make available. For instance, CacheFS does this in
+the fill_super() operation on mounting.
+
+The cache definition (struct fscache_cache) should be initialised by calling:
+
+ void fscache_init_cache(struct fscache_cache *cache,
+ struct fscache_cache_ops *ops,
+ const char *idfmt,
+ ...);
+
+Where:
+
+ (*) "cache" is a pointer to the cache definition;
+
+ (*) "ops" is a pointer to the table of operations that the backend supports on
+ this cache; and
+
+ (*) "idfmt" is a format and printf-style arguments for constructing a label
+ for the cache.
+
+
+The cache should then be registered with FS-Cache by passing a pointer to the
+previously initialised cache definition to:
+
+ int fscache_add_cache(struct fscache_cache *cache,
+ struct fscache_object *fsdef,
+ const char *tagname);
+
+Two extra arguments should also be supplied:
+
+ (*) "fsdef" which should point to the object representation for the FS-Cache
+ master index in this cache. Netfs primary index entries will be created
+ here. FS-Cache keeps the caller's reference to the index object if
+ successful and will release it upon withdrawal of the cache.
+
+ (*) "tagname" which, if given, should be a text string naming this cache. If
+ this is NULL, the identifier will be used instead. For CacheFS, the
+ identifier is set to name the underlying block device and the tag can be
+ supplied by mount.
+
+This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag
+is already in use. 0 will be returned on success.
+
+
+=====================
+UNREGISTERING A CACHE
+=====================
+
+A cache can be withdrawn from the system by calling this function with a
+pointer to the cache definition:
+
+ void fscache_withdraw_cache(struct fscache_cache *cache);
+
+In CacheFS's case, this is called by put_super().
+
+
+========
+SECURITY
+========
+
+The cache methods are executed in one of two contexts:
+
+ (1) that of the userspace process that issued the netfs operation that caused
+ the cache method to be invoked, or
+
+ (2) that of one of the processes in the FS-Cache thread pool.
+
+In either case, this may not be an appropriate context in which to access the
+cache.
+
+The calling process's fsuid, fsgid and SELinux security identities may need to
+be masqueraded for the duration of the cache driver's access to the cache.
+This is left to the cache to handle; FS-Cache makes no effort in this regard.
+
+
+===================================
+CONTROL AND STATISTICS PRESENTATION
+===================================
+
+The cache may present data to the outside world through FS-Cache's interfaces
+in sysfs and procfs - the former for control and the latter for statistics.
+
+A sysfs directory called /sys/fs/fscache/<cachetag>/ is created if CONFIG_SYSFS
+is enabled. This is accessible through the kobject struct fscache_cache::kobj
+and is for use by the cache as it sees fit.
+
+
+========================
+RELEVANT DATA STRUCTURES
+========================
+
+ (*) Index/Data file FS-Cache representation cookie:
+
+ struct fscache_cookie {
+ struct fscache_object_def *def;
+ struct fscache_netfs *netfs;
+ void *netfs_data;
+ ...
+ };
+
+ The fields that might be of use to the backend describe the object
+ definition, the netfs definition and the netfs's data for this cookie.
+ The object definition contains functions supplied by the netfs for loading
+ and matching index entries; these are required to provide some of the
+ cache operations.
+
+
+ (*) In-cache object representation:
+
+ struct fscache_object {
+ int debug_id;
+ enum {
+ FSCACHE_OBJECT_RECYCLING,
+ ...
+ } state;
+ spinlock_t lock;
+ struct fscache_cache *cache;
+ struct fscache_cookie *cookie;
+ ...
+ };
+
+ Structures of this type should be allocated by the cache backend and
+ passed to FS-Cache when requested by the appropriate cache operation. In
+ the case of CacheFS, they're embedded in CacheFS's internal object
+ structures.
+
+ The debug_id is a simple integer that can be used in debugging messages
+ that refer to a particular object. In such a case it should be printed
+ using "OBJ%x" to be consistent with FS-Cache.
+
+ Each object contains a pointer to the cookie that represents the object it
+ is backing. An object should be retired when put_object() is called if it is
+ in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be
+ initialised by calling fscache_object_init(object).
+
+
+ (*) FS-Cache operation record:
+
+ struct fscache_operation {
+ atomic_t usage;
+ struct fscache_object *object;
+ unsigned long flags;
+ #define FSCACHE_OP_EXCLUSIVE
+ void (*processor)(struct fscache_operation *op);
+ void (*release)(struct fscache_operation *op);
+ ...
+ };
+
+ FS-Cache has a pool of threads that it uses to give CPU time to the
+ various asynchronous operations that need to be done as part of driving
+ the cache. These are represented by the above structure. The processor
+ method is called to give the op CPU time, and the release method to get
+ rid of it when its usage count reaches 0.
+
+ An operation can be made exclusive upon an object by setting the
+ appropriate flag before enqueuing it with fscache_enqueue_operation(). If
+ an operation needs more processing time, it should be enqueued again.
+
+
+ (*) FS-Cache retrieval operation record:
+
+ struct fscache_retrieval {
+ struct fscache_operation op;
+ struct address_space *mapping;
+ struct list_head *to_do;
+ ...
+ };
+
+ A structure of this type is allocated by FS-Cache to record retrieval and
+ allocation requests made by the netfs. This struct is then passed to the
+ backend to do the operation. The backend may get extra refs to it by
+ calling fscache_get_retrieval() and refs may be discarded by calling
+ fscache_put_retrieval().
+
+ A retrieval operation can be used by the backend to do retrieval work. To
+ do this, the retrieval->op.processor method pointer should be set
+ appropriately by the backend and fscache_enqueue_retrieval() called to
+ submit it to the thread pool. CacheFiles, for example, uses this to queue
+ page examination when it detects PG_lock being cleared.
+
+ The to_do field is an empty list available for the cache backend to use as
+ it sees fit.
+
+
+ (*) FS-Cache storage operation record:
+
+ struct fscache_storage {
+ struct fscache_operation op;
+ pgoff_t store_limit;
+ ...
+ };
+
+ A structure of this type is allocated by FS-Cache to record outstanding
+ writes to be made. FS-Cache itself enqueues this operation and invokes
+ the write_page() method on the object at appropriate times to effect
+ storage.
+
+
+================
+CACHE OPERATIONS
+================
+
+The cache backend provides FS-Cache with a table of operations that can be
+performed on the denizens of the cache. These are held in a structure of type:
+
+ struct fscache_cache_ops
+
+ (*) Name of cache provider [mandatory]:
+
+ const char *name
+
+ This isn't strictly an operation, but should be pointed at a string naming
+ the backend.
+
+
+ (*) Allocate a new object [mandatory]:
+
+ struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
+ struct fscache_cookie *cookie)
+
+ This method is used to allocate a cache object representation to back a
+ cookie in a particular cache. fscache_object_init() should be called on
+ the object to initialise it prior to returning.
+
+ This function may also be used to parse the index key to be used for
+ multiple lookup calls to turn it into a more convenient form. FS-Cache
+ will call the lookup_complete() method to allow the cache to release the
+ form once lookup is complete or aborted.
+
+
+ (*) Look up and create object [mandatory]:
+
+ void (*lookup_object)(struct fscache_object *object)
+
+ This method is used to look up an object, given that the object is already
+ allocated and attached to the cookie. This should instantiate that object
+ in the cache if it can.
+
+ The method should call fscache_object_lookup_negative() as soon as
+ possible if it determines the object doesn't exist in the cache. If the
+ object is found to exist and the netfs indicates that it is valid then
+ fscache_obtained_object() should be called once the object is in a
+ position to have data stored in it. Similarly, fscache_obtained_object()
+ should also be called once a non-present object has been created.
+
+ If a lookup error occurs, fscache_object_lookup_error() should be called
+ to abort the lookup of that object.
+
+
+ (*) Release lookup data [mandatory]:
+
+ void (*lookup_complete)(struct fscache_object *object)
+
+ This method is called to ask the cache to release any resources it was
+ using to perform a lookup.
+
+
+ (*) Increment object refcount [mandatory]:
+
+ struct fscache_object *(*grab_object)(struct fscache_object *object)
+
+ This method is called to increment the reference count on an object. It
+ may fail (for instance if the cache is being withdrawn) by returning NULL.
+ It should return the object pointer if successful.
+
+
+ (*) Lock/Unlock object [mandatory]:
+
+ void (*lock_object)(struct fscache_object *object)
+ void (*unlock_object)(struct fscache_object *object)
+
+ These methods are used to exclusively lock an object. It must be possible
+ to schedule with the lock held, so a spinlock isn't sufficient.
+
+
+ (*) Pin/Unpin object [optional]:
+
+ int (*pin_object)(struct fscache_object *object)
+ void (*unpin_object)(struct fscache_object *object)
+
+ These methods are used to pin an object into the cache. Once pinned an
+ object cannot be reclaimed to make space. Return -ENOSPC if there's not
+ enough space in the cache to permit this.
+
+
+ (*) Update object [mandatory]:
+
+ int (*update_object)(struct fscache_object *object)
+
+ This is called to update the index entry for the specified object. The
+ new information should be in object->cookie->netfs_data. This can be
+ obtained by calling object->cookie->def->get_aux()/get_attr().
+
+
+ (*) Discard object [mandatory]:
+
+ void (*drop_object)(struct fscache_object *object)
+
+ This method is called to indicate that an object has been unbound from its
+ cookie, and that the cache should release the object's resources and
+ retire it if it's in state FSCACHE_OBJECT_RECYCLING.
+
+ This method should not attempt to release any references held by the
+ caller. The caller will invoke the put_object() method as appropriate.
+
+
+ (*) Release object reference [mandatory]:
+
+ void (*put_object)(struct fscache_object *object)
+
+ This method is used to discard a reference to an object. The object may
+ be freed when all the references to it are released.
+
+
+ (*) Synchronise a cache [mandatory]:
+
+ void (*sync)(struct fscache_cache *cache)
+
+ This is called to ask the backend to synchronise a cache with its backing
+ device.
+
+
+ (*) Dissociate a cache [mandatory]:
+
+ void (*dissociate_pages)(struct fscache_cache *cache)
+
+ This is called to ask a cache to perform any page dissociations as part of
+ cache withdrawal.
+
+
+ (*) Notification that the attributes on a netfs file changed [mandatory]:
+
+ int (*attr_changed)(struct fscache_object *object);
+
+ This is called to indicate to the cache that certain attributes on a netfs
+ file have changed (for example the maximum size a file may reach). The
+ cache can read these from the netfs by calling the cookie's get_attr()
+ method.
+
+ The cache may use the file size information to reserve space on the cache.
+ It should also call fscache_set_store_limit() to indicate to FS-Cache the
+ highest byte it's willing to store for an object.
+
+ This method may return -ve if an error occurred or the cache object cannot
+ be expanded. In such a case, the object will be withdrawn from service.
+
+ This operation is run asynchronously from FS-Cache's thread pool, and
+ storage and retrieval operations from the netfs are excluded during the
+ execution of this operation.
+
+
+ (*) Reserve cache space for an object's data [optional]:
+
+ int (*reserve_space)(struct fscache_object *object, loff_t size);
+
+ This is called to request that cache space be reserved to hold the data
+ for an object and the metadata used to track it. Zero size should be
+ taken as a request to cancel a reservation.
+
+ This should return 0 if successful, -ENOSPC if there isn't enough space
+ available, or -ENOMEM or -EIO on other errors.
+
+ The reservation may exceed the current size of the object, thus permitting
+ future expansion. If the amount of space consumed by an object would
+ exceed the reservation, it's permitted to refuse requests to allocate
+ pages, but not required. An object may be pruned down to its reservation
+ size if larger than that already.
+
+
+ (*) Request page be read from cache [mandatory]:
+
+ int (*read_or_alloc_page)(struct fscache_retrieval *op,
+ struct page *page,
+ gfp_t gfp)
+
+ This is called to attempt to read a netfs page from the cache, or to
+ reserve a backing block if not. FS-Cache will have done as much checking
+ as it can before calling, but most of the work belongs to the backend.
+
+ If there's no page in the cache, then -ENODATA should be returned if the
+ backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it
+ didn't.
+
+ If there is suitable data in the cache, then a read operation should be
+ queued and 0 returned. When the read finishes, fscache_end_io() should be
+ called.
+
+ fscache_mark_pages_cached() should be called for the page if any cache
+ metadata is retained. This will indicate to the netfs that the page needs
+ explicit uncaching. This operation takes a pagevec, thus allowing several
+ pages to be marked at once.
+
+ The retrieval record pointed to by op should be retained for each page
+ queued and released when I/O on the page has been formally ended.
+ fscache_get/put_retrieval() are available for this purpose.
+
+ The retrieval record may be used to get CPU time via the FS-Cache thread
+ pool. If this is desired, the op->op.processor should be set to point to
+ the appropriate processing routine, and fscache_enqueue_retrieval() should
+ be called at an appropriate point to request CPU time. For instance, the
+ retrieval routine could be enqueued upon the completion of a disk read.
+ The to_do field in the retrieval record is provided to aid in this.
+
+ If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS
+ returned if possible or fscache_end_io() called with a suitable error
+ code.
+
+
+ (*) Request pages be read from cache [mandatory]:
+
+ int (*read_or_alloc_pages)(struct fscache_retrieval *op,
+ struct list_head *pages,
+ unsigned *nr_pages,
+ gfp_t gfp)
+
+ This is like the read_or_alloc_page() method, except it is handed a list
+ of pages instead of one page. Any pages on which a read operation is
+ started must be added to the page cache for the specified mapping and also
+ to the LRU. Such pages must also be removed from the pages list and
+ *nr_pages decremented per page.
+
+ If there was an error such as -ENOMEM, then that should be returned; else
+ if one or more pages couldn't be read or allocated, then -ENOBUFS should
+ be returned; else if one or more pages couldn't be read, then -ENODATA
+ should be returned. If all the pages are dispatched then 0 should be
+ returned.
+
+
+ (*) Request page be allocated in the cache [mandatory]:
+
+ int (*allocate_page)(struct fscache_retrieval *op,
+ struct page *page,
+ gfp_t gfp)
+
+ This is like the read_or_alloc_page() method, except that it shouldn't
+ read from the cache, even if there's data there that could be retrieved.
+ It should, however, set up any internal metadata required such that
+ the write_page() method can write to the cache.
+
+ If there's no backing block available, then -ENOBUFS should be returned
+ (or -ENOMEM if there were other problems). If a block is successfully
+ allocated, then the netfs page should be marked and 0 returned.
+
+
+ (*) Request pages be allocated in the cache [mandatory]:
+
+ int (*allocate_pages)(struct fscache_retrieval *op,
+ struct list_head *pages,
+ unsigned *nr_pages,
+ gfp_t gfp)
+
+ This is a multiple page version of the allocate_page() method. pages and
+ nr_pages should be treated as for the read_or_alloc_pages() method.
+
+
+ (*) Request page be written to cache [mandatory]:
+
+ int (*write_page)(struct fscache_storage *op,
+ struct page *page);
+
+ This is called to write from a page on which there was a previously
+ successful read_or_alloc_page() call or similar. FS-Cache filters out
+ pages that don't have mappings.
+
+ This method is called asynchronously from the FS-Cache thread pool. It is
+ not required to actually store anything, provided -ENODATA is then
+ returned to the next read of this page.
+
+ If an error occurred, then a negative error code should be returned,
+ otherwise zero should be returned. FS-Cache will take appropriate action
+ in response to an error, such as withdrawing this object.
+
+ If this method returns success then FS-Cache will inform the netfs
+ appropriately.
+
+
+ (*) Discard retained per-page metadata [mandatory]:
+
+ void (*uncache_page)(struct fscache_object *object, struct page *page)
+
+ This is called when a netfs page is being evicted from the pagecache. The
+ cache backend should tear down any internal representation or tracking it
+ maintains for this page.
+
+
+==================
+FS-CACHE UTILITIES
+==================
+
+FS-Cache provides some utilities that a cache backend may make use of:
+
+ (*) Note occurrence of an I/O error in a cache:
+
+ void fscache_io_error(struct fscache_cache *cache)
+
+ This tells FS-Cache that an I/O error occurred in the cache. After this
+ has been called, only resource dissociation operations (object and page
+ release) will be passed from the netfs to the cache backend for the
+ specified cache.
+
+ This does not actually withdraw the cache. That must be done separately.
+
+
+ (*) Invoke the retrieval I/O completion function:
+
+ void fscache_end_io(struct fscache_retrieval *op, struct page *page,
+ int error);
+
+ This is called to note the end of an attempt to retrieve a page. The
+ error value should be 0 if successful and an error otherwise.
+
+
+ (*) Set highest store limit:
+
+ void fscache_set_store_limit(struct fscache_object *object,
+ loff_t i_size);
+
+ This sets the limit FS-Cache imposes on the highest byte it's willing to
+ try and store for a netfs. Any page over this limit is automatically
+ rejected by fscache_read_or_alloc_page() and co with -ENOBUFS.
+
+
+ (*) Mark pages as being cached:
+
+ void fscache_mark_pages_cached(struct fscache_retrieval *op,
+ struct pagevec *pagevec);
+
+ This marks a set of pages as being cached. After this has been called,
+ the netfs must call fscache_uncache_page() to unmark the pages.
+
+
+ (*) Perform coherency check on an object:
+
+ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+ const void *data,
+ uint16_t datalen);
+
+ This asks the netfs to perform a coherency check on an object that has
+ just been looked up. The cookie attached to the object will determine the
+ netfs to use. data and datalen should specify where the auxiliary data
+ retrieved from the cache can be found.
+
+ One of three values will be returned:
+
+ (*) FSCACHE_CHECKAUX_OKAY
+
+ The coherency data indicates the object is valid as is.
+
+ (*) FSCACHE_CHECKAUX_NEEDS_UPDATE
+
+ The coherency data needs updating, but otherwise the object is
+ valid.
+
+ (*) FSCACHE_CHECKAUX_OBSOLETE
+
+ The coherency data indicates that the object is obsolete and should
+ be discarded.
+
+
+ (*) Initialise a freshly allocated object:
+
+ void fscache_object_init(struct fscache_object *object);
+
+ This initialises all the fields in an object representation.
+
+
+ (*) Indicate the destruction of an object:
+
+ void fscache_object_destroyed(struct fscache_cache *cache);
+
+ This must be called to inform FS-Cache that an object that belonged to a
+ cache has been destroyed and deallocated. This will allow continuation
+ of the cache withdrawal process when it is stopped pending destruction of
+ all the objects.
+
+
+ (*) Indicate negative lookup on an object:
+
+ void fscache_object_lookup_negative(struct fscache_object *object);
+
+ This is called to indicate to FS-Cache that a lookup process for an object
+ found a negative result.
+
+ This changes the state of an object to permit reads pending on lookup
+ completion to go off and start fetching data from the netfs server as it's
+ known at this point that there can't be any data in the cache.
+
+ This may be called multiple times on an object. Only the first call is
+ significant - all subsequent calls are ignored.
+
+
+ (*) Indicate an object has been obtained:
+
+ void fscache_obtained_object(struct fscache_object *object);
+
+ This is called to indicate to FS-Cache that a lookup process for an object
+ produced a positive result, or that an object was created. This should
+ only be called once for any particular object.
+
+ This changes the state of an object to indicate:
+
+ (1) if no call to fscache_object_lookup_negative() has been made on
+ this object, that there may be data available, and that reads can
+ now go and look for it; and
+
+ (2) that writes may now proceed against this object.
+
+
+ (*) Indicate that object lookup failed:
+
+ void fscache_object_lookup_error(struct fscache_object *object);
+
+ This marks an object as having encountered a fatal error (usually EIO)
+ and causes it to move into a state whereby it will be withdrawn as soon
+ as possible.
+
+
+ (*) Get and release references on a retrieval record:
+
+ void fscache_get_retrieval(struct fscache_retrieval *op);
+ void fscache_put_retrieval(struct fscache_retrieval *op);
+
+ These two functions are used to retain a retrieval record whilst doing
+ asynchronous data retrieval and block allocation.
+
+
+ (*) Enqueue a retrieval record for processing:
+
+ void fscache_enqueue_retrieval(struct fscache_retrieval *op);
+
+ This enqueues a retrieval record for processing by the FS-Cache thread
+ pool. One of the threads in the pool will invoke the retrieval record's
+ op->op.processor callback function. This function may be called from
+ within the callback function.
+
+
+ (*) List of object state names:
+
+ const char *fscache_object_states[];
+
+ For debugging purposes, this may be used to turn the state that an object
+ is in into a text string for display purposes.
diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt
new file mode 100644
index 00000000000..748a1ae49e1
--- /dev/null
+++ b/Documentation/filesystems/caching/cachefiles.txt
@@ -0,0 +1,501 @@
+ ===============================================
+ CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM
+ ===============================================
+
+Contents:
+
+ (*) Overview.
+
+ (*) Requirements.
+
+ (*) Configuration.
+
+ (*) Starting the cache.
+
+ (*) Things to avoid.
+
+ (*) Cache culling.
+
+ (*) Cache structure.
+
+ (*) Security model and SELinux.
+
+ (*) A note on security.
+
+ (*) Statistical information.
+
+ (*) Debugging.
+
+
+========
+OVERVIEW
+========
+
+CacheFiles is a caching backend that's meant to use as a cache a directory on
+an already mounted filesystem of a local type (such as Ext3).
+
+CacheFiles uses a userspace daemon to do some of the cache management - such as
+reaping stale nodes and culling. This is called cachefilesd and lives in
+/sbin.
+
+The filesystem and data integrity of the cache are only as good as those of the
+filesystem providing the backing services. Note that CacheFiles does not
+attempt to journal anything since the journalling interfaces of the various
+filesystems are very specific in nature.
+
+CacheFiles creates a misc character device - "/dev/cachefiles" - that is used
+to communicate with the daemon. Only one thing may have this open at once,
+and whilst it is open, a cache is at least partially in existence. The daemon
+opens this and sends commands down it to control the cache.
+
+CacheFiles is currently limited to a single cache.
+
+CacheFiles attempts to maintain at least a certain percentage of free space on
+the filesystem, shrinking the cache by culling the objects it contains to make
+space if necessary - see the "Cache Culling" section. This means it can be
+placed on the same medium as a live set of data, and will expand to make use of
+spare space and automatically contract when the set of data requires more
+space.
+
+
+============
+REQUIREMENTS
+============
+
+The use of CacheFiles and its daemon requires the following features to be
+available in the system and in the cache filesystem:
+
+ - dnotify.
+
+ - extended attributes (xattrs).
+
+ - openat() and friends.
+
+ - bmap() support on files in the filesystem (FIBMAP ioctl).
+
+ - The use of bmap() to detect a partial page at the end of the file.
+
+It is strongly recommended that the "dir_index" option is enabled on Ext3
+filesystems being used as a cache.
+
+
+=============
+CONFIGURATION
+=============
+
+The cache is configured by a script in /etc/cachefilesd.conf. These commands
+set up the cache ready for use. The following script commands are available:
+
+ (*) brun <N>%
+ (*) bcull <N>%
+ (*) bstop <N>%
+ (*) frun <N>%
+ (*) fcull <N>%
+ (*) fstop <N>%
+
+ Configure the culling limits. Optional. See the section on culling.
+ The defaults are 7% (run), 5% (cull) and 1% (stop) respectively.
+
+ The commands beginning with a 'b' are file space (block) limits, those
+ beginning with an 'f' are file count limits.
+
+ (*) dir <path>
+
+ Specify the directory containing the root of the cache. Mandatory.
+
+ (*) tag <name>
+
+ Specify a tag to FS-Cache to use in distinguishing multiple caches.
+ Optional. The default is "CacheFiles".
+
+ (*) debug <mask>
+
+ Specify a numeric bitmask to control debugging in the kernel module.
+ Optional. The default is zero (all off). The following values can be
+ OR'd into the mask to collect various information:
+
+ 1 Turn on trace of function entry (_enter() macros)
+ 2 Turn on trace of function exit (_leave() macros)
+ 4 Turn on trace of internal debug points (_debug())
+
+ This mask can also be set through sysfs, eg:
+
+ echo 5 >/sys/module/cachefiles/parameters/debug
+
+
+==================
+STARTING THE CACHE
+==================
+
+The cache is started by running the daemon. The daemon opens the cache device,
+configures the cache and tells it to begin caching. At that point the cache
+binds to fscache and the cache becomes live.
+
+The daemon is run as follows:
+
+ /sbin/cachefilesd [-d]* [-s] [-n] [-f <configfile>]
+
+The flags are:
+
+ (*) -d
+
+ Increase the debugging level. This can be specified multiple times and
+ is cumulative with itself.
+
+ (*) -s
+
+ Send messages to stderr instead of syslog.
+
+ (*) -n
+
+ Don't daemonise and go into background.
+
+ (*) -f <configfile>
+
+ Use an alternative configuration file rather than the default one.
+
+
+===============
+THINGS TO AVOID
+===============
+
+Do not mount other things within the cache as this will cause problems. The
+kernel module contains its own very cut-down path walking facility that ignores
+mountpoints, but the daemon can't avoid them.
+
+Do not create, rename or unlink files and directories in the cache whilst the
+cache is active, as this may cause the state to become uncertain.
+
+Renaming files in the cache might make objects appear to be other objects (the
+filename is part of the lookup key).
+
+Do not change or remove the extended attributes attached to cache files by the
+cache as this will cause the cache state management to get confused.
+
+Do not create files or directories in the cache, lest the cache get confused or
+serve incorrect data.
+
+Do not chmod files in the cache. The module creates things with minimal
+permissions to prevent random users being able to access them directly.
+
+
+=============
+CACHE CULLING
+=============
+
+The cache may need culling occasionally to make space. This involves
+discarding objects from the cache that have been used less recently than
+anything else. Culling is based on the access time of data objects. Empty
+directories are culled if not in use.
+
+Cache culling is done on the basis of the percentage of blocks and the
+percentage of files available in the underlying filesystem. There are six
+"limits":
+
+ (*) brun
+ (*) frun
+
+ If the amount of free space and the number of available files in the cache
+ rise above both these limits, then culling is turned off.
+
+ (*) bcull
+ (*) fcull
+
+ If the amount of available space or the number of available files in the
+ cache falls below either of these limits, then culling is started.
+
+ (*) bstop
+ (*) fstop
+
+ If the amount of available space or the number of available files in the
+ cache falls below either of these limits, then no further allocation of
+ disk space or files is permitted until culling has raised things above
+ these limits again.
+
+These must be configured thusly:
+
+ 0 <= bstop < bcull < brun < 100
+ 0 <= fstop < fcull < frun < 100
+
+Note that these are percentages of available space and available files, and do
+_not_ appear as 100 minus the percentage displayed by the "df" program.
+
+The userspace daemon scans the cache to build up a table of cullable objects.
+These are then culled in least recently used order. A new scan of the cache is
+started as soon as space is made in the table. Objects will be skipped if
+their atimes have changed or if the kernel module says it is still using them.
+
+
+===============
+CACHE STRUCTURE
+===============
+
+The CacheFiles module will create two directories in the directory it was
+given:
+
+ (*) cache/
+
+ (*) graveyard/
+
+The active cache objects all reside in the first directory. The CacheFiles
+kernel module moves any retired or culled objects that it can't simply unlink
+to the graveyard from which the daemon will actually delete them.
+
+The daemon uses dnotify to monitor the graveyard directory, and will delete
+anything that appears therein.
+
+
+The module represents index objects as directories with the filename "I..." or
+"J...". Note that the "cache/" directory is itself a special index.
+
+Data objects are represented as files if they have no children, or directories
+if they do. Their filenames all begin "D..." or "E...". If represented as a
+directory, data objects will have a file in the directory called "data" that
+actually holds the data.
+
+Special objects are similar to data objects, except their filenames begin
+"S..." or "T...".
+
+
+If an object has children, then it will be represented as a directory.
+Immediately in the representative directory are a collection of directories
+named for hash values of the child object keys with an '@' prepended. Into
+this directory, if possible, will be placed the representations of the child
+objects:
+
+ INDEX INDEX INDEX DATA FILES
+ ========= ========== ================================= ================
+ cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400
+ cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry
+ cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry
+ cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry
+
+
+If the key is so long that it exceeds NAME_MAX with the decorations added on to
+it, then it will be cut into pieces, the first few of which will be used to
+make a nest of directories, and the last one of which will be the objects
+inside the last directory. The names of the intermediate directories will have
+'+' prepended:
+
+ J1223/@23/+xy...z/+kl...m/Epqr
+
+
+Note that keys are raw data, and not only may they exceed NAME_MAX in size,
+they may also contain things like '/' and NUL characters, and so they may not
+be suitable for turning directly into a filename.
+
+To handle this, CacheFiles will use a suitably printable filename directly and
+"base-64" encode ones that aren't directly suitable. The two versions of
+object filenames indicate the encoding:
+
+ OBJECT TYPE PRINTABLE ENCODED
+ =============== =============== ===============
+ Index "I..." "J..."
+ Data "D..." "E..."
+ Special "S..." "T..."
+
+Intermediate directories are always "@" or "+" as appropriate.
+
+
+Each object in the cache has an extended attribute label that holds the object
+type ID (required to distinguish special objects) and the auxiliary data from
+the netfs. The latter is used to detect stale objects in the cache and update
+or retire them.
+
+
+Note that CacheFiles will erase from the cache any file it doesn't recognise or
+any file of an incorrect type (such as a FIFO file or a device file).
+
+
+==========================
+SECURITY MODEL AND SELINUX
+==========================
+
+CacheFiles is implemented to deal properly with the LSM security features of
+the Linux kernel and the SELinux facility.
+
+One of the problems that CacheFiles faces is that it is generally acting on
+behalf of a process, and running in that process's context, and that includes a
+security context that is not appropriate for accessing the cache - either
+because the files in the cache are inaccessible to that process, or because if
+the process creates a file in the cache, that file may be inaccessible to other
+processes.
+
+The way CacheFiles works is to temporarily change the security context (fsuid,
+fsgid and actor security label) that the process acts as - without changing the
+security context of the process when it is the target of an operation performed by
+some other process (so signalling and suchlike still work correctly).
+
+
+When the CacheFiles module is asked to bind to its cache, it:
+
+ (1) Finds the security label attached to the root cache directory and uses
+ that as the security label with which it will create files. By default,
+ this is:
+
+ cachefiles_var_t
+
+ (2) Finds the security label of the process which issued the bind request
+ (presumed to be the cachefilesd daemon), which by default will be:
+
+ cachefilesd_t
+
+ and asks LSM to supply a security ID as which it should act given the
+ daemon's label. By default, this will be:
+
+ cachefiles_kernel_t
+
+ SELinux transitions the daemon's security ID to the module's security ID
+ based on a rule of this form in the policy.
+
+ type_transition <daemon's-ID> kernel_t : process <module's-ID>;
+
+ For instance:
+
+ type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t;
+
+
+The module's security ID gives it permission to create, move and remove files
+and directories in the cache, to find and access directories and files in the
+cache, to set and access extended attributes on cache objects, and to read and
+write files in the cache.
+
+The daemon's security ID gives it only a very restricted set of permissions: it
+may scan directories, stat files and erase files and directories. It may
+not read or write files in the cache, and so it is precluded from accessing the
+data cached therein; nor is it permitted to create new files in the cache.
+
+
+There are policy source files available in:
+
+ http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2
+
+and later versions. In that tarball, see the files:
+
+ cachefilesd.te
+ cachefilesd.fc
+ cachefilesd.if
+
+They are built and installed directly by the RPM.
+
+If a non-RPM based system is being used, then copy the above files to their own
+directory and run:
+
+ make -f /usr/share/selinux/devel/Makefile
+ semodule -i cachefilesd.pp
+
+You will need checkpolicy and selinux-policy-devel installed prior to the
+build.
+
+
+By default, the cache is located in /var/fscache, but if it is desirable that
+it should be elsewhere, then either the above policy files must be altered, or
+an auxiliary policy must be installed to label the alternate location of the
+cache.
+
+For instructions on how to add an auxiliary policy to enable the cache to be
+located elsewhere when SELinux is in enforcing mode, please see:
+
+ /usr/share/doc/cachefilesd-*/move-cache.txt
+
+when the cachefilesd rpm is installed; alternatively, the document can be found
+in the sources.
+
+
+==================
+A NOTE ON SECURITY
+==================
+
+CacheFiles makes use of the split security in the task_struct. It allocates
+its own task_security structure, and redirects current->cred to point to it
+when it acts on behalf of another process, in that process's context.
+
+The reason it does this is that it calls vfs_mkdir() and suchlike rather than
+bypassing security and calling inode ops directly. Therefore the VFS and LSM
+may deny the CacheFiles access to the cache data because under some
+circumstances the caching code is running in the security context of whatever
+process issued the original syscall on the netfs.
+
+Furthermore, should CacheFiles create a file or directory, the security
+parameters with which that object is created (UID, GID, security label) would be
+derived from that process that issued the system call, thus potentially
+preventing other processes from accessing the cache - including CacheFiles's
+cache management daemon (cachefilesd).
+
+What is required is to temporarily override the security of the process that
+issued the system call. We can't, however, just do an in-place change of the
+security data as that affects the process as an object, not just as a subject.
+This means it may lose signals or ptrace events for example, and affects what
+the process looks like in /proc.
+
+So CacheFiles makes use of a logical split in the security between the
+objective security (task->real_cred) and the subjective security (task->cred).
+The objective security holds the intrinsic security properties of a process and
+is never overridden. This is what appears in /proc, and is what is used when a
+process is the target of an operation by some other process (SIGKILL for
+example).
+
+The subjective security holds the active security properties of a process, and
+may be overridden. This is not seen externally, and is used when a process
+acts upon another object, for example SIGKILLing another process or opening a
+file.
+
+LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request
+for CacheFiles to run in a context of a specific security label, or to create
+files and directories with another security label.
+
+
+=======================
+STATISTICAL INFORMATION
+=======================
+
+If CacheFiles is compiled with the following option enabled:
+
+ CONFIG_CACHEFILES_HISTOGRAM=y
+
+then it will gather certain statistics and display them through a proc file.
+
+ (*) /proc/fs/cachefiles/histogram
+
+ cat /proc/fs/cachefiles/histogram
+ JIFS SECS LOOKUPS MKDIRS CREATES
+ ===== ===== ========= ========= =========
+
+ This shows the breakdown of the number of times each amount of time
+ between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The
+ columns are as follows:
+
+ COLUMN TIME MEASUREMENT
+ ======= =======================================================
+ LOOKUPS Length of time to perform a lookup on the backing fs
+ MKDIRS Length of time to perform a mkdir on the backing fs
+ CREATES Length of time to perform a create on the backing fs
+
+ Each row shows the number of events that took a particular range of times.
+ Each step is 1 jiffy in size. The JIFS column indicates the particular
+ jiffy range covered, and the SECS field the equivalent number of seconds.
+
+
+=========
+DEBUGGING
+=========
+
+If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime
+debugging enabled by adjusting the value in:
+
+ /sys/module/cachefiles/parameters/debug
+
+This is a bitmask of debugging streams to enable:
+
+ BIT VALUE STREAM POINT
+ ======= ======= =============================== =======================
+ 0 1 General Function entry trace
+ 1 2 Function exit trace
+ 2 4 General
+
+The appropriate set of values should be OR'd together and the result written to
+the control file. For example:
+
+ echo $((1|4)) >/sys/module/cachefiles/parameters/debug
+
+will turn on function entry tracing and general debugging.
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
new file mode 100644
index 00000000000..770267af5b3
--- /dev/null
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -0,0 +1,443 @@
+ ==========================
+ General Filesystem Caching
+ ==========================
+
+========
+OVERVIEW
+========
+
+This facility is a general purpose cache for network filesystems, though it
+could be used for caching other things such as ISO9660 filesystems too.
+
+FS-Cache mediates between cache backends (such as CacheFS) and network
+filesystems:
+
+ +---------+
+ | | +--------------+
+ | NFS |--+ | |
+ | | | +-->| CacheFS |
+ +---------+ | +----------+ | | /dev/hda5 |
+ | | | | +--------------+
+ +---------+ +-->| | |
+ | | | |--+
+ | AFS |----->| FS-Cache |
+ | | | |--+
+ +---------+ +-->| | |
+ | | | | +--------------+
+ +---------+ | +----------+ | | |
+ | | | +-->| CacheFiles |
+ | ISOFS |--+ | /var/cache |
+ | | +--------------+
+ +---------+
+
+Or to look at it another way, FS-Cache is a module that provides a caching
+facility to a network filesystem such that the cache is transparent to the
+user:
+
+ +---------+
+ | |
+ | Server |
+ | |
+ +---------+
+ | NETWORK
+ ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ |
+ | +----------+
+ V | |
+ +---------+ | |
+ | | | |
+ | NFS |----->| FS-Cache |
+ | | | |--+
+ +---------+ | | | +--------------+ +--------------+
+ | | | | | | | |
+ V +----------+ +-->| CacheFiles |-->| Ext3 |
+ +---------+ | /var/cache | | /dev/sda6 |
+ | | +--------------+ +--------------+
+ | VFS | ^ ^
+ | | | |
+ +---------+ +--------------+ |
+ | KERNEL SPACE | |
+ ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~
+ | USER SPACE | |
+ V | |
+ +---------+ +--------------+
+ | | | |
+ | Process | | cachefilesd |
+ | | | |
+ +---------+ +--------------+
+
+
+FS-Cache does not follow the idea of completely loading every netfs file
+opened in its entirety into a cache before permitting it to be accessed and
+then serving the pages out of that cache rather than the netfs inode because:
+
+ (1) It must be practical to operate without a cache.
+
+ (2) The size of any accessible file must not be limited to the size of the
+ cache.
+
+ (3) The combined size of all opened files (this includes mapped libraries)
+ must not be limited to the size of the cache.
+
+ (4) The user should not be forced to download an entire file just to do a
+ one-off access of a small portion of it (such as might be done with the
+ "file" program).
+
+It instead serves the cache out in PAGE_SIZE chunks as and when requested by
+the netfs('s) using it.
+
+
+FS-Cache provides the following facilities:
+
+ (1) More than one cache can be used at once. Caches can be selected
+ explicitly by use of tags.
+
+ (2) Caches can be added / removed at any time.
+
+ (3) The netfs is provided with an interface that allows either party to
+ withdraw caching facilities from a file (required for (2)).
+
+ (4) The interface to the netfs returns as few errors as possible, preferring
+ rather to let the netfs remain oblivious.
+
+ (5) Cookies are used to represent indices, files and other objects to the
+ netfs. The simplest cookie is just a NULL pointer - indicating nothing
+ cached there.
+
+ (6) The netfs is allowed to propose - dynamically - any index hierarchy it
+ desires, though it must be aware that the index search function is
+ recursive, stack space is limited, and indices can only be children of
+ indices.
+
+ (7) Data I/O is done direct to and from the netfs's pages. The netfs
+ indicates that page A is at index B of the data-file represented by cookie
+ C, and that it should be read or written. The cache backend may or may
+ not start I/O on that page, but if it does, a netfs callback will be
+ invoked to indicate completion. The I/O may be either synchronous or
+ asynchronous.
+
+ (8) Cookies can be "retired" upon release. At this point FS-Cache will mark
+ them as obsolete and the index hierarchy rooted at that point will get
+ recycled.
+
+ (9) The netfs provides a "match" function for index searches. In addition to
+ saying whether a match was made or not, this can also specify that an
+ entry should be updated or deleted.
+
+(10) As much as possible is done asynchronously.
+
+
+FS-Cache maintains a virtual indexing tree in which all indices, files, objects
+and pages are kept. Bits of this tree may actually reside in one or more
+caches.
+
+ FSDEF
+ |
+ +------------------------------------+
+ | |
+ NFS AFS
+ | |
+ +--------------------------+ +-----------+
+ | | | |
+ homedir mirror afs.org redhat.com
+ | | |
+ +------------+ +---------------+ +----------+
+ | | | | | |
+ 00001 00002 00007 00125 vol00001 vol00002
+ | | | | |
+ +---+---+ +-----+ +---+ +------+------+ +-----+----+
+ | | | | | | | | | | | | |
+PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak
+ | |
+ PG0 +-------+
+ | |
+ 00001 00003
+ |
+ +---+---+
+ | | |
+ PG0 PG1 PG2
+
+In the example above, you can see two netfs's being backed: NFS and AFS. These
+have different index hierarchies:
+
+ (*) The NFS primary index contains per-server indices. Each server index is
+ indexed by NFS file handles to get data file objects. Each data file
+ object can have an array of pages, but may also have further child
+ objects, such as extended attributes and directory entries. Extended
+ attribute objects themselves have page-array contents.
+
+ (*) The AFS primary index contains per-cell indices. Each cell index contains
+ per-logical-volume indices. Each volume index contains up to three
+ indices for the read-write, read-only and backup mirrors of those volumes.
+ Each of these contains vnode data file objects, each of which contains an
+ array of pages.
+
+The very top index is the FS-Cache master index in which individual netfs's
+have entries.
+
+Any index object may reside in more than one cache, provided it only has index
+children. Any index with non-index object children will be assumed to only
+reside in one cache.
+
+
+The netfs API to FS-Cache can be found in:
+
+ Documentation/filesystems/caching/netfs-api.txt
+
+The cache backend API to FS-Cache can be found in:
+
+ Documentation/filesystems/caching/backend-api.txt
+
+A description of the internal representations and object state machine can be
+found in:
+
+ Documentation/filesystems/caching/object.txt
+
+
+=======================
+STATISTICAL INFORMATION
+=======================
+
+If FS-Cache is compiled with the following options enabled:
+
+ CONFIG_FSCACHE_STATS=y
+ CONFIG_FSCACHE_HISTOGRAM=y
+
+then it will gather certain statistics and display them through a number of
+proc files.
+
+ (*) /proc/fs/fscache/stats
+
+ This shows counts of a number of events that can happen in FS-Cache:
+
+ CLASS EVENT MEANING
+ ======= ======= =======================================================
+ Cookies idx=N Number of index cookies allocated
+ dat=N Number of data storage cookies allocated
+ spc=N Number of special cookies allocated
+ Objects alc=N Number of objects allocated
+ nal=N Number of object allocation failures
+ avl=N Number of objects that reached the available state
+ ded=N Number of objects that reached the dead state
+ ChkAux non=N Number of objects that didn't have a coherency check
+ ok=N Number of objects that passed a coherency check
+ upd=N Number of objects that needed a coherency data update
+ obs=N Number of objects that were declared obsolete
+ Pages mrk=N Number of pages marked as being cached
+ unc=N Number of uncache page requests seen
+ Acquire n=N Number of acquire cookie requests seen
+ nul=N Number of acq reqs given a NULL parent
+ noc=N Number of acq reqs rejected due to no cache available
+ ok=N Number of acq reqs succeeded
+ nbf=N Number of acq reqs rejected due to error
+ oom=N Number of acq reqs failed on ENOMEM
+ Lookups n=N Number of lookup calls made on cache backends
+ neg=N Number of negative lookups made
+ pos=N Number of positive lookups made
+ crt=N Number of objects created by lookup
+ tmo=N Number of lookups timed out and requeued
+ Updates n=N Number of update cookie requests seen
+ nul=N Number of upd reqs given a NULL parent
+ run=N Number of upd reqs granted CPU time
+ Relinqs n=N Number of relinquish cookie requests seen
+ nul=N Number of rlq reqs given a NULL parent
+ wcr=N Number of rlq reqs waited on completion of creation
+ AttrChg n=N Number of attribute changed requests seen
+ ok=N Number of attr changed requests queued
+ nbf=N Number of attr changed rejected -ENOBUFS
+ oom=N Number of attr changed failed -ENOMEM
+ run=N Number of attr changed ops given CPU time
+ Allocs n=N Number of allocation requests seen
+ ok=N Number of successful alloc reqs
+ wt=N Number of alloc reqs that waited on lookup completion
+ nbf=N Number of alloc reqs rejected -ENOBUFS
+ int=N Number of alloc reqs aborted -ERESTARTSYS
+ ops=N Number of alloc reqs submitted
+ owt=N Number of alloc reqs waited for CPU time
+ abt=N Number of alloc reqs aborted due to object death
+ Retrvls n=N Number of retrieval (read) requests seen
+ ok=N Number of successful retr reqs
+ wt=N Number of retr reqs that waited on lookup completion
+ nod=N Number of retr reqs returned -ENODATA
+ nbf=N Number of retr reqs rejected -ENOBUFS
+ int=N Number of retr reqs aborted -ERESTARTSYS
+ oom=N Number of retr reqs failed -ENOMEM
+ ops=N Number of retr reqs submitted
+ owt=N Number of retr reqs waited for CPU time
+ abt=N Number of retr reqs aborted due to object death
+ Stores n=N Number of storage (write) requests seen
+ ok=N Number of successful store reqs
+ agn=N Number of store reqs on a page already pending storage
+ nbf=N Number of store reqs rejected -ENOBUFS
+ oom=N Number of store reqs failed -ENOMEM
+ ops=N Number of store reqs submitted
+ run=N Number of store reqs granted CPU time
+ pgs=N Number of pages given store req processing time
+ rxd=N Number of store reqs deleted from tracking tree
+ olm=N Number of store reqs over store limit
+ VmScan nos=N Number of release reqs against pages with no pending store
+ gon=N Number of release reqs against pages stored by time lock granted
+ bsy=N Number of release reqs ignored due to in-progress store
+ can=N Number of page stores cancelled due to release req
+ Ops pend=N Number of times async ops added to pending queues
+ run=N Number of times async ops given CPU time
+ enq=N Number of times async ops queued for processing
+ can=N Number of async ops cancelled
+ rej=N Number of async ops rejected due to object lookup/create failure
+ dfr=N Number of async ops queued for deferred release
+ rel=N Number of async ops released
+ gc=N Number of deferred-release async ops garbage collected
+ CacheOp alo=N Number of in-progress alloc_object() cache ops
+ luo=N Number of in-progress lookup_object() cache ops
+ luc=N Number of in-progress lookup_complete() cache ops
+ gro=N Number of in-progress grab_object() cache ops
+ upo=N Number of in-progress update_object() cache ops
+ dro=N Number of in-progress drop_object() cache ops
+ pto=N Number of in-progress put_object() cache ops
+ syn=N Number of in-progress sync_cache() cache ops
+ atc=N Number of in-progress attr_changed() cache ops
+ rap=N Number of in-progress read_or_alloc_page() cache ops
+ ras=N Number of in-progress read_or_alloc_pages() cache ops
+ alp=N Number of in-progress allocate_page() cache ops
+ als=N Number of in-progress allocate_pages() cache ops
+ wrp=N Number of in-progress write_page() cache ops
+ ucp=N Number of in-progress uncache_page() cache ops
+ dsp=N Number of in-progress dissociate_pages() cache ops
+
+
+ (*) /proc/fs/fscache/histogram
+
+ cat /proc/fs/fscache/histogram
+ JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS
+ ===== ===== ========= ========= ========= ========= =========
+
+ This shows the breakdown of the number of times each amount of time
+ between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The
+ columns are as follows:
+
+ COLUMN TIME MEASUREMENT
+ ======= =======================================================
+ OBJ INST Length of time to instantiate an object
+ OP RUNS Length of time a call to process an operation took
+ OBJ RUNS Length of time a call to process an object event took
+ RETRV DLY Time between requesting a read and lookup completing
+ RETRIEVLS Time between beginning and end of a retrieval
+
+ Each row shows the number of events that took a particular range of times.
+ Each step is 1 jiffy in size. The JIFS column indicates the particular
+ jiffy range covered, and the SECS field the equivalent number of seconds.
+
+
+===========
+OBJECT LIST
+===========
+
+If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
+list of all the objects currently allocated and allow them to be viewed
+through:
+
+ /proc/fs/fscache/objects
+
+This will look something like:
+
+ [root@andromeda ~]# head /proc/fs/fscache/objects
+ OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA
+ ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
+ 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+ 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+
+where the first set of columns before the '|' describe the object:
+
+ COLUMN DESCRIPTION
+ ======= ===============================================================
+ OBJECT Object debugging ID (appears as OBJ%x in some debug messages)
+ PARENT Debugging ID of parent object
+ STAT Object state
+ CHLDN Number of child objects of this object
+ OPS Number of outstanding operations on this object
+ OOP Number of outstanding child object management operations
+ IPR Number of operations in progress on this object
+ EX Number of outstanding exclusive operations
+ READS Number of outstanding read operations
+ EM Object's event mask
+ EV Events raised on this object
+ F Object flags
+ S Object work item busy state mask (1:pending 2:running)
+
+and the second set of columns describe the object's cookie, if present:
+
+ COLUMN DESCRIPTION
+ =============== =======================================================
+ NETFS_COOKIE_DEF Name of netfs cookie definition
+ TY Cookie type (IX - index, DT - data, hex - special)
+ FL Cookie flags
+ NETFS_DATA Netfs private data stored in the cookie
+ OBJECT_KEY Object key } 1 column, with separating comma
+ AUX_DATA Object aux data } presence may be configured
+
+The data shown may be filtered by attaching a key to an appropriate keyring
+before viewing the file. Something like:
+
+ keyctl add user fscache:objlist <restrictions> @s
+
+where <restrictions> are a selection of the following letters:
+
+ K Show hexdump of object key (don't show if not given)
+ A Show hexdump of object aux data (don't show if not given)
+
+and the following paired letters:
+
+ C Show objects that have a cookie
+ c Show objects that don't have a cookie
+ B Show objects that are busy
+ b Show objects that aren't busy
+ W Show objects that have pending writes
+ w Show objects that don't have pending writes
+ R Show objects that have outstanding reads
+ r Show objects that don't have outstanding reads
+ S Show objects that have work queued
+ s Show objects that don't have work queued
+
+If neither side of a letter pair is given, then both are implied. For example:
+
+ keyctl add user fscache:objlist KB @s
+
+shows objects that are busy, and lists their object keys, but does not dump
+their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is
+not implied.
+
+By default all objects and all fields will be shown.
+
+
+=========
+DEBUGGING
+=========
+
+If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime
+debugging enabled by adjusting the value in:
+
+ /sys/module/fscache/parameters/debug
+
+This is a bitmask of debugging streams to enable:
+
+ BIT VALUE STREAM POINT
+ ======= ======= =============================== =======================
+ 0 1 Cache management Function entry trace
+ 1 2 Function exit trace
+ 2 4 General
+ 3 8 Cookie management Function entry trace
+ 4 16 Function exit trace
+ 5 32 General
+ 6 64 Page handling Function entry trace
+ 7 128 Function exit trace
+ 8 256 General
+ 9 512 Operation management Function entry trace
+ 10 1024 Function exit trace
+ 11 2048 General
+
+The appropriate set of values should be OR'd together and the result written to
+the control file. For example:
+
+ echo $((1|8|64|512)) >/sys/module/fscache/parameters/debug
+
+will turn on all function entry debugging.
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
new file mode 100644
index 00000000000..7cc6bf2871e
--- /dev/null
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -0,0 +1,813 @@
+ ===============================
+ FS-CACHE NETWORK FILESYSTEM API
+ ===============================
+
+There's an API by which a network filesystem can make use of the FS-Cache
+facilities. This is based around a number of principles:
+
+ (1) Caches can store a number of different object types. There are two main
+ object types: indices and files. The first is a special type used by
+ FS-Cache to make finding objects faster and to make retiring of groups of
+ objects easier.
+
+ (2) Every index, file or other object is represented by a cookie. This cookie
+ may or may not have anything associated with it, but the netfs doesn't
+ need to care.
+
+ (3) Barring the top-level index (one entry per cached netfs), the index
+ hierarchy for each netfs is structured according to the whim of the netfs.
+
+This API is declared in <linux/fscache.h>.
+
+This document contains the following sections:
+
+ (1) Network filesystem definition
+ (2) Index definition
+ (3) Object definition
+ (4) Network filesystem (un)registration
+ (5) Cache tag lookup
+ (6) Index registration
+ (7) Data file registration
+ (8) Miscellaneous object registration
+ (9) Setting the data file size
+ (10) Page alloc/read/write
+ (11) Page uncaching
+ (12) Index and data file update
+ (13) Miscellaneous cookie operations
+ (14) Cookie unregistration
+ (15) Index and data file invalidation
+ (16) FS-Cache specific page flags.
+
+
+=============================
+NETWORK FILESYSTEM DEFINITION
+=============================
+
+FS-Cache needs a description of the network filesystem. This is specified
+using a record of the following structure:
+
+ struct fscache_netfs {
+ uint32_t version;
+ const char *name;
+ struct fscache_cookie *primary_index;
+ ...
+ };
+
+The first two fields should be filled in before registration, and the third
+will be filled in by the registration function; any other fields should just be
+ignored and are for internal use only.
+
+The fields are:
+
+ (1) The name of the netfs (used as the key in the toplevel index).
+
+ (2) The version of the netfs (if the name matches but the version doesn't, the
+ entire in-cache hierarchy for this netfs will be scrapped and begun
+ afresh).
+
+ (3) The cookie representing the primary index will be allocated according to
+ another parameter passed into the registration function.
+
+For example, kAFS (linux/fs/afs/) uses the following definitions to describe
+itself:
+
+ struct fscache_netfs afs_cache_netfs = {
+ .version = 0,
+ .name = "afs",
+ };
+
+
+================
+INDEX DEFINITION
+================
+
+Indices are used for two purposes:
+
+ (1) To aid the finding of a file based on a series of keys (such as AFS's
+ "cell", "volume ID", "vnode ID").
+
+ (2) To make it easier to discard a subset of all the files cached based around
+ a particular key - for instance to mirror the removal of an AFS volume.
+
+However, since it's unlikely that any two netfs's are going to want to define
+their index hierarchies in quite the same way, FS-Cache tries to impose as few
+restraints as possible on how an index is structured and where it is placed in
+the tree. The netfs can even mix indices and data files at the same level, but
+it's not recommended.
+
+Each index entry consists of a key of indeterminate length plus some auxiliary
+data, also of indeterminate length.
+
+There are some limits on indices:
+
+ (1) Any index containing non-index objects should be restricted to a single
+ cache. Any such objects created within an index will be created in the
+ first cache only. The cache in which an index is created can be
+ controlled by cache tags (see below).
+
+ (2) The entry data must be atomically journallable, so it is limited to about
+ 400 bytes at present. At least 400 bytes will be available.
+
+ (3) The depth of the index tree should be judged with care as the search
+ function is recursive. Too many layers will run the kernel out of stack.
+
+
+=================
+OBJECT DEFINITION
+=================
+
+To define an object, a structure of the following type should be filled out:
+
+ struct fscache_cookie_def
+ {
+ uint8_t name[16];
+ uint8_t type;
+
+ struct fscache_cache_tag *(*select_cache)(
+ const void *parent_netfs_data,
+ const void *cookie_netfs_data);
+
+ uint16_t (*get_key)(const void *cookie_netfs_data,
+ void *buffer,
+ uint16_t bufmax);
+
+ void (*get_attr)(const void *cookie_netfs_data,
+ uint64_t *size);
+
+ uint16_t (*get_aux)(const void *cookie_netfs_data,
+ void *buffer,
+ uint16_t bufmax);
+
+ enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
+ const void *data,
+ uint16_t datalen);
+
+ void (*get_context)(void *cookie_netfs_data, void *context);
+
+ void (*put_context)(void *cookie_netfs_data, void *context);
+
+ void (*mark_pages_cached)(void *cookie_netfs_data,
+ struct address_space *mapping,
+ struct pagevec *cached_pvec);
+
+ void (*now_uncached)(void *cookie_netfs_data);
+ };
+
+This has the following fields:
+
+ (1) The type of the object [mandatory].
+
+ This is one of the following values:
+
+ (*) FSCACHE_COOKIE_TYPE_INDEX
+
+ This defines an index, which is a special FS-Cache type.
+
+ (*) FSCACHE_COOKIE_TYPE_DATAFILE
+
+ This defines an ordinary data file.
+
+ (*) Any other value between 2 and 255
+
+ This defines an extraordinary object such as an XATTR.
+
+ (2) The name of the object type (NUL terminated unless all 16 chars are used)
+ [optional].
+
+ (3) A function to select the cache in which to store an index [optional].
+
+ This function is invoked when an index needs to be instantiated in a cache
+ during the instantiation of a non-index object. Only the immediate index
+ parent for the non-index object will be queried. Any indices above that
+ in the hierarchy may be stored in multiple caches. This function does not
+ need to be supplied for any non-index object or any index that will only
+ have index children.
+
+ If this function is not supplied or if it returns NULL then the first
+ cache in the parent's list will be chosen, or failing that, the first
+ cache in the master list.
+
+ (4) A function to retrieve an object's key from the netfs [mandatory].
+
+ This function will be called with the netfs data that was passed to the
+ cookie acquisition function and the maximum length of key data that it may
+ provide. It should write the required key data into the given buffer and
+ return the quantity it wrote.
+
+ (5) A function to retrieve attribute data from the netfs [optional].
+
+ This function will be called with the netfs data that was passed to the
+ cookie acquisition function. It should return the size of the file if
+ this is a data file. The size may be used to govern how much space must
+ be reserved for this file in the cache.
+
+ If the function is absent, a file size of 0 is assumed.
+
+ (6) A function to retrieve auxiliary data from the netfs [optional].
+
+ This function will be called with the netfs data that was passed to the
+ cookie acquisition function and the maximum length of auxiliary data that
+ it may provide. It should write the auxiliary data into the given buffer
+ and return the quantity it wrote.
+
+ If this function is absent, the auxiliary data length will be set to 0.
+
+ The length of the auxiliary data buffer may be dependent on the key
+ length. A netfs mustn't rely on being able to provide more than 400 bytes
+ for both.
+
+ (7) A function to check the auxiliary data [optional].
+
+ This function will be called to check that a match found in the cache for
+ this object is valid. For instance with AFS it could check the auxiliary
+ data against the data version number returned by the server to determine
+ whether the index entry in a cache is still valid.
+
+ If this function is absent, it will be assumed that matching objects in a
+ cache are always valid.
+
+ If present, the function should return one of the following values:
+
+ (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is
+ (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update
+ (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted
+
+ This function can also be used to extract data from the auxiliary data in
+ the cache and copy it into the netfs's structures.
+
+ (8) A pair of functions to manage contexts for the completion callback
+ [optional].
+
+ The cache read/write functions are passed a context which is then passed
+ to the I/O completion callback function. To ensure this context remains
+ valid until after the I/O completion is called, two functions may be
+ provided: one to get an extra reference on the context, and one to drop a
+ reference to it.
+
+ If the context is not used or is a type of object that won't go out of
+ scope, then these functions are not required. These functions are not
+ required for indices as indices may not contain data. These functions may
+ be called in interrupt context and so may not sleep.
+
+ (9) A function to mark a page as retaining cache metadata [optional].
+
+ This is called by the cache to indicate that it is retaining in-memory
+ information for this page and that the netfs should uncache the page when
+ it has finished. This does not indicate whether there's data on the disk
+ or not. Note that several pages at once may be presented for marking.
+
+ The PG_fscache bit is set on the pages before this function would be
+ called, so the function need not be provided if this is sufficient.
+
+ This function is not required for indices as they're not permitted data.
+
+(10) A function to unmark all the pages retaining cache metadata [mandatory].
+
+ This is called by FS-Cache to indicate that a backing store is being
+ unbound from a cookie and that all the marks on the pages should be
+ cleared to prevent confusion. Note that the cache will have torn down all
+ its tracking information so that the pages don't need to be explicitly
+ uncached.
+
+ This function is not required for indices as they're not permitted data.
+
+
+===================================
+NETWORK FILESYSTEM (UN)REGISTRATION
+===================================
+
+The first step is to declare the network filesystem to the cache. This also
+involves specifying the layout of the primary index (for AFS, this would be the
+"cell" level).
+
+The registration function is:
+
+ int fscache_register_netfs(struct fscache_netfs *netfs);
+
+It just takes a pointer to the netfs definition. It returns 0 or an error as
+appropriate.
+
+For kAFS, registration is done as follows:
+
+ ret = fscache_register_netfs(&afs_cache_netfs);
+
+The last step is, of course, unregistration:
+
+ void fscache_unregister_netfs(struct fscache_netfs *netfs);
+
+
+================
+CACHE TAG LOOKUP
+================
+
+FS-Cache permits the use of more than one cache. To permit particular index
+subtrees to be bound to particular caches, the second step is to look up cache
+representation tags. This step is optional; it can be left entirely up to
+FS-Cache as to which cache should be used. The problem with doing that is that
+FS-Cache will always pick the first cache that was registered.
+
+To get the representation for a named tag:
+
+ struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name);
+
+This takes a text string as the name and returns a representation of a tag. It
+will never return an error. It may return a dummy tag, however, if it runs out
+of memory; this will inhibit caching with this tag.
+
+Any representation so obtained must be released by passing it to this function:
+
+ void fscache_release_cache_tag(struct fscache_cache_tag *tag);
+
+The tag will be retrieved by FS-Cache when it calls the object definition
+operation select_cache().
+
+
+==================
+INDEX REGISTRATION
+==================
+
+The third step is to inform FS-Cache about part of an index hierarchy that can
+be used to locate files. This is done by requesting a cookie for each index in
+the path to the file:
+
+ struct fscache_cookie *
+ fscache_acquire_cookie(struct fscache_cookie *parent,
+ const struct fscache_object_def *def,
+ void *netfs_data);
+
+This function creates an index entry in the index represented by parent,
+filling in the index entry by calling the operations pointed to by def.
+
+Note that this function never returns an error - all errors are handled
+internally. It may, however, return NULL to indicate no cookie. It is quite
+acceptable to pass this token back to this function as the parent to another
+acquisition (or even to the relinquish cookie, read page and write page
+functions - see below).
+
+Note also that no indices are actually created in a cache until a non-index
+object needs to be created somewhere down the hierarchy. Furthermore, an index
+may be created in several different caches independently at different times.
+This is all handled transparently, and the netfs doesn't see any of it.
+
+For example, with AFS, a cell would be added to the primary index. This index
+entry would have a dependent inode containing a volume location index for the
+volume mappings within this cell:
+
+ cell->cache =
+ fscache_acquire_cookie(afs_cache_netfs.primary_index,
+ &afs_cell_cache_index_def,
+ cell);
+
+Then when a volume location was accessed, it would be entered into the cell's
+index and an inode would be allocated that acts as a volume type and hash chain
+combination:
+
+ vlocation->cache =
+ fscache_acquire_cookie(cell->cache,
+ &afs_vlocation_cache_index_def,
+ vlocation);
+
+And then a particular flavour of volume (R/O for example) could be added to
+that index, creating another index for vnodes (AFS inode equivalents):
+
+ volume->cache =
+ fscache_acquire_cookie(vlocation->cache,
+ &afs_volume_cache_index_def,
+ volume);
+
+
+======================
+DATA FILE REGISTRATION
+======================
+
+The fourth step is to request a data file be created in the cache. This is
+identical to index cookie acquisition. The only difference is that the type in
+the object definition should be something other than index type.
+
+ vnode->cache =
+ fscache_acquire_cookie(volume->cache,
+ &afs_vnode_cache_object_def,
+ vnode);
+
+
+=================================
+MISCELLANEOUS OBJECT REGISTRATION
+=================================
+
+An optional step is to request an object of miscellaneous type be created in
+the cache. This is almost identical to index cookie acquisition. The only
+difference is that the type in the object definition should be something other
+than index type. Whilst the parent object could be an index, it's more likely
+it would be some other type of object such as a data file.
+
+ xattr->cache =
+ fscache_acquire_cookie(vnode->cache,
+ &afs_xattr_cache_object_def,
+ xattr);
+
+Miscellaneous objects might be used to store extended attributes or directory
+entries for example.
+
+
+==========================
+SETTING THE DATA FILE SIZE
+==========================
+
+The fifth step is to set the physical attributes of the file, such as its size.
+This doesn't automatically reserve any space in the cache, but permits the
+cache to adjust its metadata for data tracking appropriately:
+
+ int fscache_attr_changed(struct fscache_cookie *cookie);
+
+The cache will return -ENOBUFS if there is no backing cache or if there is no
+space to allocate any extra metadata required in the cache. The attributes
+will be accessed with the get_attr() cookie definition operation.
+
+Note that attempts to read or write data pages in the cache over this size may
+be rebuffed with -ENOBUFS.
+
+This operation schedules an attribute adjustment to happen asynchronously at
+some point in the future, and as such, it may happen after the function returns
+to the caller. The attribute adjustment excludes read and write operations.
+
+
+=====================
+PAGE READ/ALLOC/WRITE
+=====================
+
+And the sixth step is to store and retrieve pages in the cache. There are
+three functions that are used to do this.
+
+Note:
+
+ (1) A page should not be re-read or re-allocated without uncaching it first.
+
+ (2) A read or allocated page must be uncached when the netfs page is released
+ from the pagecache.
+
+ (3) A page should only be written to the cache if previously read or allocated.
+
+This permits the cache to maintain its page tracking in proper order.
+
+
+PAGE READ
+---------
+
+Firstly, the netfs should ask FS-Cache to examine the caches and read the
+contents cached for a particular page of a particular file if present, or else
+allocate space to store the contents if not:
+
+ typedef
+ void (*fscache_rw_complete_t)(struct page *page,
+ void *context,
+ int error);
+
+ int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+ struct page *page,
+ fscache_rw_complete_t end_io_func,
+ void *context,
+ gfp_t gfp);
+
+The cookie argument must specify a cookie for an object that isn't an index,
+the page specified will have the data loaded into it (and is also used to
+specify the page number), and the gfp argument is used to control how any
+memory allocations made are satisfied.
+
+If the cookie indicates the inode is not cached:
+
+ (1) The function will return -ENOBUFS.
+
+Else if there's a copy of the page resident in the cache:
+
+ (1) The mark_pages_cached() cookie operation will be called on that page.
+
+ (2) The function will submit a request to read the data from the cache's
+ backing device directly into the page specified.
+
+ (3) The function will return 0.
+
+ (4) When the read is complete, end_io_func() will be invoked with:
+
+ (*) The netfs data supplied when the cookie was created.
+
+ (*) The page descriptor.
+
+ (*) The context argument passed to the above function. This will be
+ maintained with the get_context/put_context functions mentioned above.
+
+ (*) An argument that's 0 on success or negative for an error code.
+
+ If an error occurs, it should be assumed that the page contains no usable
+ data.
+
+ end_io_func() will be called in process context if the read results in
+ an error, but it might be called in interrupt context if the read is
+ successful.
+
+Otherwise, if there's not a copy available in cache, but the cache may be able
+to store the page:
+
+ (1) The mark_pages_cached() cookie operation will be called on that page.
+
+ (2) A block may be reserved in the cache and attached to the object at the
+ appropriate place.
+
+ (3) The function will return -ENODATA.
+
+This function may also return -ENOMEM or -EINTR, in which case it won't have
+read any data from the cache.
+
+
+PAGE ALLOCATE
+-------------
+
+Alternatively, if there's not expected to be any data in the cache for a page
+because the file has been extended, a block can simply be allocated instead:
+
+ int fscache_alloc_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp);
+
+This is similar to the fscache_read_or_alloc_page() function, except that it
+never reads from the cache. It will return 0 if a block has been allocated,
+rather than -ENODATA as the other would. One or the other must be performed
+before writing to the cache.
+
+The mark_pages_cached() cookie operation will be called on the page if
+successful.
+
+
+PAGE WRITE
+----------
+
+Secondly, if the netfs changes the contents of the page (either due to an
+initial download or if a user performs a write), then the page should be
+written back to the cache:
+
+ int fscache_write_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp);
+
+The cookie argument must specify a data file cookie, the page specified should
+contain the data to be written (and is also used to specify the page number),
+and the gfp argument is used to control how any memory allocations made are
+satisfied.
+
+The page must have first been read or allocated successfully and must not have
+been uncached before writing is performed.
+
+If the cookie indicates the inode is not cached then:
+
+ (1) The function will return -ENOBUFS.
+
+Else if space can be allocated in the cache to hold this page:
+
+ (1) PG_fscache_write will be set on the page.
+
+ (2) The function will submit a request to write the data to the cache's
+ backing device directly from the page specified.
+
+ (3) The function will return 0.
+
+ (4) When the write is complete PG_fscache_write is cleared on the page and
+ anyone waiting for that bit will be woken up.
+
+Else if there's no space available in the cache, -ENOBUFS will be returned. It
+is also possible for the PG_fscache_write bit to be cleared when no write took
+place if unforeseen circumstances arose (such as a disk error).
+
+Writing takes place asynchronously.
+
+
+MULTIPLE PAGE READ
+------------------
+
+A facility is provided to read several pages at once, as requested by the
+readpages() address space operation:
+
+ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+ struct address_space *mapping,
+ struct list_head *pages,
+ int *nr_pages,
+ fscache_rw_complete_t end_io_func,
+ void *context,
+ gfp_t gfp);
+
+This works in a similar way to fscache_read_or_alloc_page(), except:
+
+ (1) Any page it can retrieve data for is removed from pages and nr_pages and
+ dispatched for reading to the disk. Reads of adjacent pages on disk may
+ be merged for greater efficiency.
+
+ (2) The mark_pages_cached() cookie operation will be called on several pages
+ at once if they're being read or allocated.
+
+ (3) If there was a general error, then that error will be returned.
+
+ Else if some pages couldn't be allocated or read, then -ENOBUFS will be
+ returned.
+
+ Else if some pages couldn't be read but were allocated, then -ENODATA will
+ be returned.
+
+ Otherwise, if all pages had reads dispatched, then 0 will be returned, the
+ list will be empty and *nr_pages will be 0.
+
+ (4) end_io_func will be called once for each page being read as the reads
+ complete. It will be called in process context if error != 0, but it may
+ be called in interrupt context if there is no error.
+
+Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude
+some of the pages being read and some being allocated. Those pages will have
+been marked appropriately and will need uncaching.
+
+
+==============
+PAGE UNCACHING
+==============
+
+To uncache a page, this function should be called:
+
+ void fscache_uncache_page(struct fscache_cookie *cookie,
+ struct page *page);
+
+This function permits the cache to release any in-memory representation it
+might be holding for this netfs page. This function must be called once for
+each page on which the read or write page functions above have been called to
+make sure the cache's in-memory tracking information gets torn down.
+
+Note that pages can't be explicitly deleted from a data file. The whole
+data file must be retired (see the relinquish cookie function below).
+
+Furthermore, note that this does not cancel the asynchronous read or write
+operation started by the read/alloc and write functions, so the page
+invalidation functions must use:
+
+ bool fscache_check_page_write(struct fscache_cookie *cookie,
+ struct page *page);
+
+to see if a page is being written to the cache, and:
+
+ void fscache_wait_on_page_write(struct fscache_cookie *cookie,
+ struct page *page);
+
+to wait for it to finish if it is.
+
+
+When releasepage() is being implemented, a special FS-Cache function exists to
+manage the heuristics of coping with vmscan trying to eject pages, which may
+conflict with the cache trying to write pages to the cache (which may itself
+need to allocate memory):
+
+ bool fscache_maybe_release_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp);
+
+This takes the netfs cookie, and the page and gfp arguments as supplied to
+releasepage(). It will return false if the page cannot be released yet for
+some reason and if it returns true, the page has been uncached and can now be
+released.
+
+To make a page available for release, this function may wait for an outstanding
+storage request to complete, or it may attempt to cancel the storage request -
+in which case the page will not be stored in the cache this time.
+
+
+BULK INODE PAGE UNCACHE
+-----------------------
+
+A convenience routine is provided to perform an uncache on all the pages
+attached to an inode. This assumes that the pages on the inode correspond on a
+1:1 basis with the pages in the cache.
+
+ void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
+ struct inode *inode);
+
+This takes the netfs cookie that the pages were cached with and the inode that
+the pages are attached to. This function will wait for pages to finish being
+written to the cache and for the cache to finish with the page generally. No
+error is returned.
+
+
+==========================
+INDEX AND DATA FILE UPDATE
+==========================
+
+To request an update of the index data for an index or other object, the
+following function should be called:
+
+ void fscache_update_cookie(struct fscache_cookie *cookie);
+
+This function will refer back to the netfs_data pointer stored in the cookie by
+the acquisition function to obtain the data to write into each revised index
+entry. The update method in the parent index definition will be called to
+transfer the data.
+
+Note that partial updates may happen automatically at other times, such as when
+data blocks are added to a data file object.
+
+
+===============================
+MISCELLANEOUS COOKIE OPERATIONS
+===============================
+
+There are a number of operations that can be used to control cookies:
+
+ (*) Cookie pinning:
+
+ int fscache_pin_cookie(struct fscache_cookie *cookie);
+ void fscache_unpin_cookie(struct fscache_cookie *cookie);
+
+ These operations permit data cookies to be pinned into the cache and to
+ have the pinning removed. They are not permitted on index cookies.
+
+ The pinning function will return 0 if successful, -ENOBUFS if the cookie
+ isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning,
+ -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
+ -EIO if there's any other problem.
+
+ (*) Data space reservation:
+
+ int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size);
+
+ This permits a netfs to request cache space be reserved to store up to the
+ given amount of a file. It is permitted to ask for more than the current
+ size of the file to allow for future file expansion.
+
+ If size is given as zero then the reservation will be cancelled.
+
+ The function will return 0 if successful, -ENOBUFS if the cookie isn't
+ backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations,
+ -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
+ -EIO if there's any other problem.
+
+ Note that this doesn't pin an object in a cache; it can still be culled to
+ make space if it's not in use.
+
+
+=====================
+COOKIE UNREGISTRATION
+=====================
+
+To get rid of a cookie, this function should be called.
+
+ void fscache_relinquish_cookie(struct fscache_cookie *cookie,
+ int retire);
+
+If retire is non-zero, then the object will be marked for recycling, and all
+copies of it will be removed from all active caches in which it is present.
+Not only that but all child objects will also be retired.
+
+If retire is zero, then the object may be available again when next the
+acquisition function is called. Retirement here will overrule the pinning on a
+cookie.
+
+One very important note - relinquish must NOT be called for a cookie unless all
+the cookies for "child" indices, objects and pages have been relinquished
+first.
+
+
+================================
+INDEX AND DATA FILE INVALIDATION
+================================
+
+There is no direct way to invalidate an index subtree or a data file. To do
+this, the caller should relinquish and retire the cookie they have, and then
+acquire a new one.
+
+
+===========================
+FS-CACHE SPECIFIC PAGE FLAG
+===========================
+
+FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is
+given the alternative name PG_fscache.
+
+PG_fscache is used to indicate that the page is known by the cache, and that
+the cache must be informed if the page is going to go away. It's an indication
+to the netfs that the cache has an interest in this page, where an interest may
+be a pointer to it, resources allocated or reserved for it, or I/O in progress
+upon it.
+
+The netfs can use this information in methods such as releasepage() to
+determine whether it needs to uncache a page or update it.
+
+Furthermore, if this bit is set, releasepage() and invalidatepage() operations
+will be called on a page to get rid of it, even if PG_private is not set. This
+allows caching to be attempted on a page before read_cache_pages() is called
+after fscache_read_or_alloc_pages() as the former will try and release pages it
+was given under certain circumstances.
+
+This bit does not overlap with bits such as PG_private. This means that
+FS-Cache can be used with a filesystem that uses the block buffering code.
+
+There are a number of operations defined on this flag:
+
+ int PageFsCache(struct page *page);
+ void SetPageFsCache(struct page *page);
+ void ClearPageFsCache(struct page *page);
+ int TestSetPageFsCache(struct page *page);
+ int TestClearPageFsCache(struct page *page);
+
+These functions are bit test, bit set, bit clear, bit test and set and bit
+test and clear operations on PG_fscache.
diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt
new file mode 100644
index 00000000000..58313348da8
--- /dev/null
+++ b/Documentation/filesystems/caching/object.txt
@@ -0,0 +1,313 @@
+ ====================================================
+ IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT
+ ====================================================
+
+By: David Howells <dhowells@redhat.com>
+
+Contents:
+
+ (*) Representation
+
+ (*) Object management state machine.
+
+ - Provision of cpu time.
+ - Locking simplification.
+
+ (*) The set of states.
+
+ (*) The set of events.
+
+
+==============
+REPRESENTATION
+==============
+
+FS-Cache maintains an in-kernel representation of each object that a netfs is
+currently interested in. Such objects are represented by the fscache_cookie
+struct and are referred to as cookies.
+
+FS-Cache also maintains a separate in-kernel representation of the objects that
+a cache backend is currently actively caching. Such objects are represented by
+the fscache_object struct. The cache backends allocate these upon request, and
+are expected to embed them in their own representations. These are referred to
+as objects.
+
+There is a 1:N relationship between cookies and objects. A cookie may be
+represented by multiple objects - an index may exist in more than one cache -
+or even by no objects (it may not be cached).
+
+Furthermore, both cookies and objects are hierarchical. The two hierarchies
+correspond, but the cookies tree is a superset of the union of the object trees
+of multiple caches:
+
+ NETFS INDEX TREE : CACHE 1 : CACHE 2
+ : :
+ : +-----------+ :
+ +----------->| IObject | :
+ +-----------+ | : +-----------+ :
+ | ICookie |-------+ : | :
+ +-----------+ | : | : +-----------+
+ | +------------------------------>| IObject |
+ | : | : +-----------+
+ | : V : |
+ | : +-----------+ : |
+ V +----------->| IObject | : |
+ +-----------+ | : +-----------+ : |
+ | ICookie |-------+ : | : V
+ +-----------+ | : | : +-----------+
+ | +------------------------------>| IObject |
+ +-----+-----+ : | : +-----------+
+ | | : | : |
+ V | : V : |
+ +-----------+ | : +-----------+ : |
+ | ICookie |------------------------->| IObject | : |
+ +-----------+ | : +-----------+ : |
+ | V : | : V
+ | +-----------+ : | : +-----------+
+ | | ICookie |-------------------------------->| IObject |
+ | +-----------+ : | : +-----------+
+ V | : V : |
+ +-----------+ | : +-----------+ : |
+ | DCookie |------------------------->| DObject | : |
+ +-----------+ | : +-----------+ : |
+ | : : |
+ +-------+-------+ : : |
+ | | : : |
+ V V : : V
+ +-----------+ +-----------+ : : +-----------+
+ | DCookie | | DCookie |------------------------>| DObject |
+ +-----------+ +-----------+ : : +-----------+
+ : :
+
+In the above illustration, ICookie and IObject represent indices and DCookie
+and DObject represent data storage objects. Indices may have representation in
+multiple caches, but currently, non-index objects may not. Objects of any type
+may also be entirely unrepresented.
+
+As far as the netfs API goes, the netfs is only actually permitted to see
+pointers to the cookies. The cookies themselves and any objects attached to
+those cookies are hidden from it.
+
+
+===============================
+OBJECT MANAGEMENT STATE MACHINE
+===============================
+
+Within FS-Cache, each active object is managed by its own individual state
+machine. The state for an object is kept in the fscache_object struct, in
+object->state. A cookie may point to a set of objects that are in different
+states.
+
+Each state has an action associated with it that is invoked when the machine
+wakes up in that state. There are four logical sets of states:
+
+ (1) Preparation: states that wait for the parent objects to become ready. The
+ representations are hierarchical, and it is expected that an object must
+ be created or accessed with respect to its parent object.
+
+ (2) Initialisation: states that perform lookups in the cache and validate
+ what's found and that create on disk any missing metadata.
+
+ (3) Normal running: states that allow netfs operations on objects to proceed
+ and that update the state of objects.
+
+ (4) Termination: states that detach objects from their netfs cookies, that
+ delete objects from disk, that handle disk and system errors and that free
+ up in-memory resources.
+
+
+In most cases, transitioning between states is in response to signalled events.
+When a state has finished processing, it will usually set the mask of events in
+which it is interested (object->event_mask) and relinquish the worker thread.
+Then when an event is raised (by calling fscache_raise_event()), if the event
+is not masked, the object will be queued for processing (by calling
+fscache_enqueue_object()).
+
+
+PROVISION OF CPU TIME
+---------------------
+
+The work to be done by the various states was given CPU time by the threads of
+the slow work facility. This was used in preference to the workqueue facility
+because:
+
+ (1) Threads may be completely occupied for very long periods of time by a
+ particular work item. These state actions may be doing sequences of
+ synchronous, journalled disk accesses (lookup, mkdir, create, setxattr,
+ getxattr, truncate, unlink, rmdir, rename).
+
+ (2) Threads may do little actual work, but may rather spend a lot of time
+ sleeping on I/O. This means that single-threaded and 1-per-CPU-threaded
+ workqueues don't necessarily have the right numbers of threads.
+
+
+LOCKING SIMPLIFICATION
+----------------------
+
+Because only one worker thread may be operating on any particular object's
+state machine at once, this simplifies the locking, particularly with respect
+to disconnecting the netfs's representation of a cache object (fscache_cookie)
+from the cache backend's representation (fscache_object) - which may be
+requested from either end.
+
+
+=================
+THE SET OF STATES
+=================
+
+The object state machine has a set of states that it can be in. There are
+preparation states in which the object sets itself up and waits for its parent
+object to transit to a state that allows access to its children:
+
+ (1) State FSCACHE_OBJECT_INIT.
+
+ Initialise the object and wait for the parent object to become active. In
+ the cache, it is expected that it will not be possible to look an object
+ up from the parent object, until that parent object itself has been looked
+ up.
+
+There are initialisation states in which the object sets itself up and accesses
+disk for the object metadata:
+
+ (2) State FSCACHE_OBJECT_LOOKING_UP.
+
+ Look up the object on disk, using the parent as a starting point.
+ FS-Cache expects the cache backend to probe the cache to see whether this
+ object is represented there, and if it is, to see if it's valid (coherency
+ management).
+
+ The cache should call fscache_object_lookup_negative() to indicate lookup
+ failure for whatever reason, and should call fscache_obtained_object() to
+ indicate success.
+
+ At the completion of lookup, FS-Cache will let the netfs go ahead with
+ read operations, no matter whether the file is yet cached. If not yet
+ cached, read operations will be immediately rejected with ENODATA until
+ the first known page is uncached - as up to that point there can be no
+ data to be read out of the cache for that file that isn't currently also
+ held in the pagecache.
+
+ (3) State FSCACHE_OBJECT_CREATING.
+
+ Create an object on disk, using the parent as a starting point. This
+ happens if the lookup failed to find the object, or if the object's
+ coherency data indicated what's on disk is out of date. In this state,
+ FS-Cache expects the cache to create the object.
+
+ The cache should call fscache_obtained_object() if creation completes
+ successfully, fscache_object_lookup_negative() otherwise.
+
+ At the completion of creation, FS-Cache will start processing write
+ operations the netfs has queued for an object. If creation failed, the
+ write ops will be transparently discarded, and nothing recorded in the
+ cache.
+
+There are some normal running states in which the object spends its time
+servicing netfs requests:
+
+ (4) State FSCACHE_OBJECT_AVAILABLE.
+
+ A transient state in which pending operations are started, child objects
+ are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary
+ lookup data is freed.
+
+ (5) State FSCACHE_OBJECT_ACTIVE.
+
+ The normal running state. In this state, requests the netfs makes will be
+ passed on to the cache.
+
+ (6) State FSCACHE_OBJECT_UPDATING.
+
+ The state machine comes here to update the object in the cache from the
+ netfs's records. This involves updating the auxiliary data that is used
+ to maintain coherency.
+
+And there are terminal states in which an object cleans itself up, deallocates
+memory and potentially deletes stuff from disk:
+
+ (7) State FSCACHE_OBJECT_LC_DYING.
+
+ The object comes here if it is dying because of a lookup or creation
+ error. This would be due to a disk error or system error of some sort.
+ Temporary data is cleaned up, and the parent is released.
+
+ (8) State FSCACHE_OBJECT_DYING.
+
+ The object comes here if it is dying due to an error, because its parent
+ cookie has been relinquished by the netfs or because the cache is being
+ withdrawn.
+
+ Any child objects waiting on this one are given CPU time so that they too
+ can destroy themselves. This object waits for all its children to go away
+ before advancing to the next state.
+
+ (9) State FSCACHE_OBJECT_ABORT_INIT.
+
+ The object comes to this state if it was waiting on its parent in
+ FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself
+ so that the parent may proceed from the FSCACHE_OBJECT_DYING state.
+
+(10) State FSCACHE_OBJECT_RELEASING.
+(11) State FSCACHE_OBJECT_RECYCLING.
+
+ The object comes to one of these two states when dying once it is rid of
+ all its children, if it is dying because the netfs relinquished its
+ cookie. In the first state, the cached data is expected to persist, and
+ in the second it will be deleted.
+
+(12) State FSCACHE_OBJECT_WITHDRAWING.
+
+ The object transits to this state if the cache decides it wants to
+ withdraw the object from service, perhaps to make space, but also due to
+ error or just because the whole cache is being withdrawn.
+
+(13) State FSCACHE_OBJECT_DEAD.
+
+ The object transits to this state when the in-memory object record is
+ ready to be deleted. The object processor shouldn't ever see an object in
+ this state.
+
+
+THE SET OF EVENTS
+-----------------
+
+There are a number of events that can be raised to an object state machine:
+
+ (*) FSCACHE_OBJECT_EV_UPDATE
+
+ The netfs requested that an object be updated. The state machine will ask
+ the cache backend to update the object, and the cache backend will ask the
+ netfs for details of the change through its cookie definition ops.
+
+ (*) FSCACHE_OBJECT_EV_CLEARED
+
+ This is signalled in two circumstances:
+
+ (a) when an object's last child object is dropped and
+
+ (b) when the last operation outstanding on an object is completed.
+
+ This is used to proceed from the dying state.
+
+ (*) FSCACHE_OBJECT_EV_ERROR
+
+ This is signalled when an I/O error occurs during the processing of some
+ object.
+
+ (*) FSCACHE_OBJECT_EV_RELEASE
+ (*) FSCACHE_OBJECT_EV_RETIRE
+
+ These are signalled when the netfs relinquishes a cookie it was using.
+ The event selected depends on whether the netfs asks for the backing
+ object to be retired (deleted) or retained.
+
+ (*) FSCACHE_OBJECT_EV_WITHDRAW
+
+ This is signalled when the cache backend wants to withdraw an object.
+ This means that the object will have to be detached from the netfs's
+ cookie.
+
+Because the withdrawing and releasing/retiring events are all handled by the
+object state machine, it doesn't matter if there's a collision with both ends
+trying to sever the connection at the same time. The state machine can just
+pick which one it wants to honour, and that effects the other.
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt
new file mode 100644
index 00000000000..b6b070c57cb
--- /dev/null
+++ b/Documentation/filesystems/caching/operations.txt
@@ -0,0 +1,213 @@
+ ================================
+ ASYNCHRONOUS OPERATIONS HANDLING
+ ================================
+
+By: David Howells <dhowells@redhat.com>
+
+Contents:
+
+ (*) Overview.
+
+ (*) Operation record initialisation.
+
+ (*) Parameters.
+
+ (*) Procedure.
+
+ (*) Asynchronous callback.
+
+
+========
+OVERVIEW
+========
+
+FS-Cache has an asynchronous operations handling facility that it uses for its
+data storage and retrieval routines. Its operations are represented by
+fscache_operation structs, though these are usually embedded into some other
+structure.
+
+This facility is available to and expected to be used by the cache backends,
+and FS-Cache will create operations and pass them off to the appropriate cache
+backend for completion.
+
+To make use of this facility, <linux/fscache-cache.h> should be #included.
+
+
+===============================
+OPERATION RECORD INITIALISATION
+===============================
+
+An operation is recorded in an fscache_operation struct:
+
+ struct fscache_operation {
+ union {
+ struct work_struct fast_work;
+ struct slow_work slow_work;
+ };
+ unsigned long flags;
+ fscache_operation_processor_t processor;
+ ...
+ };
+
+Someone wanting to issue an operation should allocate something with this
+struct embedded in it. They should initialise it by calling:
+
+ void fscache_operation_init(struct fscache_operation *op,
+ fscache_operation_release_t release);
+
+with the operation to be initialised and the release function to use.
+
+The op->flags parameter should be set to indicate the CPU time provision and
+the exclusivity (see the Parameters section).
+
+The op->fast_work, op->slow_work and op->processor fields should be set as
+appropriate for the CPU time provision (see the Parameters section).
+
+FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the
+operation and waited for afterwards.
+
+
+==========
+PARAMETERS
+==========
+
+There are a number of parameters that can be set in the operation record's flag
+parameter. There are three options for the provision of CPU time in these
+operations:
+
+ (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD). A thread
+ may decide it wants to handle an operation itself without deferring it to
+ another thread.
+
+ This is, for example, used in read operations for calling readpages() on
+ the backing filesystem in CacheFiles. Although readpages() does an
+ asynchronous data fetch, the determination of whether pages exist is done
+ synchronously - and the netfs does not proceed until this has been
+ determined.
+
+ If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags
+ before submitting the operation, and the operating thread must wait for it
+ to be cleared before proceeding:
+
+ wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+
+ (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it
+ will be given to keventd to process. Such an operation is not permitted
+ to sleep on I/O.
+
+ This is, for example, used by CacheFiles to copy data from a backing fs
+ page to a netfs page after the backing fs has read the page in.
+
+ If this option is used, op->fast_work and op->processor must be
+ initialised before submitting the operation:
+
+ INIT_WORK(&op->fast_work, do_some_work);
+
+
+ (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it
+ will be given to the slow work facility to process. Such an operation is
+ permitted to sleep on I/O.
+
+ This is, for example, used by FS-Cache to handle background writes of
+ pages that have just been fetched from a remote server.
+
+ If this option is used, op->slow_work and op->processor must be
+ initialised before submitting the operation:
+
+ fscache_operation_init_slow(op, processor)
+
+
+Furthermore, operations may be one of two types:
+
+ (1) Exclusive (FSCACHE_OP_EXCLUSIVE). Operations of this type may not run in
+ conjunction with any other operation on the object being operated upon.
+
+ An example of this is the attribute change operation, in which the file
+ being written to may need truncation.
+
+ (2) Shareable. Operations of this type may be running simultaneously. It's
+ up to the operation implementation to prevent interference between other
+ operations running at the same time.
+
+
+=========
+PROCEDURE
+=========
+
+Operations are used through the following procedure:
+
+ (1) The submitting thread must allocate the operation and initialise it
+ itself. Normally this would be part of a more specific structure with the
+ generic op embedded within.
+
+ (2) The submitting thread must then submit the operation for processing using
+ one of the following two functions:
+
+ int fscache_submit_op(struct fscache_object *object,
+ struct fscache_operation *op);
+
+ int fscache_submit_exclusive_op(struct fscache_object *object,
+ struct fscache_operation *op);
+
+ The first function should be used to submit non-exclusive ops and the
+ second to submit exclusive ones. The caller must still set the
+ FSCACHE_OP_EXCLUSIVE flag.
+
+ If successful, both functions will assign the operation to the specified
+ object and return 0. -ENOBUFS will be returned if the object specified is
+ permanently unavailable.
+
+ The operation manager will defer operations on an object that is still
+ undergoing lookup or creation. The operation will also be deferred if an
+ operation of conflicting exclusivity is in progress on the object.
+
+ If the operation is asynchronous, the manager will retain a reference to
+ it, so the caller should put their reference to it by passing it to:
+
+ void fscache_put_operation(struct fscache_operation *op);
+
+ (3) If the submitting thread wants to do the work itself, and has marked the
+ operation with FSCACHE_OP_MYTHREAD, then it should monitor
+ FSCACHE_OP_WAITING as described above and check the state of the object if
+ necessary (the object might have died whilst the thread was waiting).
+
+ When it has finished doing its processing, it should call
+ fscache_put_operation() on it.
+
+ (4) The operation holds an effective lock upon the object, preventing other
+ exclusive ops conflicting until it is released. The operation can be
+ enqueued for further immediate asynchronous processing by adjusting the
+ CPU time provisioning option if necessary, eg:
+
+ op->flags &= ~FSCACHE_OP_TYPE;
+ op->flags |= FSCACHE_OP_FAST;
+
+ and calling:
+
+ void fscache_enqueue_operation(struct fscache_operation *op)
+
+ This can be used to allow other things to have use of the worker thread
+ pools.
+
+
+=====================
+ASYNCHRONOUS CALLBACK
+=====================
+
+When used in asynchronous mode, the worker thread pool will invoke the
+processor method with a pointer to the operation. This should then get at the
+container struct by using container_of():
+
+ static void fscache_write_op(struct fscache_operation *_op)
+ {
+ struct fscache_storage *op =
+ container_of(_op, struct fscache_storage, op);
+ ...
+ }
+
+The caller holds a reference on the operation, and will invoke
+fscache_put_operation() when the processor function returns. The processor
+function is at liberty to call fscache_enqueue_operation() or to take extra
+references.
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 00000000000..d6030aa3337
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,148 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability. No single point of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre. Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons. Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.). File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs. When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures. The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads. In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation. The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator to carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes a flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system. Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes. That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes. This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects. (However, if the monitor you specify
+happens to be down, the mount won't succeed.) The port can be left
+off if the monitor is using the default. So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient. If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+ ip=A.B.C.D[:N]
+ Specify the IP and/or port the client should bind to locally.
+ There is normally not much reason to do this. If the IP is not
+ specified, the client's IP address is determined by looking at the
+ address its connection to the monitor originates from.
+
+ wsize=X
+ Specify the maximum write size in bytes. By default there is no
+ maximum. Ceph will normally size writes based on the file stripe
+ size.
+
+ rsize=X
+ Specify the maximum readahead.
+
+ mount_timeout=X
+ Specify the timeout value for mount (in seconds), in the case
+ of a non-responsive Ceph file system. The default is 30
+ seconds.
+
+ rbytes
+ When stat() is called on a directory, set st_size to 'rbytes',
+ the summation of file sizes over all files nested beneath that
+ directory. This is the default.
+
+ norbytes
+ When stat() is called on a directory, set st_size to the
+ number of entries in that directory.
+
+ nocrc
+ Disable CRC32C calculation for data writes. If set, the storage node
+ must rely on TCP's error correction to detect data corruption
+ in the data payload.
+
+ dcache
+ Use the dcache contents to perform negative lookups and
+ readdir when the client has the entire directory contents in
+ its cache. (This does not change correctness; the client uses
+ cached metadata only when a lease or capability ensures it is
+ valid.)
+
+ nodcache
+ Do not use the dcache as above. This avoids a significant amount of
+ complex code, sacrificing performance without affecting correctness,
+ and is useful for tracking down bugs.
+
+ noasyncreaddir
+ Do not use the dcache as above for readdir.
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+ http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+ git://ceph.newdream.net/git/ceph-client.git
+ git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+
+and the source for the full system is at
+ git://ceph.newdream.net/git/ceph.git
diff --git a/Documentation/filesystems/cifs.txt b/Documentation/filesystems/cifs.txt
new file mode 100644
index 00000000000..49cc923a93e
--- /dev/null
+++ b/Documentation/filesystems/cifs.txt
@@ -0,0 +1,51 @@
+ This is the client VFS module for the Common Internet File System
+ (CIFS) protocol which is the successor to the Server Message Block
+ (SMB) protocol, the native file sharing mechanism for most early
+ PC operating systems. CIFS is fully supported by current network
+ file servers such as Windows 2000, Windows 2003 (including
+ Windows XP) as well as by Samba (which provides excellent CIFS
+ server support for Linux and many other operating systems), so
+ this network filesystem client can mount to a wide variety of
+ servers. The smbfs module should be used instead of this cifs module
+ for mounting to older SMB servers such as OS/2. The smbfs and cifs
+ modules can coexist and do not conflict. The CIFS VFS filesystem
+ module is designed to work well with servers that implement the
+ newer versions (dialects) of the SMB/CIFS protocol such as Samba,
+ the program written by Andrew Tridgell that turns any Unix host
+ into a SMB/CIFS file server.
+
+ The intent of this module is to provide the most advanced network
+ file system function for CIFS compliant servers, including better
+ POSIX compliance, secure per-user session establishment, high
+ performance safe distributed caching (oplock), optional packet
+ signing, large files, Unicode support and other internationalization
+ improvements. Since both Samba server and this filesystem client support
+ the CIFS Unix extensions, the combination can provide a reasonable
+ alternative to NFSv4 for fileserving in some Linux to Linux environments,
+ not just in Linux to Windows environments.
+
+ This filesystem has an optional mount utility (mount.cifs) that can
+ be obtained from the project page and installed in the path in the same
+ directory with the other mount helpers (such as mount.smbfs).
+ Mounting using the cifs filesystem without installing the mount helper
+ requires specifying the server's ip address.
+
+ For Linux 2.4:
+ mount //anything/here /mnt_target -o
+ user=username,pass=password,unc=//ip_address_of_server/sharename
+
+ For Linux 2.5:
+ mount //ip_address_of_server/sharename /mnt_target -o user=username,pass=password
+
+
+ For more information on the module see the project page at
+
+ http://us1.samba.org/samba/Linux_CIFS_client.html
+
+ For more information on CIFS see:
+
+ http://www.snia.org/tech_activities/CIFS
+
+ or the Samba site:
+
+ http://www.samba.org
diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
new file mode 100644
index 00000000000..61311356025
--- /dev/null
+++ b/Documentation/filesystems/coda.txt
@@ -0,0 +1,1673 @@
+NOTE:
+This is one of the technical documents describing a component of
+Coda -- this document describes the client kernel-Venus interface.
+
+For more information:
+ http://www.coda.cs.cmu.edu
+For user level software needed to run Coda:
+ ftp://ftp.coda.cs.cmu.edu
+
+To run Coda you need to get a user level cache manager for the client,
+named Venus, as well as tools to manipulate ACLs, to log in, etc. The
+client needs to have the Coda filesystem selected in the kernel
+configuration.
+
+The server needs a user level server and at present does not depend on
+kernel support.
+
+
+
+
+
+
+
+ The Venus kernel interface
+ Peter J. Braam
+ v1.0, Nov 9, 1997
+
+ This document describes the communication between Venus and kernel
+ level filesystem code needed for the operation of the Coda file sys-
+ tem. This document version is meant to describe the current interface
+ (version 1.0) as well as improvements we envisage.
+ ______________________________________________________________________
+
+ Table of Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1. Introduction
+
+ 2. Servicing Coda filesystem calls
+
+ 3. The message layer
+
+ 3.1 Implementation details
+
+ 4. The interface at the call level
+
+ 4.1 Data structures shared by the kernel and Venus
+ 4.2 The pioctl interface
+ 4.3 root
+ 4.4 lookup
+ 4.5 getattr
+ 4.6 setattr
+ 4.7 access
+ 4.8 create
+ 4.9 mkdir
+ 4.10 link
+ 4.11 symlink
+ 4.12 remove
+ 4.13 rmdir
+ 4.14 readlink
+ 4.15 open
+ 4.16 close
+ 4.17 ioctl
+ 4.18 rename
+ 4.19 readdir
+ 4.20 vget
+ 4.21 fsync
+ 4.22 inactive
+ 4.23 rdwr
+ 4.24 odymount
+ 4.25 ody_lookup
+ 4.26 ody_expand
+ 4.27 prefetch
+ 4.28 signal
+
+ 5. The minicache and downcalls
+
+ 5.1 INVALIDATE
+ 5.2 FLUSH
+ 5.3 PURGEUSER
+ 5.4 ZAPFILE
+ 5.5 ZAPDIR
+ 5.6 ZAPVNODE
+ 5.7 PURGEFID
+ 5.8 REPLACE
+
+ 6. Initialization and cleanup
+
+ 6.1 Requirements
+
+
+ ______________________________________________________________________
+
+
+ 1. Introduction
+
+
+
+ A key component in the Coda Distributed File System is the cache
+ manager, Venus.
+
+
+ When processes on a Coda enabled system access files in the Coda
+ filesystem, requests are directed at the filesystem layer in the
+ operating system. The operating system will communicate with Venus to
+ service the request for the process. Venus manages a persistent
+ client cache and makes remote procedure calls to Coda file servers and
+ related servers (such as authentication servers) to service these
+ requests it receives from the operating system. When Venus has
+ serviced a request it replies to the operating system with appropriate
+ return codes, and other data related to the request. Optionally the
+ kernel support for Coda may maintain a minicache of recently processed
+ requests to limit the number of interactions with Venus. Venus
+ possesses the facility to inform the kernel when elements from its
+ minicache are no longer valid.
+
+ This document describes precisely this communication between the
+ kernel and Venus. The definitions of so called upcalls and downcalls
+ will be given with the format of the data they handle. We shall also
+ describe the semantic invariants resulting from the calls.
+
+ Historically Coda was implemented in a BSD file system in Mach 2.6.
+ The interface between the kernel and Venus is very similar to the BSD
+ VFS interface. Similar functionality is provided, and the format of
+ the parameters and returned data is very similar to the BSD VFS. This
+ leads to an almost natural environment for implementing a kernel-level
+ filesystem driver for Coda in a BSD system. However, other operating
+ systems such as Linux and Windows 95 and NT have virtual filesystems
+ with different interfaces.
+
+ To implement Coda on these systems some reverse engineering of the
+ Venus/Kernel protocol is necessary. Also it came to light that other
+ systems could profit significantly from certain small optimizations
+ and modifications to the protocol. To facilitate this work as well as
+ to make future ports easier, communication between Venus and the
+ kernel should be documented in great detail. This is the aim of this
+ document.
+
+
+
+ 2. Servicing Coda filesystem calls
+
+ The service of a request for a Coda file system service originates in
+ a process P which is accessing a Coda file. It makes a system call which
+ traps to the OS kernel. Examples of such calls trapping to the kernel
+ are read, write, open, close, create, mkdir, rmdir, chmod in a Unix
+ context. Similar calls exist in the Win32 environment, with names such
+ as CreateFile.
+
+ Generally the operating system handles the request in a virtual
+ filesystem (VFS) layer, which is named I/O Manager in NT and IFS
+ manager in Windows 95. The VFS is responsible for partial processing
+ of the request and for locating the specific filesystem(s) which will
+ service parts of the request. Usually the information in the path
+ assists in locating the correct FS drivers. Sometimes after extensive
+ pre-processing, the VFS starts invoking exported routines in the FS
+ driver. This is the point where the FS specific processing of the
+ request starts, and here the Coda specific kernel code comes into
+ play.
+
+ The FS layer for Coda must expose and implement several interfaces.
+ First and foremost the VFS must be able to make all necessary calls to
+ the Coda FS layer, so the Coda FS driver must expose the VFS interface
+ as applicable in the operating system. These differ very significantly
+ among operating systems, but share features such as facilities to
+ read/write and create and remove objects. The Coda FS layer services
+ such VFS requests by invoking one or more well defined services
+ offered by the cache manager Venus. When the replies from Venus have
+ come back to the FS driver, servicing of the VFS call continues and
+ finishes with a reply to the kernel's VFS. Finally the VFS layer
+ returns to the process.
+
+ As a result of this design a basic interface exposed by the FS driver
+ must allow Venus to manage message traffic. In particular Venus must
+ be able to retrieve and place messages and to be notified of the
+ arrival of a new message. The notification must be through a mechanism
+ which does not block Venus since Venus must attend to other tasks even
+ when no messages are waiting or being processed.
+
+
+
+
+
+
+ Interfaces of the Coda FS Driver
+
+ Furthermore the FS layer provides for a special path of communication
+ between a user process and Venus, called the pioctl interface. The
+ pioctl interface is used for Coda specific services, such as
+ requesting detailed information about the persistent cache managed by
+ Venus. Here the involvement of the kernel is minimal. It identifies
+ the calling process and passes the information on to Venus. When
+ Venus replies the response is passed back to the caller in unmodified
+ form.
+
+ Finally Venus allows the kernel FS driver to cache the results from
+ certain services. This is done to avoid excessive context switches
+ and results in an efficient system. However, Venus may acquire
+ information, for example from the network which implies that cached
+ information must be flushed or replaced. Venus then makes a downcall
+ to the Coda FS layer to request flushes or updates in the cache. The
+ kernel FS driver handles such requests synchronously.
+
+ Among these interfaces the VFS interface and the facility to place,
+ receive and be notified of messages are platform specific. We will
+ not go into the calls exported to the VFS layer but we will state the
+ requirements of the message exchange mechanism.
+
+
+
+ 3. The message layer
+
+
+
+ At the lowest level the communication between Venus and the FS driver
+ proceeds through messages. The synchronization between processes
+ requesting Coda file service and Venus relies on blocking and waking
+ up processes. The Coda FS driver processes VFS- and pioctl-requests
+ on behalf of a process P, creates messages for Venus, awaits replies
+ and finally returns to the caller. The implementation of the exchange
+ of messages is platform specific, but the semantics have (so far)
+ appeared to be generally applicable. Data buffers are created by the
+ FS Driver in kernel memory on behalf of P and copied to user memory in
+ Venus.
+
+ The FS Driver while servicing P makes upcalls to Venus. Such an
+ upcall is dispatched to Venus by creating a message structure. The
+ structure contains the identification of P, the message sequence
+ number, the size of the request and a pointer to the data in kernel
+ memory for the request. Since the data buffer is re-used to hold the
+ reply from Venus, there is a field for the size of the reply. A flags
+ field is used in the message to precisely record the status of the
+ message. Additional platform dependent structures involve pointers to
+ determine the position of the message on queues and pointers to
+ synchronization objects. In the upcall routine the message structure
+ is filled in, flags are set to 0, and it is placed on the pending
+ queue. The routine calling upcall is responsible for allocating the
+ data buffer; its structure will be described in the next section.
+
+ A facility must exist to notify Venus that the message has been
+ created, and implemented using available synchronization objects in
+ the OS. This notification is done in the upcall context of the process
+ P. When the message is on the pending queue, process P cannot proceed
+ in upcall. The (kernel mode) processing of P in the filesystem
+ request routine must be suspended until Venus has replied. Therefore
+ the calling thread in P is blocked in upcall. A pointer in the
+ message structure will locate the synchronization object on which P is
+ sleeping.
+
+ Venus detects the notification that a message has arrived, and the FS
+ driver allows Venus to retrieve the message with a getmsg_from_kernel
+ call. This action finishes in the kernel by putting the message on the
+ queue of processing messages and setting flags to READ. Venus is
+ passed the contents of the data buffer. The getmsg_from_kernel call
+ now returns and Venus processes the request.
+
+ At some later point the FS driver receives a message from Venus,
+ namely when Venus calls sendmsg_to_kernel. At this moment the Coda FS
+ driver looks at the contents of the message and decides if:
+
+
+ * The message is a reply for a suspended thread P. If so it removes
+ the message from the processing queue and marks the message as
+ WRITTEN. Finally, the FS driver unblocks P (still in the kernel
+ mode context of Venus) and the sendmsg_to_kernel call returns to
+ Venus. The process P will be scheduled at some point and continues
+ processing its upcall with the data buffer replaced with the reply
+ from Venus.
+
+ * The message is a downcall. A downcall is a request from Venus to
+ the FS Driver. The FS driver processes the request immediately
+ (usually a cache eviction or replacement) and when it finishes
+ sendmsg_to_kernel returns.
+
+ Now P awakes and continues processing upcall. There are some
+ subtleties to take account of. First P will determine if it was woken
+ up in upcall by a signal from some other source (for example an
+ attempt to terminate P) or as is normally the case by Venus in its
+ sendmsg_to_kernel call. In the normal case, the upcall routine will
+ deallocate the message structure and return. The FS routine can proceed
+ with its processing.
+
+
+
+
+
+
+
+ Sleeping and IPC arrangements
+
+ In case P is woken up by a signal and not by Venus, it will first look
+ at the flags field. If the message is not yet READ, the process P can
+ handle its signal without notifying Venus. If Venus has READ, and
+ the request should not be processed, P can send Venus a signal message
+ to indicate that it should disregard the previous message. Such
+ signals are put in the queue at the head, and read first by Venus. If
+ the message is already marked as WRITTEN it is too late to stop the
+ processing. The VFS routine will now continue. (-- If a VFS request
+ involves more than one upcall, this can lead to complicated state, an
+ extra field "handle_signals" could be added in the message structure
+ to indicate points of no return have been passed.--)
+
+
+
+ 3.1. Implementation details
+
+ The Unix implementation of this mechanism has been through the
+ implementation of a character device associated with Coda. Venus
+ retrieves messages by doing a read on the device, replies are sent
+ with a write and notification is through the select system call on the
+ file descriptor for the device. The process P is kept waiting on an
+ interruptible wait queue object.
+
+ In Windows NT and the DPMI Windows 95 implementation a DeviceIoControl
+ call is used. The DeviceIoControl call is designed to copy buffers
+ from user memory to kernel memory with OPCODES. The sendmsg_to_kernel
+ is issued as a synchronous call, while the getmsg_from_kernel call is
+ asynchronous. Windows EventObjects are used for notification of
+ message arrival. The process P is kept waiting on a KernelEvent
+ object in NT and a semaphore in Windows 95.
+
+
+
+ 4. The interface at the call level
+
+
+ This section describes the upcalls a Coda FS driver can make to Venus.
+ Each of these upcalls makes use of two structures: inputArgs and
+ outputArgs. In pseudo BNF form the structures take the following
+ form:
+
+
+ struct inputArgs {
+ u_long opcode;
+ u_long unique; /* Keep multiple outstanding msgs distinct */
+ u_short pid; /* Common to all */
+ u_short pgid; /* Common to all */
+ struct CodaCred cred; /* Common to all */
+
+ <union "in" of call dependent parts of inputArgs>
+ };
+
+ struct outputArgs {
+ u_long opcode;
+ u_long unique; /* Keep multiple outstanding msgs distinct */
+ u_long result;
+
+ <union "out" of call dependent parts of outputArgs>
+ };
+
+
+
+ Before going on let us elucidate the role of the various fields. The
+ inputArgs start with the opcode which defines the type of service
+ requested from Venus. There are approximately 30 upcalls at present
+ which we will discuss. The unique field labels the inputArg with a
+ unique number which will identify the message uniquely. A process and
+ process group id are passed. Finally the credentials of the caller
+ are included.
+
+ Before delving into the specific calls we need to discuss a variety of
+ data structures shared by the kernel and Venus.
+
+
+
+
+ 4.1. Data structures shared by the kernel and Venus
+
+
+ The CodaCred structure defines a variety of user and group ids as
+ they are set for the calling process. The vuid_t and vgid_t are 32 bit
+ unsigned integers. It also defines group membership in an array. On
+ Unix the CodaCred has proven sufficient to implement good security
+ semantics for Coda but the structure may have to undergo modification
+ for the Windows environments when these mature.
+
+ struct CodaCred {
+ vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, effective, set, fs uid*/
+ vgid_t cr_gid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */
+ vgid_t cr_groups[NGROUPS]; /* Group membership for caller */
+ };
+
+
+
+ NOTE: It is questionable if we need CodaCreds in Venus. Finally Venus
+ doesn't know about groups, although it does create files with the
+ default uid/gid. Perhaps the list of group membership is superfluous.
+
+
+ The next item is the fundamental identifier used to identify Coda
+ files, the ViceFid. A fid of a file uniquely defines a file or
+ directory in the Coda filesystem within a cell. (-- A cell is a
+ group of Coda servers acting under the aegis of a single system
+ control machine or SCM. See the Coda Administration manual for a
+ detailed description of the role of the SCM.--)
+
+
+ typedef struct ViceFid {
+ VolumeId Volume;
+ VnodeId Vnode;
+ Unique_t Unique;
+ } ViceFid;
+
+
+
+ Each of the constituent fields: VolumeId, VnodeId and Unique_t are
+ unsigned 32 bit integers. We envisage that a further field will need
+ to be prefixed to identify the Coda cell; this will probably take the
+ form of an IPv6-size IP address naming the Coda cell through DNS.
+
+ The next important structure shared between Venus and the kernel is
+ the attributes of the file. The following structure is used to
+ exchange information. It has room for future extensions such as
+ support for device files (currently not present in Coda).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ struct coda_vattr {
+ enum coda_vtype va_type; /* vnode type (for create) */
+ u_short va_mode; /* files access mode and type */
+ short va_nlink; /* number of references to file */
+ vuid_t va_uid; /* owner user id */
+ vgid_t va_gid; /* owner group id */
+ long va_fsid; /* file system id (dev for now) */
+ long va_fileid; /* file id */
+ u_quad_t va_size; /* file size in bytes */
+ long va_blocksize; /* blocksize preferred for i/o */
+ struct timespec va_atime; /* time of last access */
+ struct timespec va_mtime; /* time of last modification */
+ struct timespec va_ctime; /* time file changed */
+ u_long va_gen; /* generation number of file */
+ u_long va_flags; /* flags defined for file */
+ dev_t va_rdev; /* device special file represents */
+ u_quad_t va_bytes; /* bytes of disk space held by file */
+ u_quad_t va_filerev; /* file modification number */
+ u_int va_vaflags; /* operations flags, see below */
+ long va_spare; /* remain quad aligned */
+ };
+
+
+
+
+ 4.2. The pioctl interface
+
+
+ Coda specific requests can be made by application through the pioctl
+ interface. The pioctl is implemented as an ordinary ioctl on a
+ fictitious file /coda/.CONTROL. The pioctl call opens this file, gets
+ a file handle and makes the ioctl call. Finally it closes the file.
+
+ The kernel involvement in this is limited to providing the facility to
+ open and close and pass the ioctl message and to verify that a path in
+ the pioctl data buffers is a file in a Coda filesystem.
+
+ The kernel is handed a data packet of the form:
+
+ struct {
+ const char *path;
+ struct ViceIoctl vidata;
+ int follow;
+ } data;
+
+
+
+ where
+
+
+ struct ViceIoctl {
+ caddr_t in, out; /* Data to be transferred in, or out */
+ short in_size; /* Size of input buffer <= 2K */
+ short out_size; /* Maximum size of output buffer, <= 2K */
+ };
+
+
+
+ The path must be a Coda file, otherwise the ioctl upcall will not be
+ made.
+
+ NNOOTTEE The data structures and code are a mess. We need to clean this
+ up.
+
+ We now proceed to document the individual calls:
+
+ 0wpage
+
+ 4.3. root
+
+
+ AArrgguummeennttss
+
+ iinn empty
+
+ oouutt
+
+ struct cfs_root_out {
+ ViceFid VFid;
+ } cfs_root;
+
+
+
+ DDeessccrriippttiioonn This call is made to Venus during the initialization of
+ the Coda filesystem. If the result is zero, the cfs_root structure
+ contains the ViceFid of the root of the Coda filesystem. If a non-zero
+ result is generated, its value is a platform dependent error code
+ indicating the difficulty Venus encountered in locating the root of
+ the Coda filesystem.
+
+ 0wpage
+
+ 4.4. lookup
+
+
+ SSuummmmaarryy Find the ViceFid and type of an object in a directory if it
+ exists.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_lookup_in {
+ ViceFid VFid;
+ char *name; /* Place holder for data. */
+ } cfs_lookup;
+
+
+
+ oouutt
+
+ struct cfs_lookup_out {
+ ViceFid VFid;
+ int vtype;
+ } cfs_lookup;
+
+
+
+ Description This call is made to determine the ViceFid and filetype of
+ a directory entry. The directory entry requested carries the name name
+ and Venus will search the directory identified by cfs_lookup_in.VFid.
+ The result may indicate that the name does not exist, or that
+ difficulty was encountered in finding it (e.g. due to disconnection).
+ If the result is zero, the field cfs_lookup_out.VFid contains the
+ target's ViceFid and cfs_lookup_out.vtype the coda_vtype giving the
+ type of object the name designates.
+
+ The name of the object is an 8 bit character string of maximum length
+ CFS_MAXNAMLEN, currently set to 256 (including a 0 terminator.)
+
+ It is extremely important to realize that Venus bitwise ors the field
+ cfs_lookup_out.vtype with CFS_NOCACHE to indicate that the object should
+ not be put in the kernel name cache.
+
+ NNOOTTEE The type of the vtype is currently wrong. It should be
+ coda_vtype. Linux does not take note of CFS_NOCACHE. It should.
+
+ 0wpage
+
+ 4.5. getattr
+
+
+ SSuummmmaarryy Get the attributes of a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_getattr_in {
+ ViceFid VFid;
+ struct coda_vattr attr; /* XXXXX */
+ } cfs_getattr;
+
+
+
+ oouutt
+
+ struct cfs_getattr_out {
+ struct coda_vattr attr;
+ } cfs_getattr;
+
+
+
+ DDeessccrriippttiioonn This call returns the attributes of the file identified by
+ fid.
+
+ Errors Errors can occur if the object with fid does not exist, is
+ inaccessible or if the caller does not have permission to fetch
+ attributes.
+
+ Note Many kernel FS drivers (Linux, NT and Windows 95) need to acquire
+ the attributes as well as the Fid for the instantiation of an internal
+ "inode" or "FileHandle". A significant improvement in performance on
+ such systems could be made by combining the lookup and getattr calls
+ both at the Venus/kernel interaction level and at the RPC level.
+
+ The vattr structure included in the input arguments is superfluous and
+ should be removed.
+
+ 0wpage
+
+ 4.6. setattr
+
+
+ SSuummmmaarryy Set the attributes of a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_setattr_in {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ } cfs_setattr;
+
+
+
+
+ oouutt
+ empty
+
+ Description The structure attr is filled with attributes to be changed
+ in BSD style. Attributes not to be changed are set to -1, apart from
+ vtype which is set to VNON. Others are set to the value to be assigned.
+ The only attributes which the FS driver may request to change are the
+ mode, owner, groupid, atime, mtime and ctime. The return value
+ indicates success or failure.
+
+ EErrrroorrss A variety of errors can occur. The object may not exist, may
+ be inaccessible, or permission may not be granted by Venus.
+
+ 0wpage
+
+ 4.7. access
+
+
+ SSuummmmaarryy
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_access_in {
+ ViceFid VFid;
+ int flags;
+ } cfs_access;
+
+
+
+ oouutt
+ empty
+
+ Description Verify if access to the object identified by VFid for
+ operations described by flags is permitted. The result indicates if
+ access will be granted. It is important to remember that Coda uses
+ ACLs to enforce protection and that ultimately the servers, not the
+ clients enforce the security of the system. The result of this call
+ will depend on whether a token is held by the user.
+
+ EErrrroorrss The object may not exist, or the ACL describing the protection
+ may not be accessible.
+
+ 0wpage
+
+ 4.8. create
+
+
+ SSuummmmaarryy Invoked to create a file
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_create_in {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ int excl;
+ int mode;
+ char *name; /* Place holder for data. */
+ } cfs_create;
+
+
+
+
+ oouutt
+
+ struct cfs_create_out {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ } cfs_create;
+
+
+
+ DDeessccrriippttiioonn This upcall is invoked to request creation of a file.
+ The file will be created in the directory identified by VFid, its name
+ will be name, and the mode will be mode. If excl is set an error will
+ be returned if the file already exists. If the size field in attr is
+ set to zero the file will be truncated. The uid and gid of the file
+ are set by converting the CodaCred to a uid using a macro CRTOUID
+ (this macro is platform dependent). Upon success the VFid and
+ attributes of the file are returned. The Coda FS Driver will normally
+ instantiate a vnode, inode or file handle at kernel level for the new
+ object.
+
+
+ EErrrroorrss A variety of errors can occur. Permissions may be insufficient.
+ If the object exists and is not a file the error EISDIR is returned
+ under Unix.
+
+ NNOOTTEE The packing of parameters is very inefficient and appears to
+ indicate confusion between the system call creat and the VFS operation
+ create. The VFS operation create is only called to create new objects.
+ This create call differs from the Unix one in that it is not invoked
+ to return a file descriptor. The truncate and exclusive options,
+ together with the mode, could simply be part of the mode as it is
+ under Unix. There should be no flags argument; this is used in open
+ (2) to return a file descriptor for READ or WRITE mode.
+
+ The attributes of the directory should be returned too, since the size
+ and mtime changed.
+
+ 0wpage
+
+ 4.9. mkdir
+
+
+ SSuummmmaarryy Create a new directory.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_mkdir_in {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ char *name; /* Place holder for data. */
+ } cfs_mkdir;
+
+
+
+ oouutt
+
+ struct cfs_mkdir_out {
+ ViceFid VFid;
+ struct coda_vattr attr;
+ } cfs_mkdir;
+
+
+
+
+ DDeessccrriippttiioonn This call is similar to create but creates a directory.
+ Only the mode field in the input parameters is used for creation.
+ Upon successful creation, the attr returned contains the attributes of
+ the new directory.
+
+ EErrrroorrss As for create.
+
+ NNOOTTEE The input parameter should be changed to mode instead of
+ attributes.
+
+ The attributes of the parent should be returned since the size and
+ mtime changes.
+
+ 0wpage
+
+ 4.10. link
+
+
+ SSuummmmaarryy Create a link to an existing file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_link_in {
+ ViceFid sourceFid; /* cnode to link *to* */
+ ViceFid destFid; /* Directory in which to place link */
+ char *tname; /* Place holder for data. */
+ } cfs_link;
+
+
+
+ oouutt
+ empty
+
+ Description This call creates a link to the sourceFid in the directory
+ identified by destFid with name tname. The source must reside in the
+ target's parent, i.e. the source must have parent destFid, i.e. Coda
+ does not support cross directory hard links. Only the return value is
+ relevant. It indicates success or the type of failure.
+
+ Errors The usual errors can occur.
+
+ 0wpage
+
+ 4.11. symlink
+
+
+ SSuummmmaarryy create a symbolic link
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_symlink_in {
+ ViceFid VFid; /* Directory to put symlink in */
+ char *srcname;
+ struct coda_vattr attr;
+ char *tname;
+ } cfs_symlink;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Create a symbolic link. The link is to be placed in the
+ directory identified by VFid and named tname. It should point to the
+ pathname srcname. The attributes of the newly created object are to
+ be set to attr.
+
+ EErrrroorrss
+
+ NNOOTTEE The attributes of the target directory should be returned since
+ its size changed.
+
+ 0wpage
+
+ 4.12. remove
+
+
+ SSuummmmaarryy Remove a file
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_remove_in {
+ ViceFid VFid;
+ char *name; /* Place holder for data. */
+ } cfs_remove;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Remove file named cfs_remove_in.name in directory
+ identified by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE The attributes of the directory should be returned since its
+ mtime and size may change.
+
+ 0wpage
+
+ 4.13. rmdir
+
+
+ SSuummmmaarryy Remove a directory
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_rmdir_in {
+ ViceFid VFid;
+ char *name; /* Place holder for data. */
+ } cfs_rmdir;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Remove the directory with name name from the directory
+ identified by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE The attributes of the parent directory should be returned since
+ its mtime and size may change.
+
+ 0wpage
+
+ 4.14. readlink
+
+
+ SSuummmmaarryy Read the value of a symbolic link.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_readlink_in {
+ ViceFid VFid;
+ } cfs_readlink;
+
+
+
+ oouutt
+
+ struct cfs_readlink_out {
+ int count;
+ caddr_t data; /* Place holder for data. */
+ } cfs_readlink;
+
+
+
+ DDeessccrriippttiioonn This routine reads the contents of symbolic link
+ identified by VFid into the buffer data. The buffer data must be able
+ to hold any name up to CFS_MAXNAMLEN (PATH or NAM??).
+
+ EErrrroorrss No unusual errors.
+
+ 0wpage
+
+ 4.15. open
+
+
+ SSuummmmaarryy Open a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_open_in {
+ ViceFid VFid;
+ int flags;
+ } cfs_open;
+
+
+
+ oouutt
+
+ struct cfs_open_out {
+ dev_t dev;
+ ino_t inode;
+ } cfs_open;
+
+
+
+ DDeessccrriippttiioonn This request asks Venus to place the file identified by
+ VFid in its cache and to note that the calling process wishes to open
+ it with flags as in open(2). The return value to the kernel differs
+ for Unix and Windows systems. For Unix systems the Coda FS Driver is
+ informed of the device and inode number of the container file in the
+ fields dev and inode. For Windows the path of the container file is
+ returned to the kernel.
+ EErrrroorrss
+
+ NNOOTTEE Currently the cfs_open_out structure is not properly adapted to
+ deal with the Windows case. It might be best to implement two
+ upcalls, one to open aiming at a container file name, the other at a
+ container file inode.
+
+ 0wpage
+
+ 4.16. close
+
+
+ SSuummmmaarryy Close a file, update it on the servers.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_close_in {
+ ViceFid VFid;
+ int flags;
+ } cfs_close;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Close the file identified by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE The flags argument is bogus and not used. However, Venus' code
+ has room to deal with an execp input field, probably this field should
+ be used to inform Venus that the file was closed but is still memory
+ mapped for execution. There are comments about fetching versus not
+ fetching the data in Venus vproc_vfscalls. This seems silly. If a
+ file is being closed, the data in the container file is to be the new
+ data. Here again the execp flag might be in play to create confusion:
+ currently Venus might think a file can be flushed from the cache when
+ it is still memory mapped. This needs to be understood.
+
+ 0wpage
+
+ 4.17. ioctl
+
+
+ SSuummmmaarryy Do an ioctl on a file. This includes the pioctl interface.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_ioctl_in {
+ ViceFid VFid;
+ int cmd;
+ int len;
+ int rwflag;
+ char *data; /* Place holder for data. */
+ } cfs_ioctl;
+
+
+
+ oouutt
+
+
+ struct cfs_ioctl_out {
+ int len;
+ caddr_t data; /* Place holder for data. */
+ } cfs_ioctl;
+
+
+
+ DDeessccrriippttiioonn Do an ioctl operation on a file. The command, len and
+ data arguments are filled as usual. flags is not used by Venus.
+
+ EErrrroorrss
+
+ NNOOTTEE Another bogus parameter. flags is not used. What is the
+ business about PREFETCHING in the Venus code?
+
+
+ 0wpage
+
+ 4.18. rename
+
+
+ SSuummmmaarryy Rename a fid.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_rename_in {
+ ViceFid sourceFid;
+ char *srcname;
+ ViceFid destFid;
+ char *destname;
+ } cfs_rename;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Rename the object with name srcname in directory
+ sourceFid to destname in destFid. It is important that the names
+ srcname and destname are 0 terminated strings. Strings in Unix
+ kernels are not always null terminated.
+
+ EErrrroorrss
+
+ 0wpage
+
+ 4.19. readdir
+
+
+ SSuummmmaarryy Read directory entries.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_readdir_in {
+ ViceFid VFid;
+ int count;
+ int offset;
+ } cfs_readdir;
+
+
+
+
+ oouutt
+
+ struct cfs_readdir_out {
+ int size;
+ caddr_t data; /* Place holder for data. */
+ } cfs_readdir;
+
+
+
+ DDeessccrriippttiioonn Read directory entries from VFid starting at offset and
+ read at most count bytes. Returns the data in data and returns
+ the size in size.
+
+ EErrrroorrss
+
+ NNOOTTEE This call is not used. Readdir operations exploit container
+ files. We will re-evaluate this during the directory revamp which is
+ about to take place.
+
+ 0wpage
+
+ 4.20. vget
+
+
+ SSuummmmaarryy instructs Venus to do an FSDB->Get.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_vget_in {
+ ViceFid VFid;
+ } cfs_vget;
+
+
+
+ oouutt
+
+ struct cfs_vget_out {
+ ViceFid VFid;
+ int vtype;
+ } cfs_vget;
+
+
+
+ DDeessccrriippttiioonn This upcall asks Venus to do a get operation on an fsobj
+ labelled by VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE This operation is not used. However, it is extremely useful
+ since it can be used to deal with read/write memory mapped files.
+ These can be "pinned" in the Venus cache using vget and released with
+ inactive.
+
+ 0wpage
+
+ 4.21. fsync
+
+
+ SSuummmmaarryy Tell Venus to update the RVM attributes of a file.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_fsync_in {
+ ViceFid VFid;
+ } cfs_fsync;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn Ask Venus to update RVM attributes of object VFid. This
+ should be called as part of kernel level fsync type calls. The
+ result indicates if the syncing was successful.
+
+ EErrrroorrss
+
+ NNOOTTEE Linux does not implement this call. It should.
+
+ 0wpage
+
+ 4.22. inactive
+
+
+ SSuummmmaarryy Tell Venus a vnode is no longer in use.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_inactive_in {
+ ViceFid VFid;
+ } cfs_inactive;
+
+
+
+ oouutt
+ none
+
+ DDeessccrriippttiioonn This operation returns EOPNOTSUPP.
+
+ EErrrroorrss
+
+ NNOOTTEE This should perhaps be removed.
+
+ 0wpage
+
+ 4.23. rdwr
+
+
+ SSuummmmaarryy Read or write from a file
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct cfs_rdwr_in {
+ ViceFid VFid;
+ int rwflag;
+ int count;
+ int offset;
+ int ioflag;
+ caddr_t data; /* Place holder for data. */
+ } cfs_rdwr;
+
+
+
+
+ oouutt
+
+ struct cfs_rdwr_out {
+ int rwflag;
+ int count;
+ caddr_t data; /* Place holder for data. */
+ } cfs_rdwr;
+
+
+
+ DDeessccrriippttiioonn This upcall asks Venus to read or write from a file.
+
+ EErrrroorrss
+
+ NNOOTTEE It should be removed since it is against the Coda philosophy that
+ read/write operations never reach Venus. I have been told the
+ operation does not work. It is not currently used.
+
+
+ 0wpage
+
+ 4.24. odymount
+
+
+ SSuummmmaarryy Allows mounting multiple Coda "filesystems" on one Unix mount
+ point.
+
+ AArrgguummeennttss
+
+ iinn
+
+ struct ody_mount_in {
+ char *name; /* Place holder for data. */
+ } ody_mount;
+
+
+
+ oouutt
+
+ struct ody_mount_out {
+ ViceFid VFid;
+ } ody_mount;
+
+
+
+ DDeessccrriippttiioonn Asks Venus to return the rootfid of a Coda system named
+ name. The fid is returned in VFid.
+
+ EErrrroorrss
+
+ NNOOTTEE This call was used by David for dynamic sets. It should be
+ removed since it causes a jungle of pointers in the VFS mounting area.
+ It is not used by Coda proper. Call is not implemented by Venus.
+
+ 0wpage
+
+ 4.25. ody_lookup
+
+
+ SSuummmmaarryy Looks up something.
+
+ AArrgguummeennttss
+
+ iinn irrelevant
+
+
+ oouutt
+ irrelevant
+
+ DDeessccrriippttiioonn
+
+ EErrrroorrss
+
+ NNOOTTEE Gut it. Call is not implemented by Venus.
+
+ 0wpage
+
+ 4.26. ody_expand
+
+
+ SSuummmmaarryy expands something in a dynamic set.
+
+ AArrgguummeennttss
+
+ iinn irrelevant
+
+ oouutt
+ irrelevant
+
+ DDeessccrriippttiioonn
+
+ EErrrroorrss
+
+ NNOOTTEE Gut it. Call is not implemented by Venus.
+
+ 0wpage
+
+ 4.27. prefetch
+
+
+ SSuummmmaarryy Prefetch a dynamic set.
+
+ AArrgguummeennttss
+
+ iinn Not documented.
+
+ oouutt
+ Not documented.
+
+ DDeessccrriippttiioonn Venus worker.cc has support for this call, although it is
+ noted that it doesn't work. Not surprising, since the kernel does not
+ have support for it. (ODY_PREFETCH is not a defined operation).
+
+ EErrrroorrss
+
+ NNOOTTEE Gut it. It isn't working and isn't used by Coda.
+
+
+ 0wpage
+
+ 4.28. signal
+
+
+ SSuummmmaarryy Send Venus a signal about an upcall.
+
+ AArrgguummeennttss
+
+ iinn none
+
+ oouutt
+ not applicable.
+
+ DDeessccrriippttiioonn This is an out-of-band upcall to Venus to inform Venus
+ that the calling process received a signal after Venus read the
+ message from the input queue. Venus is supposed to clean up the
+ operation.
+
+ EErrrroorrss No reply is given.
+
+ NNOOTTEE We need to better understand what Venus needs to clean up and if
+ it is doing this correctly. Also we need to handle multiple upcall
+ per system call situations correctly. It would be important to know
+ what state changes in Venus take place after an upcall for which the
+ kernel is responsible for notifying Venus to clean up (e.g. open
+ definitely is such a state change, but many others are maybe not).
+
+ 0wpage
+
+ 5. The minicache and downcalls
+
+
+ The Coda FS Driver can cache results of lookup and access upcalls, to
+ limit the frequency of upcalls. Upcalls carry a price since a process
+ context switch needs to take place. The counterpart of caching the
+ information is that Venus will notify the FS Driver that cached
+ entries must be flushed or renamed.
+
+ The kernel code generally has to maintain a structure which links the
+ internal file handles (called vnodes in BSD, inodes in Linux and
+ FileHandles in Windows) with the ViceFid's which Venus maintains. The
+ reason is that frequent translations back and forth are needed in
+ order to make upcalls and use the results of upcalls. Such linking
+ objects are called cnodes.
+
+ The current minicache implementations have cache entries which record
+ the following:
+
+ 1. the name of the file
+
+ 2. the cnode of the directory containing the object
+
+ 3. a list of CodaCred's for which the lookup is permitted.
+
+ 4. the cnode of the object
+
+ The lookup call in the Coda FS Driver may request the cnode of the
+ desired object from the cache, by passing its name, directory and the
+ CodaCred's of the caller. The cache will return the cnode or indicate
+ that it cannot be found. The Coda FS Driver must be careful to
+ invalidate cache entries when it modifies or removes objects.
+
+ When Venus obtains information that indicates that cache entries are
+ no longer valid, it will make a downcall to the kernel. Downcalls are
+ intercepted by the Coda FS Driver and lead to cache invalidations of
+ the kind described below. The Coda FS Driver does not return an error
+ unless the downcall data could not be read into kernel memory.
+
+
+ 5.1. INVALIDATE
+
+
+ No information is available on this call.
+
+
+ 5.2. FLUSH
+
+
+
+ AArrgguummeennttss None
+
+ SSuummmmaarryy Flush the name cache entirely.
+
+ DDeessccrriippttiioonn Venus issues this call upon startup and when it dies. This
+ is to prevent stale cache information being held. Some operating
+ systems allow the kernel name cache to be switched off dynamically.
+ When this is done, this downcall is made.
+
+
+ 5.3. PURGEUSER
+
+
+ AArrgguummeennttss
+
+ struct cfs_purgeuser_out {/* CFS_PURGEUSER is a venus->kernel call */
+ struct CodaCred cred;
+ } cfs_purgeuser;
+
+
+
+ DDeessccrriippttiioonn Remove all entries in the cache carrying the Cred. This
+ call is issued when tokens for a user expire or are flushed.
+
+
+ 5.4. ZAPFILE
+
+
+ AArrgguummeennttss
+
+ struct cfs_zapfile_out { /* CFS_ZAPFILE is a venus->kernel call */
+ ViceFid CodaFid;
+ } cfs_zapfile;
+
+
+
+ DDeessccrriippttiioonn Remove all entries which have the (dir vnode, name) pair.
+ This is issued as a result of an invalidation of cached attributes of
+ a vnode.
+
+ NNOOTTEE Call is not named correctly in NetBSD and Mach. The minicache
+ zapfile routine takes different arguments. Linux does not implement
+ the invalidation of attributes correctly.
+
+
+
+ 5.5. ZAPDIR
+
+
+ AArrgguummeennttss
+
+ struct cfs_zapdir_out { /* CFS_ZAPDIR is a venus->kernel call */
+ ViceFid CodaFid;
+ } cfs_zapdir;
+
+
+
+ DDeessccrriippttiioonn Remove all entries in the cache lying in a directory
+ CodaFid, and all children of this directory. This call is issued when
+ Venus receives a callback on the directory.
+
+
+ 5.6. ZAPVNODE
+
+
+
+ AArrgguummeennttss
+
+ struct cfs_zapvnode_out { /* CFS_ZAPVNODE is a venus->kernel call */
+ struct CodaCred cred;
+ ViceFid VFid;
+ } cfs_zapvnode;
+
+
+
+ DDeessccrriippttiioonn Remove all entries in the cache carrying the cred and VFid
+ as in the arguments. This downcall is probably never issued.
+
+
+ 5.7. PURGEFID
+
+
+ SSuummmmaarryy
+
+ AArrgguummeennttss
+
+ struct cfs_purgefid_out { /* CFS_PURGEFID is a venus->kernel call */
+ ViceFid CodaFid;
+ } cfs_purgefid;
+
+
+
+ DDeessccrriippttiioonn Flush the attribute for the file. If it is a dir (odd
+ vnode), purge its children from the namecache and remove the file from the
+ namecache.
+
+
+
+ 5.8. REPLACE
+
+
+ SSuummmmaarryy Replace the Fid's for a collection of names.
+
+ AArrgguummeennttss
+
+ struct cfs_replace_out { /* cfs_replace is a venus->kernel call */
+ ViceFid NewFid;
+ ViceFid OldFid;
+ } cfs_replace;
+
+
+
+ DDeessccrriippttiioonn This routine replaces a ViceFid in the name cache with
+ another. It is added to allow Venus during reintegration to replace
+ locally allocated temp fids while disconnected with global fids even
+ when the reference counts on those fids are not zero.
+
+ 0wpage
+
+ 6. Initialization and cleanup
+
+
+ This section gives brief hints as to desirable features for the Coda
+ FS Driver at startup and upon shutdown or Venus failures. Before
+ entering the discussion it is useful to repeat that the Coda FS Driver
+ maintains the following data:
+
+
+ 1. message queues
+
+ 2. cnodes
+
+ 3. name cache entries
+
+ The name cache entries are entirely private to the driver, so they
+ can easily be manipulated. The message queues will generally have
+ clear points of initialization and destruction. The cnodes are
+ much more delicate. User processes hold reference counts in Coda
+ filesystems and it can be difficult to clean up the cnodes.
+
+ It can expect requests through:
+
+ 1. the message subsystem
+
+ 2. the VFS layer
+
+ 3. pioctl interface
+
+ Currently the pioctl passes through the VFS for Coda so we can
+ treat these similarly.
+
+
+ 6.1. Requirements
+
+
+ The following requirements should be accommodated:
+
+ 1. The message queues should have open and close routines. On Unix
+ the opening of the character devices are such routines.
+
+ o Before opening, no messages can be placed.
+
+ o Opening will remove any old messages still pending.
+
+ o Close will notify any sleeping processes that their upcall cannot
+ be completed.
+
+ o Close will free all memory allocated by the message queues.
+
+
+ 2. At open the namecache shall be initialized to empty state.
+
+ 3. Before the message queues are open, all VFS operations will fail.
+ Fortunately this can be achieved by making sure that mounting the
+ Coda filesystem cannot succeed before opening.
+
+ 4. After closing of the queues, no VFS operations can succeed. Here
+ one needs to be careful, since a few operations (lookup,
+ read/write, readdir) can proceed without upcalls. These must be
+ explicitly blocked.
+
+ 5. Upon closing the namecache shall be flushed and disabled.
+
+ 6. All memory held by cnodes can be freed without relying on upcalls.
+
+ 7. Unmounting the file system can be done without relying on upcalls.
+
+ 8. Mounting the Coda filesystem should fail gracefully if Venus cannot
+ get the rootfid or the attributes of the rootfid. The latter is
+ best implemented by Venus fetching these objects before attempting
+ to mount.
+
+ NNOOTTEE NetBSD in particular but also Linux have not implemented the
+ above requirements fully. For smooth operation this needs to be
+ corrected.
+
+
+
diff --git a/Documentation/filesystems/configfs/Makefile b/Documentation/filesystems/configfs/Makefile
new file mode 100644
index 00000000000..be7ec5e67db
--- /dev/null
+++ b/Documentation/filesystems/configfs/Makefile
@@ -0,0 +1,3 @@
+ifneq ($(CONFIG_CONFIGFS_FS),)
+obj-m += configfs_example_explicit.o configfs_example_macros.o
+endif
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 00000000000..b40fec9d3f5
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,484 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+ Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality. Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs. Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2). It may allow some attributes to be modified via
+write(2). The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2). It is destroyed via rmdir(2). The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together. Unlike sysfs, the
+lifetime of the representation is completely driven by userspace. The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system. One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel. You can access
+it by doing
+
+ mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems. Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config. Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2). The item's attributes will also
+appear at this time. readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values. Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file. The same efficiency caveats from sysfs
+apply. Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once. When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back. Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2). An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)). Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices. Call it FakeNBD. FakeNBD uses configfs
+for its configuration. Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it. Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+ # ls /config
+ fakenbd
+
+A fakenbd connection can be created with mkdir(2). The name is
+arbitrary, but likely the tool will make some use of the name. Perhaps
+it is a uuid or a disk name:
+
+ # mkdir /config/fakenbd/disk1
+ # ls /config/fakenbd/disk1
+ target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to. The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+ # echo 10.0.0.1 > /config/fakenbd/disk1/target
+ # echo /dev/sda1 > /config/fakenbd/disk1/device
+ # echo 1 > /config/fakenbd/disk1/rw
+
+That's it. That's all there is. Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item. A config_item reflects an
+object in the subsystem. It has attributes that match values on that
+object. configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group. A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that. The group has a set of operations to perform these tasks.
+
+A subsystem is the top level of a client module. During initialization,
+the client module registers the subsystem with configfs, and the subsystem
+appears as a directory at the top of the configfs filesystem. A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+ struct config_item {
+ char *ci_name;
+ char ci_namebuf[UOBJ_NAME_LEN];
+ struct kref ci_kref;
+ struct list_head ci_entry;
+ struct config_item *ci_parent;
+ struct config_group *ci_group;
+ struct config_item_type *ci_type;
+ struct dentry *ci_dentry;
+ };
+
+ void config_item_init(struct config_item *);
+ void config_item_init_type_name(struct config_item *,
+ const char *name,
+ struct config_item_type *type);
+ struct config_item *config_item_get(struct config_item *);
+ void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing. The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it. This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things. For that, it needs a type.
+
+[struct config_item_type]
+
+ struct configfs_item_operations {
+ void (*release)(struct config_item *);
+ ssize_t (*show_attribute)(struct config_item *,
+ struct configfs_attribute *,
+ char *);
+ ssize_t (*store_attribute)(struct config_item *,
+ struct configfs_attribute *,
+ const char *, size_t);
+ int (*allow_link)(struct config_item *src,
+ struct config_item *target);
+ int (*drop_link)(struct config_item *src,
+ struct config_item *target);
+ };
+
+ struct config_item_type {
+ struct module *ct_owner;
+ struct configfs_item_operations *ct_item_ops;
+ struct configfs_group_operations *ct_group_ops;
+ struct configfs_attribute **ct_attrs;
+ };
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item. All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method. This method is called when the config_item's reference count
+reaches zero. Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method. Similarly, storing a new
+attribute value uses the store_attribute() method.
+
+[struct configfs_attribute]
+
+ struct configfs_attribute {
+ char *ca_name;
+ struct module *ca_owner;
+ umode_t ca_mode;
+ };
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs. When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename. configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute. The converse
+will happen for write(2).
+
+[struct config_group]
+
+A config_item cannot live in a vacuum. The only way one can be created
+is via mkdir(2) on a config_group. This will trigger creation of a
+child item.
+
+ struct config_group {
+ struct config_item cg_item;
+ struct list_head cg_children;
+ struct configfs_subsystem *cg_subsys;
+ struct config_group **default_groups;
+ };
+
+ void config_group_init(struct config_group *group);
+ void config_group_init_type_name(struct config_group *group,
+ const char *name,
+ struct config_item_type *type);
+
+
+The config_group structure contains a config_item. Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups. This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+ struct configfs_group_operations {
+ struct config_item *(*make_item)(struct config_group *group,
+ const char *name);
+ struct config_group *(*make_group)(struct config_group *group,
+ const char *name);
+ int (*commit_item)(struct config_item *item);
+ void (*disconnect_notify)(struct config_group *group,
+ struct config_item *item);
+ void (*drop_item)(struct config_group *group,
+ struct config_item *item);
+ };
+
+A group creates child items by providing the
+ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs. Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group(). Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called. As a config_group is also a
+config_item, it is not necessary for a separate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation. If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
+
+IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy). The subsystem is
+responsible for responding to this. If the subsystem has references to
+the item in other threads, the memory is safe. It may take some time
+for the item to actually disappear from the subsystem's usage. But it
+is gone from configfs.
+
+When drop_item() is called, the item's linkage has already been torn
+down. It no longer has a reference on its parent and has no place in
+the item hierarchy. If a client needs to do some cleanup before this
+teardown happens, the subsystem can implement the
+ct_group_ops->disconnect_notify() method. The method is called after
+configfs has removed the item from the filesystem view but before the
+item is removed from its parent group. Like drop_item(),
+disconnect_notify() is void and cannot fail. Client subsystems should
+not drop any references here, as they still must do it in drop_item().
+
+A config_group cannot be removed while it still has child items. This
+is implemented in the configfs rmdir(2) code. ->drop_item() will not be
+called, as the item has not been dropped. rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time. This
+tells configfs to make the subsystem appear in the file tree.
+
+ struct configfs_subsystem {
+ struct config_group su_group;
+ struct mutex su_mutex;
+ };
+
+ int configfs_register_subsystem(struct configfs_subsystem *subsys);
+ void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+ A subsystem consists of a toplevel config_group and a mutex.
+The group is where child config_items are created. For a subsystem,
+this group is usually defined statically. Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the mutex.
+ When the register call returns, the subsystem is live, and it
+will be visible via configfs. At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example_explicit.c
+and configfs_example_macros.c. It shows a trivial object displaying and
+storing an attribute, and a simple group creating and destroying these
+children.
+
+The only difference between configfs_example_explicit.c and
+configfs_example_macros.c is how the attributes of the childless item
+are defined. The childless item has extended attributes, each with
+their own show()/store() operation. This follows a convention commonly
+used in sysfs. configfs_example_explicit.c creates these attributes
+by explicitly defining the structures involved. Conversely
+configfs_example_macros.c uses some convenience macros from configfs.h
+to define the attributes. These macros are similar to their sysfs
+counterparts.
+
+[Hierarchy Navigation and the Subsystem Mutex]
+
+There is an extra bonus that configfs provides. The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem. A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy. For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem. This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem mutex to
+protect modifications. Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+mutex.
+
+A subsystem will be prevented from acquiring the mutex while a newly
+allocated item has not been linked into this hierarchy. Similarly, it
+will not be able to acquire the mutex while a dropping item has not
+yet been unlinked. This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration. This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+mutex.
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship. Often, however, a larger environment requires aggregation
+outside of the parent/child connection. This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods. If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items. Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item. If the source item
+allows linking to target item, it returns 0. A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method. Like the ->drop_item() method,
+this is a void function and cannot return failure. The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it. Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation. Thus,
+mkdir("parent") results in "parent", "parent/subgroup1", up through
+"parent/subgroupN". Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group. If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group. Similarly, they are removed at the same time
+as the parent. No extra notification is provided. When a ->drop_item()
+method call notifies the subsystem the parent group is going away, it
+also means every default group child associated with that parent group.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2). They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+[Dependent Subsystems]
+
+Sometimes other drivers depend on particular configfs items. For
+example, ocfs2 mounts depend on a heartbeat region item. If that
+region item is removed with rmdir(2), the ocfs2 mount must BUG or go
+readonly. Not happy.
+
+configfs provides two additional API calls: configfs_depend_item() and
+configfs_undepend_item(). A client driver can call
+configfs_depend_item() on an existing item to tell configfs that it is
+depended on. configfs will then return -EBUSY from rmdir(2) for that
+item. When the item is no longer depended on, the client driver calls
+configfs_undepend_item() on it.
+
+These APIs cannot be called underneath any configfs callbacks, as
+they will conflict. They can block and allocate. A client driver
+probably shouldn't be calling them of its own gumption. Rather it should
+be providing an API that external subsystems call.
+
+How does this work? Imagine the ocfs2 mount process. When it mounts,
+it asks for a heartbeat region item. This is done via a call into the
+heartbeat code. Inside the heartbeat code, the region item is looked
+up. Here, the heartbeat code calls configfs_depend_item(). If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state. That is, no
+default values can be specified for the item's attributes such that the
+item can do its work. Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above. Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects. This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized. Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go. More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense. configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations. An item is
+committed via rename(2). The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items. When this group appears in configfs, mkdir(2) will
+not work directly in the group. Instead, the group will have two
+subdirectories: "live" and "pending". The "live" directory does not
+support mkdir(2) or rmdir(2) either. It only allows rename(2). The
+"pending" directory does allow mkdir(2) and rmdir(2). An item is
+created in the "pending" directory. Its attributes can be modified at
+will. Userspace commits the item by renaming it into the "live"
+directory. At this point, the subsystem receives the ->commit_item()
+callback. If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shut down, or "uncommitted". Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one. The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example_explicit.c b/Documentation/filesystems/configfs/configfs_example_explicit.c
new file mode 100644
index 00000000000..1420233dfa5
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example_explicit.c
@@ -0,0 +1,483 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example_explicit.c - This file is a demonstration module
+ * containing a number of configfs subsystems. It explicitly defines
+ * each structure without using the helper macros defined in
+ * configfs.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem. It cannot create
+ * any config_items. It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem. See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+ struct configfs_subsystem subsys;
+ int showme;
+ int storeme;
+};
+
+struct childless_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct childless *, char *);
+ ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+ return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+ char *page)
+{
+ ssize_t pos;
+
+ pos = sprintf(page, "%d\n", childless->showme);
+ childless->showme++;
+
+ return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+ char *page)
+{
+ return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+ const char *page,
+ size_t count)
+{
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if ((*p != '\0') && (*p != '\n'))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ childless->storeme = tmp;
+
+ return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+ char *page)
+{
+ return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs. It does not support the creation of child config_items.\n"
+"It only has a few attributes. In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+ .attr = { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+ .show = childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+ .attr = { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+ .show = childless_storeme_read,
+ .store = childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+ .attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+ .show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+ &childless_attr_showme.attr,
+ &childless_attr_storeme.attr,
+ &childless_attr_description.attr,
+ NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ struct childless *childless = to_childless(item);
+ struct childless_attribute *childless_attr =
+ container_of(attr, struct childless_attribute, attr);
+ ssize_t ret = 0;
+
+ if (childless_attr->show)
+ ret = childless_attr->show(childless, page);
+ return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+ struct configfs_attribute *attr,
+ const char *page, size_t count)
+{
+ struct childless *childless = to_childless(item);
+ struct childless_attribute *childless_attr =
+ container_of(attr, struct childless_attribute, attr);
+ ssize_t ret = -EINVAL;
+
+ if (childless_attr->store)
+ ret = childless_attr->store(childless, page, count);
+ return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+ .show_attribute = childless_attr_show,
+ .store_attribute = childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+ .ct_item_ops = &childless_item_ops,
+ .ct_attrs = childless_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "01-childless",
+ .ci_type = &childless_type,
+ },
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child. Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go. Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+ struct config_item item;
+ int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+ return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "storeme",
+ .ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+ &simple_child_attr_storeme,
+ NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ ssize_t count;
+ struct simple_child *simple_child = to_simple_child(item);
+
+ count = sprintf(page, "%d\n", simple_child->storeme);
+
+ return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+ struct configfs_attribute *attr,
+ const char *page, size_t count)
+{
+ struct simple_child *simple_child = to_simple_child(item);
+ unsigned long tmp;
+ char *p = (char *) page;
+
+ tmp = simple_strtoul(p, &p, 10);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp > INT_MAX)
+ return -ERANGE;
+
+ simple_child->storeme = tmp;
+
+ return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+ kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+ .release = simple_child_release,
+ .show_attribute = simple_child_attr_show,
+ .store_attribute = simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+ .ct_item_ops = &simple_child_item_ops,
+ .ct_attrs = simple_child_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+
+struct simple_children {
+ struct config_group group;
+};
+
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+ return item ? container_of(to_config_group(item), struct simple_children, group) : NULL;
+}
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+ struct simple_child *simple_child;
+
+ simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
+ if (!simple_child)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&simple_child->item, name,
+ &simple_child_type);
+
+ simple_child->storeme = 0;
+
+ return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+ &simple_children_attr_description,
+ NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items. These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static void simple_children_release(struct config_item *item)
+{
+ kfree(to_simple_children(item));
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+ .release = simple_children_release,
+ .show_attribute = simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+ .make_item = simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+ .ct_item_ops = &simple_children_item_ops,
+ .ct_group_ops = &simple_children_group_ops,
+ .ct_attrs = simple_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "02-simple-children",
+ .ci_type = &simple_children_type,
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above. However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem. Creation of a group in the subsystem creates
+ * a new simple_children group. That group can then have simple_child
+ * children of its own.
+ */
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+ struct simple_children *simple_children;
+
+ simple_children = kzalloc(sizeof(struct simple_children),
+ GFP_KERNEL);
+ if (!simple_children)
+ return ERR_PTR(-ENOMEM);
+
+ config_group_init_type_name(&simple_children->group, name,
+ &simple_children_type);
+
+ return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+ &group_children_attr_description,
+ NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups. These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+ .show_attribute = group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+ .make_group = group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+ .ct_item_ops = &group_children_item_ops,
+ .ct_group_ops = &group_children_group_ops,
+ .ct_attrs = group_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "03-group-children",
+ .ci_type = &group_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all. It
+ * allows the init function to easily register them. Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+ &childless_subsys.subsys,
+ &simple_children_subsys,
+ &group_children_subsys,
+ NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+ int ret;
+ int i;
+ struct configfs_subsystem *subsys;
+
+ for (i = 0; example_subsys[i]; i++) {
+ subsys = example_subsys[i];
+
+ config_group_init(&subsys->su_group);
+ mutex_init(&subsys->su_mutex);
+ ret = configfs_register_subsystem(subsys);
+ if (ret) {
+ printk(KERN_ERR "Error %d while registering subsystem %s\n",
+ ret,
+ subsys->su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+ }
+
+ return 0;
+
+out_unregister:
+ for (i--; i >= 0; i--)
+ configfs_unregister_subsystem(example_subsys[i]);
+
+ return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+ int i;
+
+ for (i = 0; example_subsys[i]; i++)
+ configfs_unregister_subsystem(example_subsys[i]);
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/Documentation/filesystems/configfs/configfs_example_macros.c b/Documentation/filesystems/configfs/configfs_example_macros.c
new file mode 100644
index 00000000000..327dfbc640a
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example_macros.c
@@ -0,0 +1,446 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example_macros.c - This file is a demonstration module
+ * containing a number of configfs subsystems. It uses the helper
+ * macros defined by configfs.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem. It cannot create
+ * any config_items. It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem. See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+/* Container for the 01-childless subsystem and the data behind its attributes. */
+struct childless {
+ struct configfs_subsystem subsys; /* embedded subsystem; see to_childless() */
+ int showme; /* shown, then incremented, by childless_showme_read() */
+ int storeme; /* read and written through the "storeme" handlers */
+};
+
+/*
+ * Map the config_item of the childless subsystem's group back to the
+ * enclosing struct childless. A NULL item yields NULL.
+ */
+static inline struct childless *to_childless(struct config_item *item)
+{
+	struct configfs_subsystem *sub;
+
+	if (!item)
+		return NULL;
+
+	sub = to_configfs_subsystem(to_config_group(item));
+	return container_of(sub, struct childless, subsys);
+}
+
+/*
+ * CONFIGFS_ATTR_STRUCT() (from configfs.h) is expected to declare
+ * struct childless_attribute, which the two helpers below instantiate.
+ *
+ * Neither helper ends in a semicolon: the user supplies it, so that
+ * "CHILDLESS_ATTR_RO(...);" expands to exactly one declaration rather
+ * than a declaration followed by a stray ';' at file scope.
+ */
+CONFIGFS_ATTR_STRUCT(childless);
+#define CHILDLESS_ATTR(_name, _mode, _show, _store) \
+struct childless_attribute childless_attr_##_name = __CONFIGFS_ATTR(_name, _mode, _show, _store)
+#define CHILDLESS_ATTR_RO(_name, _show) \
+struct childless_attribute childless_attr_##_name = __CONFIGFS_ATTR_RO(_name, _show)
+
+/*
+ * Show handler for "showme": print the current counter, then bump it so
+ * that the next read returns a value one higher.
+ */
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	/* Format first so the reader sees the pre-increment value. */
+	ssize_t len = sprintf(page, "%d\n", childless->showme);
+
+	childless->showme += 1;
+
+	return len;
+}
+
+/* Show handler for "storeme": print the stored value in decimal plus a newline. */
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	const int value = childless->storeme;
+
+	return sprintf(page, "%d\n", value);
+}
+
+/*
+ * Store handler for "storeme": parse a non-negative decimal integer from
+ * @page and save it in childless->storeme.
+ *
+ * Returns @count when the value was stored, -EINVAL when @page is not a
+ * decimal number (optionally followed by a single newline), and -ERANGE
+ * when the number does not fit in an int.
+ *
+ * kstrtouint() replaces the obsolete simple_strtoul(). Unlike the old
+ * code it rejects input without any digits (which used to store 0),
+ * rejects text following the newline, reports overflow instead of
+ * wrapping silently, and takes a const buffer, so the const-discarding
+ * cast is gone. Per its documentation a leading '+' is accepted.
+ */
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned int tmp;
+	int ret;
+
+	ret = kstrtouint(page, 10, &tmp);
+	if (ret)
+		return ret;
+
+	/* storeme is a plain int; refuse what it cannot represent. */
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+/* Show handler for "description": emit a fixed blurb about this subsystem. */
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	static const char blurb[] =
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs. It does not support the creation of child config_items.\n"
+"It only has a few attributes. In fact, it isn't much different\n"
+"than a directory in /proc.\n";
+
+	return sprintf(page, "%s", blurb);
+}
+
+/*
+ * Instantiate the three attributes, passing each its handler(s):
+ * "showme" and "description" get a show handler only, "storeme" gets
+ * both a show and a store handler.
+ */
+CHILDLESS_ATTR_RO(showme, childless_showme_read);
+CHILDLESS_ATTR(storeme, S_IRUGO | S_IWUSR, childless_storeme_read,
+ childless_storeme_write);
+CHILDLESS_ATTR_RO(description, childless_description_read);
+
+/* NULL-terminated attribute list referenced by childless_type. */
+static struct configfs_attribute *childless_attrs[] = {
+ &childless_attr_showme.attr,
+ &childless_attr_storeme.attr,
+ &childless_attr_description.attr,
+ NULL,
+};
+
+/*
+ * CONFIGFS_ATTR_OPS() is expected to provide childless_attr_show() and
+ * childless_attr_store(), which are wired up below. The macro lives in
+ * configfs.h and is not visible here -- confirm against that header.
+ */
+CONFIGFS_ATTR_OPS(childless);
+static struct configfs_item_operations childless_item_ops = {
+ .show_attribute = childless_attr_show,
+ .store_attribute = childless_attr_store,
+};
+
+/* Item type of the 01-childless subsystem root; it has no group operations. */
+static struct config_item_type childless_type = {
+ .ct_item_ops = &childless_item_ops,
+ .ct_attrs = childless_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * The single 01-childless instance. showme and storeme are not listed
+ * in the initializer and therefore start out as zero.
+ */
+static struct childless childless_subsys = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "01-childless",
+ .ci_type = &childless_type,
+ },
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child. Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go. Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+/* One child item of a simple_children group. */
+struct simple_child {
+ struct config_item item; /* embedded item; see to_simple_child() */
+ int storeme; /* value behind the "storeme" attribute */
+};
+
+/* Map an embedded config_item back to its simple_child; NULL stays NULL. */
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	if (!item)
+		return NULL;
+
+	return container_of(item, struct simple_child, item);
+}
+
+/* The only attribute of a simple_child: "storeme", readable by all, writable by owner. */
+static struct configfs_attribute simple_child_attr_storeme = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "storeme",
+ .ca_mode = S_IRUGO | S_IWUSR,
+};
+
+/* NULL-terminated attribute list referenced by simple_child_type. */
+static struct configfs_attribute *simple_child_attrs[] = {
+ &simple_child_attr_storeme,
+ NULL,
+};
+
+/*
+ * Show handler for simple_child items. @attr is not inspected: whatever
+ * attribute is read, the item's storeme value is printed in decimal,
+ * followed by a newline.
+ */
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	return sprintf(page, "%d\n", to_simple_child(item)->storeme);
+}
+
+/*
+ * Store handler for simple_child items: parse a non-negative decimal
+ * integer from @page and save it in the item's storeme field. @attr is
+ * not inspected.
+ *
+ * Returns @count when the value was stored, -EINVAL when @page is not a
+ * decimal number (optionally followed by a single newline), and -ERANGE
+ * when the number does not fit in an int.
+ *
+ * kstrtouint() replaces the obsolete simple_strtoul(). Unlike the old
+ * code it rejects input without any digits (which used to store 0),
+ * rejects text following the newline, reports overflow instead of
+ * wrapping silently, and takes a const buffer, so the const-discarding
+ * cast is gone. Per its documentation a leading '+' is accepted.
+ */
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned int tmp;
+	int ret;
+
+	ret = kstrtouint(page, 10, &tmp);
+	if (ret)
+		return ret;
+
+	/* storeme is a plain int; refuse what it cannot represent. */
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+/* Release callback: free the simple_child that embeds @item. */
+static void simple_child_release(struct config_item *item)
+{
+	struct simple_child *child = to_simple_child(item);
+
+	kfree(child);
+}
+
+/* Callbacks for simple_child items: free on release, show/store "storeme". */
+static struct configfs_item_operations simple_child_item_ops = {
+ .release = simple_child_release,
+ .show_attribute = simple_child_attr_show,
+ .store_attribute = simple_child_attr_store,
+};
+
+/* Item type given to every simple_child by simple_children_make_item(). */
+static struct config_item_type simple_child_type = {
+ .ct_item_ops = &simple_child_item_ops,
+ .ct_attrs = simple_child_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+
+/* A group whose children are simple_child items; it carries no data of its own. */
+struct simple_children {
+ struct config_group group; /* embedded group; see to_simple_children() */
+};
+
+/*
+ * Map the config_item of a group back to the enclosing
+ * struct simple_children. A NULL item yields NULL.
+ */
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+	if (!item)
+		return NULL;
+
+	return container_of(to_config_group(item), struct simple_children, group);
+}
+
+/*
+ * ->make_item() callback: allocate a zeroed simple_child, initialise its
+ * embedded item with @name and simple_child_type, and return that item.
+ * Returns ERR_PTR(-ENOMEM) when the allocation fails. @group is unused.
+ */
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+	struct simple_child *child = kzalloc(sizeof(*child), GFP_KERNEL);
+
+	if (child == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Every new child starts with storeme == 0. */
+	child->storeme = 0;
+	config_item_init_type_name(&child->item, name, &simple_child_type);
+
+	return &child->item;
+}
+
+/* Read-only "description" attribute of a simple_children group. */
+static struct configfs_attribute simple_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+/* NULL-terminated attribute list referenced by simple_children_type. */
+static struct configfs_attribute *simple_children_attrs[] = {
+ &simple_children_attr_description,
+ NULL,
+};
+
+/*
+ * Show handler for simple_children groups: emit a fixed description.
+ * Neither @item nor @attr is inspected.
+ */
+static ssize_t simple_children_attr_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	static const char blurb[] =
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items. These\n"
+"items have only one attribute that is readable and writeable.\n";
+
+	return sprintf(page, "%s", blurb);
+}
+
+/* Release callback: free the simple_children that embeds @item's group. */
+static void simple_children_release(struct config_item *item)
+{
+	struct simple_children *grp = to_simple_children(item);
+
+	kfree(grp);
+}
+
+/* Item callbacks for a simple_children group: free on release, show "description". */
+static struct configfs_item_operations simple_children_item_ops = {
+ .release = simple_children_release,
+ .show_attribute = simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+ .make_item = simple_children_make_item,
+};
+
+/*
+ * Item type shared by the 02-simple-children subsystem root and by the
+ * groups that group_children_make_group() creates.
+ */
+static struct config_item_type simple_children_type = {
+ .ct_item_ops = &simple_children_item_ops,
+ .ct_group_ops = &simple_children_group_ops,
+ .ct_attrs = simple_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * The 02-simple-children subsystem object. Only the name and the item
+ * type are set statically; the group and the mutex are initialised by
+ * configfs_example_init() just before registration.
+ */
+static struct configfs_subsystem simple_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "02-simple-children",
+ .ci_type = &simple_children_type,
+ },
+ },
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above. However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem. Creation of a group in the subsystem creates
+ * a new simple_children group. That group can then have simple_child
+ * children of its own.
+ */
+
+/*
+ * ->make_group() callback: allocate a zeroed simple_children, initialise
+ * its embedded group with @name and simple_children_type, and return
+ * that group. Returns ERR_PTR(-ENOMEM) when the allocation fails.
+ * @group is unused.
+ */
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+	struct simple_children *grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+
+	if (grp == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	config_group_init_type_name(&grp->group, name, &simple_children_type);
+
+	return &grp->group;
+}
+
+/* Read-only "description" attribute of the 03-group-children subsystem. */
+static struct configfs_attribute group_children_attr_description = {
+ .ca_owner = THIS_MODULE,
+ .ca_name = "description",
+ .ca_mode = S_IRUGO,
+};
+
+/* NULL-terminated attribute list referenced by group_children_type. */
+static struct configfs_attribute *group_children_attrs[] = {
+ &group_children_attr_description,
+ NULL,
+};
+
+/*
+ * Show handler for the 03-group-children subsystem: emit a fixed
+ * description. Neither @item nor @attr is inspected.
+ */
+static ssize_t group_children_attr_show(struct config_item *item,
+					struct configfs_attribute *attr,
+					char *page)
+{
+	static const char blurb[] =
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups. These\n"
+"groups are like the subsystem simple-children.\n";
+
+	return sprintf(page, "%s", blurb);
+}
+
+/* Item callbacks for the subsystem root: only the "description" show handler. */
+static struct configfs_item_operations group_children_item_ops = {
+ .show_attribute = group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+ .make_group = group_children_make_group,
+};
+
+/* Item type used by the 03-group-children subsystem root. */
+static struct config_item_type group_children_type = {
+ .ct_item_ops = &group_children_item_ops,
+ .ct_group_ops = &group_children_group_ops,
+ .ct_attrs = group_children_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * The 03-group-children subsystem object. Only the name and the item
+ * type are set statically; the group and the mutex are initialised by
+ * configfs_example_init() just before registration.
+ */
+static struct configfs_subsystem group_children_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "03-group-children",
+ .ci_type = &group_children_type,
+ },
+ },
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all. It
+ * allows the init function to easily register them. Most modules
+ * will only have one subsystem, and will only call
+ * configfs_register_subsystem() on it directly.
+ *
+ * The list is NULL-terminated; the init and exit loops stop at the
+ * terminator.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+ &childless_subsys.subsys,
+ &simple_children_subsys,
+ &group_children_subsys,
+ NULL,
+};
+
+/*
+ * Module entry point: initialise and register every subsystem listed in
+ * example_subsys[], in order. If a registration fails, the error is
+ * logged, the subsystems registered so far are unregistered in reverse
+ * order, and the error code is returned.
+ */
+static int __init configfs_example_init(void)
+{
+	struct configfs_subsystem *subsys;
+	int idx;
+	int err;
+
+	for (idx = 0; (subsys = example_subsys[idx]) != NULL; idx++) {
+		config_group_init(&subsys->su_group);
+		mutex_init(&subsys->su_mutex);
+
+		err = configfs_register_subsystem(subsys);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	printk(KERN_ERR "Error %d while registering subsystem %s\n",
+	       err,
+	       subsys->su_group.cg_item.ci_namebuf);
+
+	/* idx is the entry that failed; undo the ones before it. */
+	while (idx > 0)
+		configfs_unregister_subsystem(example_subsys[--idx]);
+
+	return err;
+}
+
+/* Module exit point: unregister every subsystem listed in example_subsys[]. */
+static void __exit configfs_example_exit(void)
+{
+	struct configfs_subsystem **cur;
+
+	/* Walk the NULL-terminated list front to back. */
+	for (cur = example_subsys; *cur != NULL; cur++)
+		configfs_unregister_subsystem(*cur);
+}
+
+/* Hook the init and exit routines above into module load and unload. */
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/Documentation/filesystems/cramfs.txt b/Documentation/filesystems/cramfs.txt
new file mode 100644
index 00000000000..31f53f0ab95
--- /dev/null
+++ b/Documentation/filesystems/cramfs.txt
@@ -0,0 +1,76 @@
+
+ Cramfs - cram a filesystem onto a small ROM
+
+cramfs is designed to be simple and small, and to compress things well.
+
+It uses the zlib routines to compress a file one page at a time, and
+allows random page access. The meta-data is not compressed, but is
+expressed in a very terse representation to make it use much less
+diskspace than traditional filesystems.
+
+You can't write to a cramfs filesystem (making it compressible and
+compact also makes it _very_ hard to update on-the-fly), so you have to
+create the disk image with the "mkcramfs" utility.
+
+
+Usage Notes
+-----------
+
+File sizes are limited to less than 16MB.
+
+Maximum filesystem size is a little over 256MB. (The last file on the
+filesystem is allowed to extend past 256MB.)
+
+Only the low 8 bits of gid are stored. The current version of
+mkcramfs simply truncates to 8 bits, which is a potential security
+issue.
+
+Hard links are supported, but hard linked files
+will still have a link count of 1 in the cramfs image.
+
+Cramfs directories have no `.' or `..' entries. Directories (like
+every other file on cramfs) always have a link count of 1. (There's
+no need to use -noleaf in `find', btw.)
+
+No timestamps are stored in a cramfs, so these default to the epoch
+(1970 GMT). Recently-accessed files may have updated timestamps, but
+the update lasts only as long as the inode is cached in memory, after
+which the timestamp reverts to 1970, i.e. moves backwards in time.
+
+Currently, cramfs must be written and read with architectures of the
+same endianness, and can be read only by kernels with PAGE_CACHE_SIZE
+== 4096. At least the latter of these is a bug, but it hasn't been
+decided what the best fix is. For the moment if you have larger pages
+you can just change the #define in mkcramfs.c, so long as you don't
+mind the filesystem becoming unreadable to future kernels.
+
+
+For /usr/share/magic
+--------------------
+
+0 ulelong 0x28cd3d45 Linux cramfs offset 0
+>4 ulelong x size %d
+>8 ulelong x flags 0x%x
+>12 ulelong x future 0x%x
+>16 string >\0 signature "%.16s"
+>32 ulelong x fsid.crc 0x%x
+>36 ulelong x fsid.edition %d
+>40 ulelong x fsid.blocks %d
+>44 ulelong x fsid.files %d
+>48 string >\0 name "%.16s"
+512 ulelong 0x28cd3d45 Linux cramfs offset 512
+>516 ulelong x size %d
+>520 ulelong x flags 0x%x
+>524 ulelong x future 0x%x
+>528 string >\0 signature "%.16s"
+>544 ulelong x fsid.crc 0x%x
+>548 ulelong x fsid.edition %d
+>552 ulelong x fsid.blocks %d
+>556 ulelong x fsid.files %d
+>560 string >\0 name "%.16s"
+
+
+Hacker Notes
+------------
+
+See fs/cramfs/README for filesystem layout and implementation notes.
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
new file mode 100644
index 00000000000..6872c91bce3
--- /dev/null
+++ b/Documentation/filesystems/debugfs.txt
@@ -0,0 +1,188 @@
+Copyright 2009 Jonathan Corbet <corbet@lwn.net>
+
+Debugfs exists as a simple way for kernel developers to make information
+available to user space. Unlike /proc, which is only meant for information
+about a process, or sysfs, which has strict one-value-per-file rules,
+debugfs has no rules at all. Developers can put any information they want
+there. The debugfs filesystem is also intended to not serve as a stable
+ABI to user space; in theory, there are no stability constraints placed on
+files exported there. The real world is not always so simple, though [1];
+even debugfs interfaces are best designed with the idea that they will need
+to be maintained forever.
+
+Debugfs is typically mounted with a command like:
+
+ mount -t debugfs none /sys/kernel/debug
+
+(Or an equivalent /etc/fstab line).
+
+Note that the debugfs API is exported GPL-only to modules.
+
+Code using debugfs should include <linux/debugfs.h>. Then, the first order
+of business will be to create at least one directory to hold a set of
+debugfs files:
+
+ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
+
+This call, if successful, will make a directory called name underneath the
+indicated parent directory. If parent is NULL, the directory will be
+created in the debugfs root. On success, the return value is a struct
+dentry pointer which can be used to create files in the directory (and to
+clean it up at the end). A NULL return value indicates that something went
+wrong. If ERR_PTR(-ENODEV) is returned, that is an indication that the
+kernel has been built without debugfs support and none of the functions
+described below will work.
+
+The most general way to create a file within a debugfs directory is with:
+
+ struct dentry *debugfs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops);
+
+Here, name is the name of the file to create, mode describes the access
+permissions the file should have, parent indicates the directory which
+should hold the file, data will be stored in the i_private field of the
+resulting inode structure, and fops is a set of file operations which
+implement the file's behavior. At a minimum, the read() and/or write()
+operations should be provided; others can be included as needed. Again,
+the return value will be a dentry pointer to the created file, NULL for
+error, or ERR_PTR(-ENODEV) if debugfs support is missing.
+
+In a number of cases, the creation of a set of file operations is not
+actually necessary; the debugfs code provides a number of helper functions
+for simple situations. Files containing a single integer value can be
+created with any of:
+
+ struct dentry *debugfs_create_u8(const char *name, umode_t mode,
+ struct dentry *parent, u8 *value);
+ struct dentry *debugfs_create_u16(const char *name, umode_t mode,
+ struct dentry *parent, u16 *value);
+ struct dentry *debugfs_create_u32(const char *name, umode_t mode,
+ struct dentry *parent, u32 *value);
+ struct dentry *debugfs_create_u64(const char *name, umode_t mode,
+ struct dentry *parent, u64 *value);
+
+These files support both reading and writing the given value; if a specific
+file should not be written to, simply set the mode bits accordingly. The
+values in these files are in decimal; if hexadecimal is more appropriate,
+the following functions can be used instead:
+
+ struct dentry *debugfs_create_x8(const char *name, umode_t mode,
+ struct dentry *parent, u8 *value);
+ struct dentry *debugfs_create_x16(const char *name, umode_t mode,
+ struct dentry *parent, u16 *value);
+ struct dentry *debugfs_create_x32(const char *name, umode_t mode,
+ struct dentry *parent, u32 *value);
+ struct dentry *debugfs_create_x64(const char *name, umode_t mode,
+ struct dentry *parent, u64 *value);
+
+These functions are useful as long as the developer knows the size of the
+value to be exported. Some types can have different widths on different
+architectures, though, complicating the situation somewhat. There is a
+function meant to help out in one special case:
+
+ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
+ struct dentry *parent,
+ size_t *value);
+
+As might be expected, this function will create a debugfs file to represent
+a variable of type size_t.
+
+Boolean values can be placed in debugfs with:
+
+ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
+ struct dentry *parent, u32 *value);
+
+A read on the resulting file will yield either Y (for non-zero values) or
+N, followed by a newline. If written to, it will accept either upper- or
+lower-case values, or 1 or 0. Any other input will be silently ignored.
+
+Another option is exporting a block of arbitrary binary data, with
+this structure and function:
+
+ struct debugfs_blob_wrapper {
+ void *data;
+ unsigned long size;
+ };
+
+ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
+ struct dentry *parent,
+ struct debugfs_blob_wrapper *blob);
+
+A read of this file will return the data pointed to by the
+debugfs_blob_wrapper structure. Some drivers use "blobs" as a simple way
+to return several lines of (static) formatted text output. This function
+can be used to export binary information, but there does not appear to be
+any code which does so in the mainline. Note that all files created with
+debugfs_create_blob() are read-only.
+
+If you want to dump a block of registers (something that happens quite
+often during development, even if little such code reaches mainline),
+debugfs offers two functions: one to make a registers-only file, and
+another to insert a register block in the middle of another sequential
+file.
+
+ struct debugfs_reg32 {
+ char *name;
+ unsigned long offset;
+ };
+
+ struct debugfs_regset32 {
+ struct debugfs_reg32 *regs;
+ int nregs;
+ void __iomem *base;
+ };
+
+ struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ struct dentry *parent,
+ struct debugfs_regset32 *regset);
+
+ int debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
+ int nregs, void __iomem *base, char *prefix);
+
+The "base" argument may be 0, but you may want to build the reg32 array
+using __stringify, and a number of register names (macros) are actually
+byte offsets over a base for the register block.
+
+
+There are a couple of other directory-oriented helper functions:
+
+ struct dentry *debugfs_rename(struct dentry *old_dir,
+ struct dentry *old_dentry,
+ struct dentry *new_dir,
+ const char *new_name);
+
+ struct dentry *debugfs_create_symlink(const char *name,
+ struct dentry *parent,
+ const char *target);
+
+A call to debugfs_rename() will give a new name to an existing debugfs
+file, possibly in a different directory. The new_name must not exist prior
+to the call; the return value is old_dentry with updated information.
+Symbolic links can be created with debugfs_create_symlink().
+
+There is one important thing that all debugfs users must take into account:
+there is no automatic cleanup of any directories created in debugfs. If a
+module is unloaded without explicitly removing debugfs entries, the result
+will be a lot of stale pointers and no end of highly antisocial behavior.
+So all debugfs users - at least those which can be built as modules - must
+be prepared to remove all files and directories they create there. A file
+can be removed with:
+
+ void debugfs_remove(struct dentry *dentry);
+
+The dentry value can be NULL, in which case nothing will be removed.
+
+Once upon a time, debugfs users were required to remember the dentry
+pointer for every debugfs file they created so that all files could be
+cleaned up. We live in more civilized times now, though, and debugfs users
+can call:
+
+ void debugfs_remove_recursive(struct dentry *dentry);
+
+If this function is passed a pointer for the dentry corresponding to the
+top-level directory, the entire hierarchy below that directory will be
+removed.
+
+Notes:
+ [1] http://lwn.net/Articles/309298/
diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
new file mode 100644
index 00000000000..68dffd87f9b
--- /dev/null
+++ b/Documentation/filesystems/devpts.txt
@@ -0,0 +1,132 @@
+
+To support containers, we now allow multiple instances of devpts filesystem,
+such that indices of ptys allocated in one instance are independent of indices
+allocated in other instances of devpts.
+
+To preserve backward compatibility, this support for multiple instances is
+enabled only if:
+
+ - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and
+ - '-o newinstance' mount option is specified while mounting devpts
+
+IOW, devpts now supports both single-instance and multi-instance semantics.
+
+If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and
+this is referred to as the "legacy" mode. In this mode, the new mount options
+(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message
+on console.
+
+If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the
+'newinstance' option (as in current start-up scripts) the new mount binds
+to the initial kernel mount of devpts. This mode is referred to as the
+'single-instance' mode and the current, single-instance semantics are
+preserved, i.e. PTYs are common across the system.
+
+The only difference between this single-instance mode and the legacy mode
+is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which
+can safely be ignored.
+
+If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified,
+the mount is considered to be in the multi-instance mode and a new instance
+of the devpts fs is created. Any ptys created in this instance are independent
+of ptys in other instances of devpts. Like in the single-instance mode, the
+/dev/pts/ptmx node is present. To effectively use the multi-instance mode,
+open of /dev/ptmx must be redirected to '/dev/pts/ptmx' using a symlink or
+bind-mount.
+
+Eg: A container startup script could do the following:
+
+ $ chmod 0666 /dev/pts/ptmx
+ $ rm /dev/ptmx
+ $ ln -s pts/ptmx /dev/ptmx
+ $ ns_exec -cm /bin/bash
+
+ # We are now in new container
+
+ $ umount /dev/pts
+ $ mount -t devpts -o newinstance lxcpts /dev/pts
+ $ sshd -p 1234
+
+where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs
+/bin/bash in the child process. A pty created by the sshd is not visible in
+the original mount of /dev/pts.
+
+User-space changes
+------------------
+
+In multi-instance mode (i.e '-o newinstance' mount option is specified at least
+once), following user-space issues should be noted.
+
+1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored
+ and no change is needed to system-startup scripts.
+
+2. To effectively use multi-instance mode (i.e -o newinstance is specified)
+ administrators or startup scripts should "redirect" open of /dev/ptmx to
+ /dev/pts/ptmx using either a bind mount or symlink.
+
+ $ mount -t devpts -o newinstance devpts /dev/pts
+
+ followed by either
+
+ $ rm /dev/ptmx
+ $ ln -s pts/ptmx /dev/ptmx
+ $ chmod 666 /dev/pts/ptmx
+ or
+ $ mount -o bind /dev/pts/ptmx /dev/ptmx
+
+3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it
+ enables better error-reporting and treats both single-instance and
+ multi-instance mounts similarly.
+
+ But this method requires that system-startup scripts set the mode of
+ /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the
+ mode by, either
+
+ - adding ptmxmode mount option to devpts entry in /etc/fstab, or
+ - using 'chmod 0666 /dev/pts/ptmx'
+
+4. If multi-instance mode mount is needed for containers, but the system
+ startup scripts have not yet been updated, container-startup scripts
+ should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single-
+ instance mounts.
+
+ Or, in general, container-startup scripts should use:
+
+ mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts
+ if [ ! -L /dev/ptmx ]; then
+ mount -o bind /dev/pts/ptmx /dev/ptmx
+ fi
+
+ When all devpts mounts are multi-instance, /dev/ptmx can permanently be
+ a symlink to pts/ptmx and the bind mount can be ignored.
+
+5. A multi-instance mount that is not accompanied by the /dev/ptmx to
+ /dev/pts/ptmx redirection would result in an unusable/unreachable pty.
+
+ mount -t devpts -o newinstance lxcpts /dev/pts
+
+ immediately followed by:
+
+ open("/dev/ptmx")
+
+ would create a pty, say /dev/pts/7, in the initial kernel mount.
+ But /dev/pts/7 would be invisible in the new mount.
+
+6. The permissions for /dev/pts/ptmx node should be specified when mounting
+ /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000).
+
+ mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts
+
+ The permissions can later be changed as usual with 'chmod'.
+
+ chmod 666 /dev/pts/ptmx
+
+7. A mount of devpts without the 'newinstance' option results in binding to
+ the initial kernel mount. This behavior, while preserving legacy semantics,
+ does not provide strict isolation in a container environment, i.e. by
+ mounting devpts without the 'newinstance' option, a container could
+ get visibility into the 'host' or root container's devpts.
+
+ To work around this and have strict isolation, all mounts of devpts,
+ including the mount in the root container, should use the newinstance
+ option.
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
new file mode 100644
index 00000000000..ff7b611abf3
--- /dev/null
+++ b/Documentation/filesystems/directory-locking
@@ -0,0 +1,114 @@
+ Locking scheme used for directory operations is based on two
+kinds of locks - per-inode (->i_mutex) and per-filesystem
+(->s_vfs_rename_mutex).
+
+ For our purposes all operations fall in 6 classes:
+
+1) read access. Locking rules: caller locks directory we are accessing.
+
+2) object creation. Locking rules: same as above.
+
+3) object removal. Locking rules: caller locks parent, finds victim,
+locks victim and calls the method.
+
+4) rename() that is _not_ cross-directory. Locking rules: caller locks
+the parent, finds source and target, if target already exists - locks it
+and then calls the method.
+
+5) link creation. Locking rules:
+ * lock parent
+ * check that source is not a directory
+ * lock source
+ * call the method.
+
+6) cross-directory rename. The trickiest in the whole bunch. Locking
+rules:
+ * lock the filesystem
+ * lock parents in "ancestors first" order.
+ * find source and target.
+ * if old parent is equal to or is a descendent of target
+ fail with -ENOTEMPTY
+ * if new parent is equal to or is a descendent of source
+ fail with -ELOOP
+ * if target exists - lock it.
+ * call the method.
+
+
+The rules above obviously guarantee that all directories that are going to be
+read, modified or removed by method will be locked by caller.
+
+
+If no directory is its own ancestor, the scheme above is deadlock-free.
+Proof:
+
+ First of all, at any moment we have a partial ordering of the
+objects - A < B iff A is an ancestor of B.
+
+ That ordering can change. However, the following is true:
+
+(1) if object removal or non-cross-directory rename holds lock on A and
+ attempts to acquire lock on B, A will remain the parent of B until we
+ acquire the lock on B. (Proof: only cross-directory rename can change
+ the parent of object and it would have to lock the parent).
+
+(2) if cross-directory rename holds the lock on filesystem, order will not
+ change until rename acquires all locks. (Proof: other cross-directory
+ renames will be blocked on filesystem lock and we don't start changing
+ the order until we had acquired all locks).
+
+(3) any operation holds at most one lock on non-directory object and
+ that lock is acquired after all other locks. (Proof: see descriptions
+ of operations).
+
+ Now consider the minimal deadlock. Each process is blocked on
+attempt to acquire some lock and already holds at least one lock. Let's
+consider the set of contended locks. First of all, filesystem lock is
+not contended, since any process blocked on it is not holding any locks.
+Thus all processes are blocked on ->i_mutex.
+
+ Non-directory objects are not contended due to (3). Thus link
+creation can't be a part of deadlock - it can't be blocked on source
+and it means that it doesn't hold any locks.
+
+ Any contended object is either held by cross-directory rename or
+has a child that is also contended. Indeed, suppose that it is held by
+operation other than cross-directory rename. Then the lock this operation
+is blocked on belongs to child of that object due to (1).
+
+ It means that one of the operations is cross-directory rename.
+Otherwise the set of contended objects would be infinite - each of them
+would have a contended child and we had assumed that no object is its
+own descendent. Moreover, there is exactly one cross-directory rename
+(see above).
+
+ Consider the object blocking the cross-directory rename. One
+of its descendents is locked by cross-directory rename (otherwise we
+would again have an infinite set of contended objects). But that
+means that cross-directory rename is taking locks out of order. Due
+to (2) the order hadn't changed since we had acquired filesystem lock.
+But locking rules for cross-directory rename guarantee that we do not
+try to acquire lock on descendent before the lock on ancestor.
+Contradiction. I.e. deadlock is impossible. Q.E.D.
+
+
+ These operations are guaranteed to avoid loop creation. Indeed,
+the only operation that could introduce loops is cross-directory rename.
+Since the only new (parent, child) pair added by rename() is (new parent,
+source), such loop would have to contain these objects and the rest of it
+would have to exist before rename(). I.e. at the moment of loop creation
+rename() responsible for that would be holding filesystem lock and new parent
+would have to be equal to or a descendent of source. But that means that
+new parent had been equal to or a descendent of source since the moment when
+we had acquired filesystem lock and rename() would fail with -ELOOP in that
+case.
+
+ While this locking scheme works for arbitrary DAGs, it relies on
+ability to check that directory is a descendent of another object. Current
+implementation assumes that directory graph is a tree. This assumption is
+also preserved by all operations (cross-directory rename on a tree that would
+not introduce a cycle will leave it a tree and link() fails for directories).
+
+ Notice that "directory" in the above == "anything that might have
+children", so if we are going to introduce hybrid objects we will need
+either to make sure that link(2) doesn't work for them or to make changes
+in is_subdir() that would make it work even in presence of such beasts.
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 00000000000..1b528b2ad80
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page: http://oss.oracle.com/projects/ocfs2
+Tools web page: http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+ DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy to set up and easy to use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read, and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory).
+
+rmdir(2) signals dlmfs to leave the domain.
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory. Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+Use open(2) with O_CREAT to ensure the resource inode is created - dlmfs
+does not automatically create inodes for existing lock resources.
+
+Open Flag Lock Request Type
+--------- -----------------
+O_RDONLY Shared Read
+O_RDWR Exclusive
+
+Open Flag Resulting Locking Behavior
+--------- --------------------------
+O_NONBLOCK Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes who obtain Shared
+Read or higher level locks on the resource.
+
+See Also
+========
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+
+For more information on the VMS distributed locking API.
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt
new file mode 100644
index 00000000000..6baf88f4685
--- /dev/null
+++ b/Documentation/filesystems/dnotify.txt
@@ -0,0 +1,70 @@
+ Linux Directory Notification
+ ============================
+
+ Stephen Rothwell <sfr@canb.auug.org.au>
+
+The intention of directory notification is to allow user applications
+to be notified when a directory, or any of the files in it, are changed.
+The basic mechanism involves the application registering for notification
+on a directory using a fcntl(2) call and the notifications themselves
+being delivered using signals.
+
+The application decides which "events" it wants to be notified about.
+The currently defined events are:
+
+ DN_ACCESS A file in the directory was accessed (read)
+ DN_MODIFY A file in the directory was modified (write,truncate)
+ DN_CREATE A file was created in the directory
+ DN_DELETE A file was unlinked from directory
+ DN_RENAME A file in the directory was renamed
+ DN_ATTRIB A file in the directory had its attributes
+ changed (chmod,chown)
+
+Usually, the application must reregister after each notification, but
+if DN_MULTISHOT is or'ed with the event mask, then the registration will
+remain until explicitly removed (by registering for no events).
+
+By default, SIGIO will be delivered to the process and no other useful
+information. However, if the F_SETSIG fcntl(2) call is used to let the
+kernel know which signal to deliver, a siginfo structure will be passed to
+the signal handler and the si_fd member of that structure will contain the
+file descriptor associated with the directory in which the event occurred.
+
+Preferably the application will choose one of the real time signals
+(SIGRTMIN + <n>) so that the notifications may be queued. This is
+especially important if DN_MULTISHOT is specified. Note that SIGRTMIN
+is often blocked, so it is better to use (at least) SIGRTMIN + 1.
+
+Implementation expectations (features and bugs :-))
+---------------------------
+
+The notification should work for any local access to files even if the
+actual file system is on a remote server. This implies that remote
+access to files served by local user mode servers should be notified.
+Also, remote accesses to files served by a local kernel NFS server should
+be notified.
+
+In order to make the impact on the file system code as small as possible,
+the problem of hard links to files has been ignored. So if a file (x)
+exists in two directories (a and b) then a change to the file using the
+name "a/x" should be notified to a program expecting notifications on
+directory "a", but will not be notified to one expecting notifications on
+directory "b".
+
+Also, files that are unlinked will still cause notifications in the
+last directory that they were linked to.
+
+Configuration
+-------------
+
+Dnotify is controlled via the CONFIG_DNOTIFY configuration option. When
+disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
+
+Example
+-------
+See Documentation/filesystems/dnotify_test.c for an example.
+
+NOTE
+----
+Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
+See Documentation/filesystems/inotify.txt for more information on it.
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c
new file mode 100644
index 00000000000..8b37b4a1e18
--- /dev/null
+++ b/Documentation/filesystems/dnotify_test.c
@@ -0,0 +1,34 @@
+#define _GNU_SOURCE /* needed to get the defines */
+#include <fcntl.h> /* in glibc 2.2 this has the needed
+ values defined */
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static volatile int event_fd;
+
+static void handler(int sig, siginfo_t *si, void *data) /* installed through sa_sigaction in main() */
+{
+ event_fd = si->si_fd; /* save the descriptor carried in the siginfo so main() can print it */
+}
+
+int main(void) /* registers for directory notifications on "." and prints the fd of each event */
+{
+ struct sigaction act;
+ int fd;
+
+ act.sa_sigaction = handler;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO; /* select the three-argument handler form so a siginfo_t is passed */
+ sigaction(SIGRTMIN + 1, &act, NULL);
+
+ fd = open(".", O_RDONLY);
+ fcntl(fd, F_SETSIG, SIGRTMIN + 1); /* ask for SIGRTMIN + 1 as the notification signal for this fd */
+ fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); /* DN_MULTISHOT: stay registered after each event */
+ /* we will now be notified if any of the files
+ in "." is modified or new files are created */
+ while (1) {
+ pause(); /* block until a signal is delivered */
+ printf("Got event on fd=%d\n", event_fd);
+ }
+}
diff --git a/Documentation/filesystems/ecryptfs.txt b/Documentation/filesystems/ecryptfs.txt
new file mode 100644
index 00000000000..01d8a08351a
--- /dev/null
+++ b/Documentation/filesystems/ecryptfs.txt
@@ -0,0 +1,77 @@
+eCryptfs: A stacked cryptographic filesystem for Linux
+
+eCryptfs is free software. Please see the file COPYING for details.
+For documentation, please see the files in the doc/ subdirectory. For
+building and installation instructions please see the INSTALL file.
+
+Maintainer: Phillip Hellewell
+Lead developer: Michael A. Halcrow <mhalcrow@us.ibm.com>
+Developers: Michael C. Thompson
+ Kent Yoder
+Web Site: http://ecryptfs.sf.net
+
+This software is currently undergoing development. Make sure to
+maintain a backup copy of any data you write into eCryptfs.
+
+eCryptfs requires the userspace tools downloadable from the
+SourceForge site:
+
+http://sourceforge.net/projects/ecryptfs/
+
+Userspace requirements include:
+ - David Howells' userspace keyring headers and libraries (version
+ 1.0 or higher), obtainable from
+ http://people.redhat.com/~dhowells/keyutils/
+ - Libgcrypt
+
+
+NOTES
+
+In the beta/experimental releases of eCryptfs, when you upgrade
+eCryptfs, you should copy the files to an unencrypted location and
+then copy the files back into the new eCryptfs mount to migrate the
+files.
+
+
+MOUNT-WIDE PASSPHRASE
+
+Create a new directory into which eCryptfs will write its encrypted
+files (e.g., /root/crypt). Then, create the mount point directory
+(e.g., /mnt/crypt). Now it's time to mount eCryptfs:
+
+mount -t ecryptfs /root/crypt /mnt/crypt
+
+You should be prompted for a passphrase and a salt (the salt may be
+blank).
+
+Try writing a new file:
+
+echo "Hello, World" > /mnt/crypt/hello.txt
+
+The operation will complete. Notice that there is a new file in
+/root/crypt that is at least 12288 bytes in size (depending on your
+host page size). This is the encrypted underlying file for what you
+just wrote. To test reading, from start to finish, you need to clear
+the user session keyring:
+
+keyctl clear @u
+
+Then umount /mnt/crypt and mount again per the instructions given
+above.
+
+cat /mnt/crypt/hello.txt
+
+
+NOTES
+
+eCryptfs version 0.1 should only be mounted on (1) empty directories
+or (2) directories containing files only created by eCryptfs. If you
+mount a directory that has pre-existing files not created by eCryptfs,
+then behavior is undefined. Do not run eCryptfs in higher verbosity
+levels unless you are doing so for the sole purpose of debugging or
+development, since secret values will be written out to the system log
+in that case.
+
+
+Mike Halcrow
+mhalcrow@us.ibm.com
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
new file mode 100644
index 00000000000..23583a13697
--- /dev/null
+++ b/Documentation/filesystems/exofs.txt
@@ -0,0 +1,185 @@
+===============================================================================
+WHAT IS EXOFS?
+===============================================================================
+
+exofs is a file system that uses an OSD and exports the API of a normal Linux
+file system. Users access exofs like any other local file system, and exofs
+will in turn issue commands to the local OSD initiator.
+
+OSD is a new T10 command set that views storage devices not as a large/flat
+array of sectors but as a container of objects, each having a length, quota,
+time attributes and more. Each object is addressed by a 64bit ID, and is
+contained in a 64bit ID partition. Each object has associated attributes
+attached to it, which are an integral part of the object and provide metadata about
+the object. The standard defines some common obligatory attributes, but user
+attributes can be added as needed.
+
+===============================================================================
+ENVIRONMENT
+===============================================================================
+
+To use this file system, you need to have an object store to run it on. You
+may download a target from:
+http://open-osd.org
+
+See Documentation/scsi/osd.txt for how to set up a working osd environment.
+
+===============================================================================
+USAGE
+===============================================================================
+
+1. Download and compile exofs and open-osd initiator:
+ You need an external Kernel source tree or kernel headers from your
+ distribution. (anything based on 2.6.26 or later).
+
+ a. download open-osd including exofs source using:
+ [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
+
+ b. Build the library module like this:
+ [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
+
+ This will build both the open-osd initiator as well as the exofs kernel
+ module. Use whatever parameters you compiled your Kernel with and
+ $(KER_DIR) above pointing to the Kernel you compile against. See the file
+ open-osd/top-level-Makefile for an example.
+
+2. Get the OSD initiator and target set up properly, and login to the target.
+ See Documentation/scsi/osd.txt for further instructions. Also see ./do-osd
+ for an example script that does all these steps.
+
+3. Insmod the exofs.ko module:
+ [exofs]$ insmod exofs.ko
+
+4. Make sure the directory where you want to mount exists. If not, create it.
+ (For example, mkdir /mnt/exofs)
+
+5. At first run you will need to invoke the mkfs.exofs application
+
+ As an example, this will create the file system on:
+ /dev/osd0 partition ID 65536
+
+ mkfs.exofs --pid=65536 --format /dev/osd0
+
+ The --format is optional. If not specified, no OSD_FORMAT will be
+ performed and a clean file system will be created in the specified pid,
+ in the available space of the target. (Use --format=size_in_meg to limit
+ the total LUN space available)
+
+ If pid already exists, it will be deleted and a new one will be created in
+ its place. Be careful.
+
+ An exofs lives inside a single OSD partition. You can create multiple exofs
+ filesystems on the same device using multiple pids.
+
+ (run mkfs.exofs without any parameters for usage help message)
+
+6. Mount the file system.
+
+ For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs:
+
+ mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/
+
+7. For reference (See do-exofs example script):
+ do-exofs start - an example of how to perform the above steps.
+ do-exofs stop - an example of how to unmount the file system.
+ do-exofs format - an example of how to format and mkfs a new exofs.
+
+8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
+ CONFIG_EXOFS_DEBUG - for debug messages and extra checks.
+
+===============================================================================
+exofs mount options
+===============================================================================
+Similar to any mount command:
+ mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
+
+Where:
+ -t exofs: specifies the exofs file system
+
+ /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
+ login into an OSD target.
+
+ mount_exofs_directory: The directory to mount the file system on
+
+ exofs specific options: Options are separated by commas (,)
+ pid=<integer> - The partition number to mount/create as
+ container of the filesystem.
+ This option is mandatory. integer can be
+ Hex by pre-pending an 0x to the number.
+ osdname=<id> - Mount by a device's osdname.
+ osdname is usually a 36 character uuid of the
+ form "d2683732-c906-4ee1-9dbd-c10c27bb40df".
+ It is one of the device's uuid specified in the
+ mkfs.exofs format command.
+ If this option is specified then the /dev/osdX
+ above can be empty and is ignored.
+ to=<integer> - Timeout in ticks for a single command.
+ default is (60 * HZ) [for debugging only]
+
+===============================================================================
+DESIGN
+===============================================================================
+
+* The file system control block (AKA on-disk superblock) resides in an object
+ with a special ID (defined in common.h).
+ Information included in the file system control block is used to fill the
+ in-memory superblock structure at mount time. This object is created before
+ the file system is used by mkexofs.c. It contains information such as:
+ - The file system's magic number
+ - The next inode number to be allocated
+
+* Each file resides in its own object and contains the data (and it will be
+ possible to extend the file over multiple objects, though this has not been
+ implemented yet).
+
+* A directory is treated as a file, and essentially contains a list of <file
+ name, inode #> pairs for files that are found in that directory. The object
+ IDs correspond to the files' inode numbers and will be allocated according to
+ a bitmap (stored in a separate object). Now they are allocated using a
+ counter.
+
+* Each file's control block (AKA on-disk inode) is stored in its object's
+ attributes. This applies to both regular files and other types (directories,
+ device files, symlinks, etc.).
+
+* Credentials are generated per object (inode and superblock) when they are
+ created in memory (read from disk or created). The credential works for all
+ operations and is used as long as the object remains in memory.
+
+* Async OSD operations are used whenever possible, but the target may execute
+ them out of order. The operations that concern us are create, delete,
+ readpage, writepage, update_inode, and truncate. The following pairs of
+ operations should execute in the order written, and we need to prevent them
+ from executing in reverse order:
+ - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
+ flags. OBJ_CREATED is set when we know the object exists on the OSD -
+ in create's callback function, and when we successfully do a
+ read_inode.
+ OBJ_2BCREATED is set in the beginning of the create function, so we
+ know that we should wait.
+ - create/delete: delete should wait until the object is created
+ on the OSD.
+ - create/readpage: readpage should be able to return a page
+ full of zeroes in this case. If there was a write already
+ en-route (i.e. create, writepage, readpage) then the page
+ would be locked, and so it would really be the same as
+ create/writepage.
+ - create/writepage: if writepage is called for a sync write, it
+ should wait until the object is created on the OSD.
+ Otherwise, it should just return.
+ - create/truncate: truncate should wait until the object is
+ created on the OSD.
+ - create/update_inode: update_inode should wait until the
+ object is created on the OSD.
+ - Handled by VFS locks:
+ - readpage/delete: shouldn't happen because of page lock.
+ - writepage/delete: shouldn't happen because of page lock.
+ - readpage/writepage: shouldn't happen because of page lock.
+
+===============================================================================
+LICENSE/COPYRIGHT
+===============================================================================
+The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
+version 2.6.10). All files include the original copyrights, and the license
+is GPL version 2 (only version 2, as is true for the Linux kernel). The
+Linux kernel can be downloaded from www.kernel.org.
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
new file mode 100644
index 00000000000..67639f905f1
--- /dev/null
+++ b/Documentation/filesystems/ext2.txt
@@ -0,0 +1,383 @@
+
+The Second Extended Filesystem
+==============================
+
+ext2 was originally released in January 1993. Written by R\'emy Card,
+Theodore Ts'o and Stephen Tweedie, it was a major rewrite of the
+Extended Filesystem. It is currently still (April 2001) the predominant
+filesystem in use by Linux. There are also implementations available
+for NetBSD, FreeBSD, the GNU HURD, Windows 95/98/NT, OS/2 and RISC OS.
+
+Options
+=======
+
+Most defaults are determined by the filesystem superblock, and can be
+set using tune2fs(8). Kernel-determined defaults are indicated by (*).
+
+bsddf (*) Makes `df' act like BSD.
+minixdf Makes `df' act like Minix.
+
+check=none, nocheck (*) Don't do extra checking of bitmaps on mount
+ (check=normal and check=strict options removed)
+
+debug Extra debugging information is sent to the
+ kernel syslog. Useful for developers.
+
+errors=continue Keep going on a filesystem error.
+errors=remount-ro Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+
+grpid, bsdgroups Give objects the same group ID as their parent.
+nogrpid, sysvgroups New objects have the group ID of their creator.
+
+nouid32 Use 16-bit UIDs and GIDs.
+
+oldalloc Enable the old block allocator. Orlov should
+ have better performance, we'd like to get some
+ feedback if it's the contrary for you.
+orlov (*) Use the Orlov block allocator.
+ (See http://lwn.net/Articles/14633/ and
+ http://lwn.net/Articles/14446/.)
+
+resuid=n The user ID which may use the reserved blocks.
+resgid=n The group ID which may use the reserved blocks.
+
+sb=n Use alternate superblock at this location.
+
+user_xattr Enable "user." POSIX Extended Attributes
+ (requires CONFIG_EXT2_FS_XATTR).
+ See also http://acl.bestbits.at
+nouser_xattr Don't support "user." extended attributes.
+
+acl Enable POSIX Access Control Lists support
+ (requires CONFIG_EXT2_FS_POSIX_ACL).
+ See also http://acl.bestbits.at
+noacl Don't support POSIX ACLs.
+
+nobh Do not attach buffer_heads to file pagecache.
+
+xip Use execute in place (no caching) if possible
+
+grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+
+
+Specification
+=============
+
+ext2 shares many properties with traditional Unix filesystems. It has
+the concepts of blocks, inodes and directories. It has space in the
+specification for Access Control Lists (ACLs), fragments, undeletion and
+compression though these are not yet implemented (some are available as
+separate patches). There is also a versioning mechanism to allow new
+features (such as journalling) to be added in a maximally compatible
+manner.
+
+Blocks
+------
+
+The space in the device or file is split up into blocks. These are
+a fixed size, of 1024, 2048 or 4096 bytes (8192 bytes on Alpha systems),
+which is decided when the filesystem is created. Smaller blocks mean
+less wasted space per file, but require slightly more accounting overhead,
+and also impose other limits on the size of files and the filesystem.
+
+Block Groups
+------------
+
+Blocks are clustered into block groups in order to reduce fragmentation
+and minimise the amount of head seeking when reading a large amount
+of consecutive data. Information about each block group is kept in a
+descriptor table stored in the block(s) immediately after the superblock.
+Two blocks near the start of each group are reserved for the block usage
+bitmap and the inode usage bitmap which show which blocks and inodes
+are in use. Since each bitmap is limited to a single block, this means
+that the maximum number of blocks in a block group is 8 times the size
+of a block in bytes (one bitmap bit per block).
+
+The block(s) following the bitmaps in each block group are designated
+as the inode table for that block group and the remainder are the data
+blocks. The block allocation algorithm attempts to allocate data blocks
+in the same block group as the inode which contains them.
+
+The Superblock
+--------------
+
+The superblock contains all the information about the configuration of
+the filing system. The primary copy of the superblock is stored at an
+offset of 1024 bytes from the start of the device, and it is essential
+to mounting the filesystem. Since it is so important, backup copies of
+the superblock are stored in block groups throughout the filesystem.
+The first version of ext2 (revision 0) stores a copy at the start of
+every block group, along with backups of the group descriptor block(s).
+Because this can consume a considerable amount of space for large
+filesystems, later revisions can optionally reduce the number of backup
+copies by only putting backups in specific groups (this is the sparse
+superblock feature). The groups chosen are 0, 1 and powers of 3, 5 and 7.
+
+The information in the superblock contains fields such as the total
+number of inodes and blocks in the filesystem and how many are free,
+how many inodes and blocks are in each block group, when the filesystem
+was mounted (and if it was cleanly unmounted), when it was modified,
+what version of the filesystem it is (see the Revisions section below)
+and which OS created it.
+
+If the filesystem is revision 1 or higher, then there are extra fields,
+such as a volume name, a unique identification number, the inode size,
+and space for optional filesystem features to store configuration info.
+
+All fields in the superblock (as in all other ext2 structures) are stored
+on the disc in little endian format, so a filesystem is portable between
+machines without having to know what machine it was created on.
+
+Inodes
+------
+
+The inode (index node) is a fundamental concept in the ext2 filesystem.
+Each object in the filesystem is represented by an inode. The inode
+structure contains pointers to the filesystem blocks which contain the
+data held in the object and all of the metadata about an object except
+its name. The metadata about an object includes the permissions, owner,
+group, flags, size, number of blocks used, access time, change time,
+modification time, deletion time, number of links, fragments, version
+(for NFS) and extended attributes (EAs) and/or Access Control Lists (ACLs).
+
+There are some reserved fields which are currently unused in the inode
+structure and several which are overloaded. One field is reserved for the
+directory ACL if the inode is a directory and alternately for the top 32
+bits of the file size if the inode is a regular file (allowing file sizes
+larger than 2GB). The translator field is unused under Linux, but is used
+by the HURD to reference the inode of a program which will be used to
+interpret this object. Most of the remaining reserved fields have been
+used up for both Linux and the HURD for larger owner and group fields.
+The HURD also has a larger mode field so it uses another of the remaining
+fields to store the extra mode bits.
+
+There are pointers to the first 12 blocks which contain the file's data
+in the inode. There is a pointer to an indirect block (which contains
+pointers to the next set of blocks), a pointer to a doubly-indirect
+block (which contains pointers to indirect blocks) and a pointer to a
+trebly-indirect block (which contains pointers to doubly-indirect blocks).
+
+The flags field contains some ext2-specific flags which aren't catered
+for by the standard chmod flags. These flags can be listed with lsattr
+and changed with the chattr command, and allow specific filesystem
+behaviour on a per-file basis. There are flags for secure deletion,
+undeletable, compression, synchronous updates, immutability, append-only,
+dumpable, no-atime, indexed directories, and data-journaling. Not all
+of these are supported yet.
+
+Directories
+-----------
+
+A directory is a filesystem object and has an inode just like a file.
+It is a specially formatted file containing records which associate
+each name with an inode number. Later revisions of the filesystem also
+encode the type of the object (file, directory, symlink, device, fifo,
+socket) to avoid the need to check the inode itself for this information
+(support for taking advantage of this feature does not yet exist in
+Glibc 2.2).
+
+The inode allocation code tries to assign inodes which are in the same
+block group as the directory in which they are first created.
+
+The current implementation of ext2 uses a singly-linked list to store
+the filenames in the directory; a pending enhancement uses hashing of the
+filenames to allow lookup without the need to scan the entire directory.
+
+The current implementation never removes empty directory blocks once they
+have been allocated to hold more files.
+
+Special files
+-------------
+
+Symbolic links are also filesystem objects with inodes. They deserve
+special mention because the data for them is stored within the inode
+itself if the symlink is less than 60 bytes long. It uses the fields
+which would normally be used to store the pointers to data blocks.
+This is a worthwhile optimisation as we avoid allocating a full
+block for the symlink, and most symlinks are less than 60 characters long.
+
+Character and block special devices never have data blocks assigned to
+them. Instead, their device number is stored in the inode, again reusing
+the fields which would be used to point to the data blocks.
+
+Reserved Space
+--------------
+
+In ext2, there is a mechanism for reserving a certain number of blocks
+for a particular user (normally the super-user). This is intended to
+allow for the system to continue functioning even if non-privileged users
+fill up all the space available to them (this is independent of filesystem
+quotas). It also keeps the filesystem from filling up entirely which
+helps combat fragmentation.
+
+Filesystem check
+----------------
+
+At boot time, most systems run a consistency check (e2fsck) on their
+filesystems. The superblock of the ext2 filesystem contains several
+fields which indicate whether fsck should actually run (since checking
+the filesystem at boot can take a long time if it is large). fsck will
+run if the filesystem was not cleanly unmounted, if the maximum mount
+count has been exceeded or if the maximum time between checks has been
+exceeded.
+
+Feature Compatibility
+---------------------
+
+The compatibility feature mechanism used in ext2 is sophisticated.
+It safely allows features to be added to the filesystem, without
+unnecessarily sacrificing compatibility with older versions of the
+filesystem code. The feature compatibility mechanism is not supported by
+the original revision 0 (EXT2_GOOD_OLD_REV) of ext2, but was introduced in
+revision 1. There are three 32-bit fields, one for compatible features
+(COMPAT), one for read-only compatible (RO_COMPAT) features and one for
+incompatible (INCOMPAT) features.
+
+These feature flags have specific meanings for the kernel as follows:
+
+A COMPAT flag indicates that a feature is present in the filesystem,
+but the on-disk format is 100% compatible with older on-disk formats, so
+a kernel which didn't know anything about this feature could read/write
+the filesystem without any chance of corrupting the filesystem (or even
+making it inconsistent). This is essentially just a flag which says
+"this filesystem has a (hidden) feature" that the kernel or e2fsck may
+want to be aware of (more on e2fsck and feature flags later). The ext3
+HAS_JOURNAL feature is a COMPAT flag because the ext3 journal is simply
+a regular file with data blocks in it so the kernel does not need to
+take any special notice of it if it doesn't understand ext3 journaling.
+
+An RO_COMPAT flag indicates that the on-disk format is 100% compatible
+with older on-disk formats for reading (i.e. the feature does not change
+the visible on-disk format). However, an old kernel writing to such a
+filesystem would/could corrupt the filesystem, so this is prevented. The
+most common such feature, SPARSE_SUPER, is an RO_COMPAT feature because
+sparse groups allow file data blocks where superblock/group descriptor
+backups used to live, and ext2_free_blocks() refuses to free these blocks,
+which would lead to inconsistent bitmaps. An old kernel would also
+get an error if it tried to free a series of blocks which crossed a group
+boundary, but this is a legitimate layout in a SPARSE_SUPER filesystem.
+
+An INCOMPAT flag indicates the on-disk format has changed in some
+way that makes it unreadable by older kernels, or would otherwise
+cause a problem if an old kernel tried to mount it. FILETYPE is an
+INCOMPAT flag because older kernels would think a filename was longer
+than 256 characters, which would lead to corrupt directory listings.
+The COMPRESSION flag is an obvious INCOMPAT flag - if the kernel
+doesn't understand compression, you would just get garbage back from
+read() instead of it automatically decompressing your data. The ext3
+RECOVER flag is needed to prevent a kernel which does not understand the
+ext3 journal from mounting the filesystem without replaying the journal.
+
+For e2fsck, it needs to be more strict with the handling of these
+flags than the kernel. If it doesn't understand ANY of the COMPAT,
+RO_COMPAT, or INCOMPAT flags it will refuse to check the filesystem,
+because it has no way of verifying whether a given feature is valid
+or not. Allowing e2fsck to succeed on a filesystem with an unknown
+feature is a false sense of security for the user. Refusing to check
+a filesystem with unknown features is a good incentive for the user to
+update to the latest e2fsck. This also means that anyone adding feature
+flags to ext2 also needs to update e2fsck to verify these features.
+
+Metadata
+--------
+
+It is frequently claimed that the ext2 implementation of writing
+asynchronous metadata is faster than the ffs synchronous metadata
+scheme but less reliable. Both methods are equally resolvable by their
+respective fsck programs.
+
+If you're exceptionally paranoid, there are 3 ways of making metadata
+writes synchronous on ext2:
+
+per-file if you have the program source: use the O_SYNC flag to open()
+per-file if you don't have the source: use "chattr +S" on the file
+per-filesystem: add the "sync" option to mount (or in /etc/fstab)
+
+The first and last are not ext2 specific but do force the metadata to
+be written synchronously. See also Journaling below.
+
+Limitations
+-----------
+
+There are various limits imposed by the on-disk layout of ext2. Other
+limits are imposed by the current implementation of the kernel code.
+Many of the limits are determined at the time the filesystem is first
+created, and depend upon the block size chosen. The ratio of inodes to
+data blocks is fixed at filesystem creation time, so the only way to
+increase the number of inodes is to increase the size of the filesystem.
+No tools currently exist which can change the ratio of inodes to blocks.
+
+Most of these limits could be overcome with slight changes in the on-disk
+format and using a compatibility flag to signal the format change (at
+the expense of some compatibility).
+
+Filesystem block size: 1kB 2kB 4kB 8kB
+
+File size limit: 16GB 256GB 2048GB 2048GB
+Filesystem size limit: 2047GB 8192GB 16384GB 32768GB
+
+There is a 2.4 kernel limit of 2048GB for a single block device, so no
+filesystem larger than that can be created at this time. There is also
+an upper limit on the block size imposed by the page size of the kernel,
+so 8kB blocks are only allowed on Alpha systems (and other architectures
+which support larger pages).
+
+There is an upper limit of 32000 subdirectories in a single directory.
+
+There is a "soft" upper limit of about 10-15k files in a single directory
+with the current linear linked-list directory implementation. This limit
+stems from performance problems when creating and deleting (and also
+finding) files in such large directories. Using a hashed directory index
+(under development) allows 100k-1M+ files in a single directory without
+performance problems (although RAM size becomes an issue at this point).
+
+The (meaningless) absolute upper limit of files in a single directory
+(imposed by the file size, the realistic limit is obviously much less)
+is over 130 trillion files. It would be higher except there are not
+enough 4-character names to make up unique directory entries, so they
+have to be 8 character filenames, even then we are fairly close to
+running out of unique filenames.
+
+Journaling
+----------
+
+A journaling extension to the ext2 code has been developed by Stephen
+Tweedie. It avoids the risks of metadata corruption and the need to
+wait for e2fsck to complete after a crash, without requiring a change
+to the on-disk ext2 layout. In a nutshell, the journal is a regular
+file which stores whole metadata (and optionally data) blocks that have
+been modified, prior to writing them into the filesystem. This means
+it is possible to add a journal to an existing ext2 filesystem without
+the need for data conversion.
+
+When changes are made to the filesystem (e.g. a file is renamed) they are stored in
+a transaction in the journal and can either be complete or incomplete at
+the time of a crash. If a transaction is complete at the time of a crash
+(or in the normal case where the system does not crash), then any blocks
+in that transaction are guaranteed to represent a valid filesystem state,
+and are copied into the filesystem. If a transaction is incomplete at
+the time of the crash, then there is no guarantee of consistency for
+the blocks in that transaction so they are discarded (which means any
+filesystem changes they represent are also lost).
+Check Documentation/filesystems/ext3.txt if you want to read more about
+ext3 and journaling.
+
+References
+==========
+
+The kernel source file:/usr/src/linux/fs/ext2/
+e2fsprogs (e2fsck) http://e2fsprogs.sourceforge.net/
+Design & Implementation http://e2fsprogs.sourceforge.net/ext2intro.html
+Journaling (ext3) ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/
+Filesystem Resizing http://ext2resize.sourceforge.net/
+Compression (*) http://e2compr.sourceforge.net/
+
+Implementations for:
+Windows 95/98/NT/2000 http://www.chrysocome.net/explore2fs
+Windows 95 (*) http://www.yipton.net/content.html#FSDEXT2
+DOS client (*) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
+OS/2 (+) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/
+RISC OS client http://www.esw-heim.tu-clausthal.de/~marco/smorbrod/IscaFS/
+
+(*) no longer actively developed/supported (as of Apr 2001)
+(+) no longer actively developed/supported (as of Mar 2009)
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
new file mode 100644
index 00000000000..b100adc38ad
--- /dev/null
+++ b/Documentation/filesystems/ext3.txt
@@ -0,0 +1,214 @@
+
+Ext3 Filesystem
+===============
+
+Ext3 was originally released in September 1999. Written by Stephen Tweedie
+for the 2.2 branch, and ported to 2.4 kernels by Peter Braam, Andreas Dilger,
+Andrew Morton, Alexander Viro, Ted Ts'o and Stephen Tweedie.
+
+Ext3 is the ext2 filesystem enhanced with journalling capabilities.
+
+Options
+=======
+
+When mounting an ext3 filesystem, the following options are accepted:
+(*) == default
+
+ro Mount filesystem read only. Note that ext3 will replay
+ the journal (and thus write to the partition) even when
+ mounted "read only". Mount options "ro,noload" can be
+ used to prevent writes to the filesystem.
+
+journal=update Update the ext3 file system's journal to the current
+ format.
+
+journal=inum When a journal already exists, this option is ignored.
+ Otherwise, it specifies the number of the inode which
+ will represent the ext3 file system's journal file.
+
+journal_dev=devnum When the external journal device's major/minor numbers
+ have changed, this option allows the user to specify
+ the new journal location. The journal device is
+ identified through its new major/minor numbers encoded
+ in devnum.
+
+norecovery Don't load the journal on mounting. Note that this forces
+noload mount of inconsistent filesystem, which can lead to
+ various problems.
+
+data=journal All data are committed into the journal prior to being
+ written into the main file system.
+
+data=ordered (*) All data are forced directly out to the main file
+ system prior to its metadata being committed to the
+ journal.
+
+data=writeback Data ordering is not preserved, data may be written
+ into the main file system after its metadata has been
+ committed to the journal.
+
+commit=nrsec (*) Ext3 can be told to sync all its data and metadata
+ every 'nrsec' seconds. The default value is 5 seconds.
+ This means that if you lose your power, you will lose
+ as much as the latest 5 seconds of work (your
+ filesystem will not be damaged though, thanks to the
+ journaling). This default value (or any low value)
+ will hurt performance, but it's good for data-safety.
+ Setting it to 0 will have the same effect as leaving
+ it at the default (5 seconds).
+ Setting it to very large values will improve
+ performance.
+
+barrier=<0(*)|1> This enables/disables the use of write barriers in
+barrier the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier (*) This also requires an IO stack which can support
+ barriers, and if jbd gets an error on a barrier
+ write, it will disable again with a warning.
+ Write barriers enforce proper on-disk ordering
+ of journal commits, making volatile disk write caches
+ safe to use, at some performance penalty. If
+ your disks are battery-backed in one way or another,
+ disabling barriers may safely improve performance.
+ The mount options "barrier" and "nobarrier" can
+ also be used to enable or disable barriers, for
+ consistency with other ext3 mount options.
+
+user_xattr Enables Extended User Attributes. Additionally, you
+ need to have extended attribute support enabled in the
+ kernel configuration (CONFIG_EXT3_FS_XATTR). See the
+ attr(5) manual page and http://acl.bestbits.at/ to
+ learn more about extended attributes.
+
+nouser_xattr Disables Extended User Attributes.
+
+acl Enables POSIX Access Control Lists support.
+ Additionally, you need to have ACL support enabled in
+ the kernel configuration (CONFIG_EXT3_FS_POSIX_ACL).
+ See the acl(5) manual page and http://acl.bestbits.at/
+ for more information.
+
+noacl This option disables POSIX Access Control List
+ support.
+
+reservation
+
+noreservation
+
+bsddf (*) Make 'df' act like BSD.
+minixdf Make 'df' act like Minix.
+
+check=none Don't do extra checking of bitmaps on mount.
+nocheck
+
+debug Extra debugging information is sent to syslog.
+
+errors=remount-ro Remount the filesystem read-only on an error.
+errors=continue Keep going on a filesystem error.
+errors=panic Panic and halt the machine if an error occurs.
+ (These mount options override the errors behavior
+ specified in the superblock, which can be
+ configured using tune2fs.)
+
+data_err=ignore(*) Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+data_err=abort Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+
+grpid Give objects the same group ID as their parent directory.
+bsdgroups
+
+nogrpid (*) New objects have the group ID of their creator.
+sysvgroups
+
+resgid=n The group ID which may use the reserved blocks.
+
+resuid=n The user ID which may use the reserved blocks.
+
+sb=n Use alternate superblock at this location.
+
+quota These options are ignored by the filesystem. They
+noquota are used only by quota tools to recognize volumes
+grpquota where quota should be turned on. See documentation
+usrquota in the quota-tools package for more details
+ (http://sourceforge.net/projects/linuxquota).
+
+jqfmt=<quota type> These options tell filesystem details about quota
+usrjquota=<file> so that quota information can be properly updated
+grpjquota=<file> during journal replay. They replace the above
+ quota options. See documentation in the quota-tools
+ package for more details
+ (http://sourceforge.net/projects/linuxquota).
+
+Specification
+=============
+Ext3 shares all disk implementation with the ext2 filesystem, and adds
+transaction capabilities to ext2. Journaling is done by the Journaling Block
+Device layer.
+
+Journaling Block Device layer
+-----------------------------
+The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed
+to add journaling capabilities to a block device. The ext3 filesystem code
+will inform the JBD of modifications it is performing (called a transaction).
+The journal supports transaction start and stop, and in case of a crash,
+the journal can replay the transactions to quickly put the partition back into
+a consistent state.
+
+Handles represent a single atomic update to a filesystem. JBD can handle an
+external journal on a block device.
+
+Data Mode
+---------
+There are 3 different data modes:
+
+* writeback mode
+In data=writeback mode, ext3 does not journal data at all. This mode provides
+a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+mode - metadata journaling. A crash+recovery can cause incorrect data to
+appear in files which were written shortly before the crash. This mode will
+typically provide the best ext3 performance.
+
+* ordered mode
+In data=ordered mode, ext3 only officially journals metadata, but it logically
+groups metadata and data blocks into a single unit called a transaction. When
+it's time to write the new metadata out to disk, the associated data blocks
+are written first. In general, this mode performs slightly slower than
+writeback but significantly faster than journal mode.
+
+* journal mode
+data=journal mode provides full data and metadata journaling. All new data is
+written to the journal first, and then to its final location.
+In the event of a crash, the journal can be replayed, bringing both data and
+metadata into a consistent state. This mode is the slowest except when data
+needs to be read from and written to disk at the same time where it
+outperforms all other modes.
+
+Compatibility
+-------------
+
+Ext2 partitions can be easily converted to ext3, with `tune2fs -j <dev>`.
+Ext3 is fully compatible with Ext2. Ext3 partitions can easily be mounted as
+Ext2.
+
+
+External Tools
+==============
+See manual pages to learn more.
+
+tune2fs: create an ext3 journal on an ext2 partition with the -j flag.
+mke2fs: create an ext3 partition with the -j flag.
+debugfs: ext2 and ext3 file system debugger.
+ext2online: online (mounted) ext2 and ext3 filesystem resizer
+
+
+References
+==========
+
+kernel source: <file:fs/ext3/>
+ <file:fs/jbd/>
+
+programs: http://e2fsprogs.sourceforge.net/
+ http://ext2resize.sourceforge.net
+
+useful links: http://www.ibm.com/developerworks/library/l-fs7/index.html
+ http://www.ibm.com/developerworks/library/l-fs8/index.html
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
new file mode 100644
index 00000000000..10ec4639f15
--- /dev/null
+++ b/Documentation/filesystems/ext4.txt
@@ -0,0 +1,604 @@
+
+Ext4 Filesystem
+===============
+
+Ext4 is an advanced level of the ext3 filesystem which incorporates
+scalability and reliability enhancements for supporting large filesystems
+(64 bit) in keeping with increasing disk capacities and state-of-the-art
+feature requirements.
+
+Mailing list: linux-ext4@vger.kernel.org
+Web site: http://ext4.wiki.kernel.org
+
+
+1. Quick usage instructions:
+===========================
+
+Note: More extensive information for getting started with ext4 can be
+ found at the ext4 wiki site at the URL:
+ http://ext4.wiki.kernel.org/index.php/Ext4_Howto
+
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41.3) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
+ ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
+
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Note that it is highly important to install the mke2fs.conf file
+ that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
+ you have edited the /etc/mke2fs.conf file installed on your system,
+ you will need to merge your changes with the version from e2fsprogs
+ 1.41.x.
+
+ - Create a new filesystem using the ext4 filesystem type:
+
+ # mke2fs -t ext4 /dev/hda1
+
+ Or to configure an existing ext3 filesystem to support extents:
+
+ # tune2fs -O extents /dev/hda1
+
+ If the filesystem was created with 128 byte inodes, it can be
+ converted to use 256 byte for greater efficiency via:
+
+ # tune2fs -I 256 /dev/hda1
+
+ (Note: we currently do not have tools to convert an ext4
+ filesystem back to ext3; so please do not try this on production
+ filesystems.)
+
+ - Mounting:
+
+ # mount -t ext4 /dev/hda1 /wherever
+
+ - When comparing performance with other filesystems, it's always
+ important to try multiple workloads; very often a subtle change in a
+ workload parameter can completely change the ranking of which
+ filesystems do well compared to others. When comparing versus ext3,
+ note that ext4 enables write barriers by default, while ext3 does
+ not enable write barriers by default. So it is useful to
+ explicitly specify whether barriers are enabled or not via the
+ '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems
+ for a fair comparison. When tuning ext3 for best benchmark numbers,
+ it is often worthwhile to try changing the data journaling mode; '-o
+ data=writeback' can be faster for some workloads. (Note however that
+ running mounted with data=writeback can potentially leave stale data
+ exposed in recently written files in case of an unclean shutdown,
+ which could be a security exposure in some situations.) Configuring
+ the filesystem with a large journal can also be helpful for
+ metadata-intensive workloads.
+
+2. Features
+===========
+
+2.1 Currently available
+
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
+* extent format reduces metadata overhead (RAM, IO for access, transactions)
+* extent format more robust in face of on-disk corruption due to magics,
+  internal redundancy in tree
+* improved file allocation (multi-block alloc)
+* lift 32000 subdirectory limit imposed by i_links_count[1]
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g. for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficient new ordered mode in JBD2 and ext4(avoid using buffer head to force
+ the ordering)
+
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
+2.2 Candidate features for future inclusion
+
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjunction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)
+
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
+
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:
+
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
+ - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
+
+3. Options
+==========
+
+When mounting an ext4 filesystem, the following options are accepted:
+(*) == default
+
+ro Mount filesystem read only. Note that ext4 will
+ replay the journal (and thus write to the
+ partition) even when mounted "read only". The
+ mount options "ro,noload" can be used to prevent
+ writes to the filesystem.
+
+journal_checksum Enable checksumming of the journal transactions.
+ This will allow the recovery code in e2fsck and the
+ kernel to detect corruption in the journal. It is a
+ compatible change and will be ignored by older kernels.
+
+journal_async_commit Commit block can be written to disk without waiting
+ for descriptor blocks. If enabled older kernels cannot
+ mount the device. This will enable 'journal_checksum'
+ internally.
+
+journal=update Update the ext4 file system's journal to the current
+ format.
+
+journal_dev=devnum When the external journal device's major/minor numbers
+ have changed, this option allows the user to specify
+ the new journal location. The journal device is
+ identified through its new major/minor numbers encoded
+ in devnum.
+
+norecovery Don't load the journal on mounting. Note that
+noload if the filesystem was not unmounted cleanly,
+ skipping the journal replay will lead to the
+ filesystem containing inconsistencies that can
+ lead to any number of problems.
+
+data=journal All data are committed into the journal prior to being
+ written into the main file system. Enabling
+ this mode will disable delayed allocation and
+ O_DIRECT support.
+
+data=ordered (*) All data are forced directly out to the main file
+ system prior to its metadata being committed to the
+ journal.
+
+data=writeback Data ordering is not preserved, data may be written
+ into the main file system after its metadata has been
+ committed to the journal.
+
+commit=nrsec (*) Ext4 can be told to sync all its data and metadata
+ every 'nrsec' seconds. The default value is 5 seconds.
+ This means that if you lose your power, you will lose
+ as much as the latest 5 seconds of work (your
+ filesystem will not be damaged though, thanks to the
+ journaling). This default value (or any low value)
+ will hurt performance, but it's good for data-safety.
+ Setting it to 0 will have the same effect as leaving
+ it at the default (5 seconds).
+ Setting it to very large values will improve
+ performance.
+
+barrier=<0|1(*)> This enables/disables the use of write barriers in
+barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier This also requires an IO stack which can support
+ barriers, and if jbd gets an error on a barrier
+ write, it will disable again with a warning.
+ Write barriers enforce proper on-disk ordering
+ of journal commits, making volatile disk write caches
+ safe to use, at some performance penalty. If
+ your disks are battery-backed in one way or another,
+ disabling barriers may safely improve performance.
+ The mount options "barrier" and "nobarrier" can
+ also be used to enable or disable barriers, for
+ consistency with other ext4 mount options.
+
+inode_readahead_blks=n This tuning parameter controls the maximum
+ number of inode table blocks that ext4's inode
+ table readahead algorithm will pre-read into
+ the buffer cache. The default value is 32 blocks.
+
+nouser_xattr Disables Extended User Attributes. If you have extended
+ attribute support enabled in the kernel configuration
+ (CONFIG_EXT4_FS_XATTR), extended attribute support
+ is enabled by default on mount. See the attr(5) manual
+ page and http://acl.bestbits.at/ for more information
+ about extended attributes.
+
+noacl This option disables POSIX Access Control List
+ support. If ACL support is enabled in the kernel
+ configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is
+ enabled by default on mount. See the acl(5) manual
+ page and http://acl.bestbits.at/ for more information
+ about acl.
+
+bsddf (*) Make 'df' act like BSD.
+minixdf Make 'df' act like Minix.
+
+debug Extra debugging information is sent to syslog.
+
+abort Simulate the effects of calling ext4_abort() for
+ debugging purposes. This is normally used while
+ remounting a filesystem which is already mounted.
+
+errors=remount-ro Remount the filesystem read-only on an error.
+errors=continue Keep going on a filesystem error.
+errors=panic Panic and halt the machine if an error occurs.
+ (These mount options override the errors behavior
+ specified in the superblock, which can be configured
+ using tune2fs)
+
+data_err=ignore(*) Just print an error message if an error occurs
+ in a file data buffer in ordered mode.
+data_err=abort Abort the journal if an error occurs in a file
+ data buffer in ordered mode.
+
+grpid Give objects the same group ID as their parent directory.
+bsdgroups
+
+nogrpid (*) New objects have the group ID of their creator.
+sysvgroups
+
+resgid=n The group ID which may use the reserved blocks.
+
+resuid=n The user ID which may use the reserved blocks.
+
+sb=n Use alternate superblock at this location.
+
+quota These options are ignored by the filesystem. They
+noquota are used only by quota tools to recognize volumes
+grpquota where quota should be turned on. See documentation
+usrquota in the quota-tools package for more details
+ (http://sourceforge.net/projects/linuxquota).
+
+jqfmt=<quota type> These options tell filesystem details about quota
+usrjquota=<file> so that quota information can be properly updated
+grpjquota=<file> during journal replay. They replace the above
+ quota options. See documentation in the quota-tools
+ package for more details
+ (http://sourceforge.net/projects/linuxquota).
+
+stripe=n Number of filesystem blocks that mballoc will try
+ to use for allocation size and alignment. For RAID5/6
+ systems this should be the number of data
+ disks * RAID chunk size in file system blocks.
+
+delalloc (*) Defer block allocation until just before ext4
+ writes out the block(s) in question. This
+ allows ext4 to make better allocation decisions
+ more efficiently.
+nodelalloc Disable delayed allocation. Blocks are allocated
+ when the data is copied from userspace to the
+ page cache, either via the write(2) system call
+ or when an mmap'ed page which was previously
+ unallocated is written for the first time.
+
+max_batch_time=usec Maximum amount of time ext4 should wait for
+ additional filesystem operations to be batched
+ together with a synchronous write operation.
+ Since a synchronous write operation is going to
+ force a commit and then wait for the I/O to
+ complete, it doesn't cost much, and can be a
+ huge throughput win, we wait for a small amount
+ of time to see if any other transactions can
+ piggyback on the synchronous write. The
+ algorithm used is designed to automatically tune
+ for the speed of the disk, by measuring the
+ amount of time (on average) that it takes to
+ finish committing a transaction. Call this time
+ the "commit time". If the time that the
+ transaction has been running is less than the
+ commit time, ext4 will try sleeping for the
+ commit time to see if other operations will join
+ the transaction. The commit time is capped by
+ the max_batch_time, which defaults to 15000us
+ (15ms). This optimization can be turned off
+ entirely by setting max_batch_time to 0.
+
+min_batch_time=usec This parameter sets the commit time (as
+ described above) to be at least min_batch_time.
+ It defaults to zero microseconds. Increasing
+ this parameter may improve the throughput of
+ multi-threaded, synchronous workloads on very
+ fast disks, at the cost of increasing latency.
+
+journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
+ highest priority) which should be used for I/O
+ operations submitted by kjournald2 during a
+ commit operation. This defaults to 3, which is
+ a slightly higher priority than the default I/O
+ priority.
+
+auto_da_alloc(*) Many broken applications don't use fsync() when
+noauto_da_alloc replacing existing files via patterns such as
+ fd = open("foo.new")/write(fd,..)/close(fd)/
+ rename("foo.new", "foo"), or worse yet,
+ fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+ If auto_da_alloc is enabled, ext4 will detect
+ the replace-via-rename and replace-via-truncate
+ patterns and force that any delayed allocation
+ blocks are allocated such that at the next
+ journal commit, in the default data=ordered
+ mode, the data blocks of the new file are forced
+ to disk before the rename() operation is
+ committed. This provides roughly the same level
+ of guarantees as ext3, and avoids the
+ "zero-length" problem that can happen when a
+ system crashes before the delayed allocation
+ blocks are forced to disk.
+
+noinit_itable Do not initialize any uninitialized inode table
+ blocks in the background. This feature may be
+ used by installation CD's so that the install
+ process can complete as quickly as possible; the
+ inode table initialization process would then be
+ deferred until the next time the file system
+ is unmounted.
+
+init_itable=n The lazy itable init code will wait n times the
+ number of milliseconds it took to zero out the
+ previous block group's inode table. This
+ minimizes the impact on the system performance
+ while the file system's inode table is being initialized.
+
+discard Controls whether ext4 should issue discard/TRIM
+nodiscard(*) commands to the underlying block device when
+ blocks are freed. This is useful for SSD devices
+ and sparse/thinly-provisioned LUNs, but it is off
+ by default until sufficient testing has been done.
+
+nouid32 Disables 32-bit UIDs and GIDs. This is for
+ interoperability with older kernels which only
+ store and expect 16-bit values.
+
+resize Allows to resize filesystem to the end of the last
+ existing block group, further resize has to be done
+ with resize2fs either online, or offline. It can be
+ used only in conjunction with remount.
+
+block_validity These options enable/disable the in-kernel
+noblock_validity facility for tracking filesystem metadata blocks
+ within internal data structures. This allows multi-
+ block allocator and other routines to quickly locate
+ extents which might overlap with filesystem metadata
+ blocks. This option is intended for debugging
+ purposes and since it negatively affects the
+ performance, it is off by default.
+
+dioread_lock Controls whether or not ext4 should use the DIO read
+dioread_nolock locking. If the dioread_nolock option is specified
+ ext4 will allocate uninitialized extent before buffer
+ write and convert the extent to initialized after IO
+ completes. This approach allows ext4 code to avoid
+ using inode mutex, which improves scalability on high
+ speed storages. However this does not work with
+ data journaling and dioread_nolock option will be
+ ignored with kernel warning. Note that dioread_nolock
+ code path is only used for extent-based files.
+ Because of the restrictions this option comprises
+ it is off by default (i.e. dioread_lock).
+
+i_version Enable 64-bit inode version support. This option is
+ off by default.
+
+Data Mode
+=========
+There are 3 different data modes:
+
+* writeback mode
+In data=writeback mode, ext4 does not journal data at all. This mode provides
+a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+mode - metadata journaling. A crash+recovery can cause incorrect data to
+appear in files which were written shortly before the crash. This mode will
+typically provide the best ext4 performance.
+
+* ordered mode
+In data=ordered mode, ext4 only officially journals metadata, but it logically
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
+
+* journal mode
+data=journal mode provides full data and metadata journaling. All new data is
+written to the journal first, and then to its final location.
+In the event of a crash, the journal can be replayed, bringing both data and
+metadata into a consistent state. This mode is the slowest except when data
+needs to be read from and written to disk at the same time where it
+outperforms all other modes. Enabling this mode will disable delayed
+allocation and O_DIRECT support.
+
+/proc entries
+=============
+
+Information about mounted ext4 file systems can be found in
+/proc/fs/ext4. Each mounted filesystem will have a directory in
+/proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or
+/proc/fs/ext4/dm-0). The files in each per-device directory are shown
+in the table below.
+
+Files in /proc/fs/ext4/<devname>
+..............................................................................
+ File Content
+ mb_groups details of multiblock allocator buddy cache of free blocks
+..............................................................................
+
+/sys entries
+============
+
+Information about mounted ext4 file systems can be found in
+/sys/fs/ext4. Each mounted filesystem will have a directory in
+/sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or
+/sys/fs/ext4/dm-0). The files in each per-device directory are shown
+in the table below.
+
+Files in /sys/fs/ext4/<devname>
+(see also Documentation/ABI/testing/sysfs-fs-ext4)
+..............................................................................
+ File Content
+
+ delayed_allocation_blocks This file is read-only and shows the number of
+ blocks that are dirty in the page cache, but
+ which do not have their location in the
+ filesystem allocated yet.
+
+ inode_goal Tuning parameter which (if non-zero) controls
+ the goal inode used by the inode allocator in
+ preference to all other allocation heuristics.
+ This is intended for debugging use only, and
+ should be 0 on production systems.
+
+ inode_readahead_blks Tuning parameter which controls the maximum
+ number of inode table blocks that ext4's inode
+ table readahead algorithm will pre-read into
+ the buffer cache
+
+ lifetime_write_kbytes This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was created.
+
+ max_writeback_mb_bump The maximum number of megabytes the writeback
+ code will try to write out before moving on to
+ another inode.
+
+ mb_group_prealloc The multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if
+ the stripe size is not set in the ext4 superblock
+
+ mb_max_to_scan The maximum number of extents the multiblock
+ allocator will search to find the best extent
+
+ mb_min_to_scan The minimum number of extents the multiblock
+ allocator will search to find the best extent
+
+ mb_order2_req Tuning parameter which controls the minimum size
+ for requests (as a power of 2) where the buddy
+ cache is used
+
+ mb_stats Controls whether the multiblock allocator should
+ collect statistics, which are shown during the
+ unmount. 1 means to collect statistics, 0 means
+ not to collect statistics
+
+ mb_stream_req Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out
+ of a block group specific preallocation pool, so
+ that small files are packed closely together.
+ Each large file will have its blocks allocated
+ out of its own unique preallocation pool.
+
+ session_write_kbytes This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was mounted.
+..............................................................................
+
+Ioctls
+======
+
+There is some Ext4 specific functionality which can be accessed by applications
+through the system call interfaces. The list of all Ext4 specific ioctls is
+shown in the table below.
+
+Table of Ext4 specific ioctls
+..............................................................................
+ Ioctl Description
+ EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
+ The ioctl argument is an integer bitfield, with
+ bit values described in ext4.h. This ioctl is an
+ alias for FS_IOC_GETFLAGS.
+
+ EXT4_IOC_SETFLAGS Set additional attributes associated with inode.
+ The ioctl argument is an integer bitfield, with
+ bit values described in ext4.h. This ioctl is an
+ alias for FS_IOC_SETFLAGS.
+
+ EXT4_IOC_GETVERSION
+ EXT4_IOC_GETVERSION_OLD
+ Get the inode i_generation number stored for
+ each inode. The i_generation number is normally
+ changed only when a new inode is created and it is
+ particularly useful for network filesystems. The
+ '_OLD' version of this ioctl is an alias for
+ FS_IOC_GETVERSION.
+
+ EXT4_IOC_SETVERSION
+ EXT4_IOC_SETVERSION_OLD
+ Set the inode i_generation number stored for
+ each inode. The '_OLD' version of this ioctl
+ is an alias for FS_IOC_SETVERSION.
+
+ EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize
+ mount option. It allows to resize filesystem
+ to the end of the last existing block group,
+ further resize has to be done with resize2fs,
+ either online, or offline. The argument points
+ to the unsigned long number representing the
+ filesystem new block count.
+
+ EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one
+ this ioctl is pointing to) to the donor_fd (the
+ one specified in move_extent structure passed
+ as an argument to this ioctl). Then, exchange
+ inode metadata between orig_fd and donor_fd.
+ This is especially useful for online
+ defragmentation, because the allocator has the
+ opportunity to allocate moved blocks better,
+ ideally into one contiguous extent.
+
+ EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or
+ new group descriptor block. The new group
+ descriptor is described by ext4_new_group_input
+ structure, which is passed as an argument to
+ this ioctl. This is especially useful in
+ conjunction with EXT4_IOC_GROUP_EXTEND,
+ which allows online resize of the filesystem
+ to the end of the last existing block group.
+ Those two ioctls combined are used in userspace
+ online resize tools (e.g. resize2fs).
+
+ EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself.
+ It converts (migrates) ext3 indirect block mapped
+ inode to ext4 extent mapped inode by walking
+ through indirect block mapping of the original
+ inode and converting contiguous block ranges
+ into ext4 extents of the temporary inode. Then,
+ inodes are swapped. This ioctl might help, when
+ migrating from ext3 to ext4 filesystem, however
+ suggestion is to create fresh ext4 filesystem
+ and copy data from the backup. Note, that
+ filesystem has to support extents for this ioctl
+ to work.
+
+ EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be
+ allocated to preserve application-expected ext3
+ behaviour. Note that this will also start
+ triggering a write of the data blocks, but this
+ behaviour may change in the future as it is
+ not necessary and has been done this way only
+ for sake of simplicity.
+
+ EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number
+ of blocks of resized filesystem is passed in via
+ 64 bit integer argument. The kernel allocates
+ bitmaps and inode table, the userspace tool thus
+ just passes the new number of blocks.
+
+..............................................................................
+
+References
+==========
+
+kernel source: <file:fs/ext4/>
+ <file:fs/jbd2/>
+
+programs: http://e2fsprogs.sourceforge.net/
+
+useful links: http://fedoraproject.org/wiki/ext3-devel
+ http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
new file mode 100644
index 00000000000..1b805a0efbb
--- /dev/null
+++ b/Documentation/filesystems/fiemap.txt
@@ -0,0 +1,228 @@
+============
+Fiemap Ioctl
+============
+
+The fiemap ioctl is an efficient method for userspace to get file
+extent mappings. Instead of block-by-block mapping (such as bmap), fiemap
+returns a list of extents.
+
+
+Request Basics
+--------------
+
+A fiemap request is encoded within struct fiemap:
+
+struct fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace cares about (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents; /* number of extents that were
+ * mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+
+fm_start and fm_length specify the logical range within the file
+which the process would like mappings for. Extents returned mirror
+those on disk - that is, the logical offset of the 1st returned extent
+may start before fm_start, and the range covered by the last returned
+extent may end after fm_length. All offsets and lengths are in bytes.
+
+Certain flags to modify the way in which mappings are looked up can be
+set in fm_flags. If the kernel doesn't understand some particular
+flags, it will return EBADR and the contents of fm_flags will contain
+the set of flags which caused the error. If the kernel is compatible
+with all flags passed, the contents of fm_flags will be unmodified.
+It is up to userspace to determine whether rejection of a particular
+flag is fatal to its operation. This scheme is intended to allow the
+fiemap interface to grow in the future but without losing
+compatibility with old software.
+
+fm_extent_count specifies the number of elements in the fm_extents[] array
+that can be used to return extents. If fm_extent_count is zero, then the
+fm_extents[] array is ignored (no extents will be returned), and the
+fm_mapped_extents count will hold the number of extents needed in
+fm_extents[] to hold the file's current mapping. Note that there is
+nothing to prevent the file from changing between calls to FIEMAP.
+
+The following flags can be set in fm_flags:
+
+* FIEMAP_FLAG_SYNC
+If this flag is set, the kernel will sync the file before mapping extents.
+
+* FIEMAP_FLAG_XATTR
+If this flag is set, the extents returned will describe the inode's
+extended attribute lookup tree, instead of its data tree.
+
+
+Extent Mapping
+--------------
+
+Extent information is returned within the embedded fm_extents array
+which userspace must allocate along with the fiemap structure. The
+number of elements in the fm_extents[] array should be passed via
+fm_extent_count. The number of extents mapped by kernel will be
+returned via fm_mapped_extents. If the number of fiemap_extents
+allocated is less than would be required to map the requested range,
+the maximum number of extents that can be mapped in the fm_extents[]
+array will be returned and fm_mapped_extents will be equal to
+fm_extent_count. In that case, the last extent in the array will not
+complete the requested range and will not have the FIEMAP_EXTENT_LAST
+flag set (see the next section on extent flags).
+
+Each extent is described by a single fiemap_extent structure as
+returned in fm_extents.
+
+struct fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent */
+ __u64 fe_length; /* length in bytes for the extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_reserved[3];
+};
+
+All offsets and lengths are in bytes and mirror those on disk. It is valid
+for an extent's logical offset to start before the request or its logical
+length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is
+returned, fe_logical, fe_physical, and fe_length will be aligned to the
+block size of the file system. With the exception of extents flagged as
+FIEMAP_EXTENT_MERGED, adjacent extents will not be merged.
+
+The fe_flags field contains flags which describe the extent returned.
+A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in
+the file so that the process making fiemap calls can determine when no
+more extents are available, without having to call the ioctl again.
+
+Some flags are intentionally vague and will always be set in the
+presence of other more specific flags. This way a program looking for
+a general property does not have to know all existing and future flags
+which imply that property.
+
+For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL
+are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking
+for inline or tail-packed data can key on the specific flag. Software
+which simply cares not to try operating on non-aligned extents
+however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to
+worry about all present and future flags which might imply unaligned
+data. Note that the opposite is not true - it would be valid for
+FIEMAP_EXTENT_NOT_ALIGNED to appear alone.
+
+* FIEMAP_EXTENT_LAST
+This is the last extent in the file. A mapping attempt past this
+extent will return nothing.
+
+* FIEMAP_EXTENT_UNKNOWN
+The location of this extent is currently unknown. This may indicate
+the data is stored on an inaccessible volume or that no storage has
+been allocated for the file yet.
+
+* FIEMAP_EXTENT_DELALLOC
+ - This will also set FIEMAP_EXTENT_UNKNOWN.
+Delayed allocation - while there is data for this extent, its
+physical location has not been allocated yet.
+
+* FIEMAP_EXTENT_ENCODED
+This extent does not consist of plain filesystem blocks but is
+encoded (e.g. encrypted or compressed). Reading the data in this
+extent via I/O to the block device will have undefined results.
+
+Note that it is *always* undefined to try to update the data
+in-place by writing to the indicated location without the
+assistance of the filesystem, or to access the data using the
+information returned by the FIEMAP interface while the filesystem
+is mounted. In other words, user applications may only read the
+extent data via I/O to the block device while the filesystem is
+unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is
+clear; user applications must not try reading or writing to the
+filesystem via the block device under any other circumstances.
+
+* FIEMAP_EXTENT_DATA_ENCRYPTED
+ - This will also set FIEMAP_EXTENT_ENCODED
+The data in this extent has been encrypted by the file system.
+
+* FIEMAP_EXTENT_NOT_ALIGNED
+Extent offsets and length are not guaranteed to be block aligned.
+
+* FIEMAP_EXTENT_DATA_INLINE
+ This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is located within a meta data block.
+
+* FIEMAP_EXTENT_DATA_TAIL
+ This will also set FIEMAP_EXTENT_NOT_ALIGNED
+Data is packed into a block with data from other files.
+
+* FIEMAP_EXTENT_UNWRITTEN
+Unwritten extent - the extent is allocated but its data has not been
+initialized. This indicates the extent's data will be all zero if read
+through the filesystem but the contents are undefined if read directly from
+the device.
+
+* FIEMAP_EXTENT_MERGED
+This will be set when a file does not support extents, i.e., it uses a block
+based addressing scheme. Since returning an extent for each block back to
+userspace would be highly inefficient, the kernel will try to merge most
+adjacent blocks into 'extents'.
+
+
+VFS -> File System Implementation
+---------------------------------
+
+File systems wishing to support fiemap must implement a ->fiemap callback on
+their inode_operations structure. The fs ->fiemap call is responsible for
+defining its set of supported fiemap flags, and calling a helper function on
+each discovered extent:
+
+struct inode_operations {
+ ...
+
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
+ u64 len);
+
+->fiemap is passed struct fiemap_extent_info which describes the
+fiemap request:
+
+struct fiemap_extent_info {
+ unsigned int fi_flags; /* Flags as passed from user */
+ unsigned int fi_extents_mapped; /* Number of mapped extents */
+ unsigned int fi_extents_max; /* Size of fiemap_extent array */
+ struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
+};
+
+It is intended that the file system should not need to access any of this
+structure directly.
+
+
+Flag checking should be done at the beginning of the ->fiemap callback via the
+fiemap_check_flags() helper:
+
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
+
+The struct fieinfo should be passed in as received from ioctl_fiemap(). The
+set of fiemap flags which the fs understands should be passed via fs_flags. If
+fiemap_check_flags finds invalid user flags, it will place the bad values in
+fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
+fiemap_check_flags(), it should immediately exit, returning that error back to
+ioctl_fiemap().
+
+
+For each extent in the request range, the file system should call
+the helper function, fiemap_fill_next_extent():
+
+int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical,
+ u64 phys, u64 len, u32 flags, u32 dev);
+
+fiemap_fill_next_extent() will use the passed values to populate the
+next free extent in the fm_extents array. 'General' extent flags will
+automatically be set from specific flags on behalf of the calling file
+system so that the userspace API is not broken.
+
+fiemap_fill_next_extent() returns 0 on success, and 1 when the
+user-supplied fm_extents array is full. If an error is encountered
+while copying the extent to user memory, -EFAULT will be returned.
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
new file mode 100644
index 00000000000..ac2facc50d2
--- /dev/null
+++ b/Documentation/filesystems/files.txt
@@ -0,0 +1,123 @@
+File management in the Linux kernel
+-----------------------------------
+
+This document describes how locking for files (struct file)
+and file descriptor table (struct files_struct) works.
+
+Up until 2.6.12, the file descriptor table has been protected
+with a lock (files->file_lock) and reference count (files->count).
+->file_lock protected accesses to all the file related fields
+of the table. ->count was used for sharing the file descriptor
+table between tasks cloned with CLONE_FILES flag. Typically
+this would be the case for posix threads. As with the common
+refcounting model in the kernel, the last task doing
+a put_files_struct() frees the file descriptor (fd) table.
+The files (struct file) themselves are protected using
+reference count (->f_count).
+
+In the new lock-free model of file descriptor management,
+the reference counting is similar, but the locking is
+based on RCU. The file descriptor table contains multiple
+elements - the fd sets (open_fds and close_on_exec), the
+array of file pointers, the sizes of the sets and the array,
+etc. In order for the updates to appear atomic to
+a lock-free reader, all the elements of the file descriptor
+table are in a separate structure - struct fdtable.
+files_struct contains a pointer to struct fdtable through
+which the actual fd table is accessed. Initially the
+fdtable is embedded in files_struct itself. On a subsequent
+expansion of fdtable, a new fdtable structure is allocated
+and files->fdt points to the new structure. The fdtable
+structure is freed with RCU and lock-free readers either
+see the old fdtable or the new fdtable making the update
+appear atomic. Here are the locking rules for
+the fdtable structure -
+
+1. All references to the fdtable must be done through
+ the files_fdtable() macro :
+
+ struct fdtable *fdt;
+
+ rcu_read_lock();
+
+ fdt = files_fdtable(files);
+ ....
+ if (n <= fdt->max_fds)
+ ....
+ ...
+ rcu_read_unlock();
+
+ files_fdtable() uses rcu_dereference() macro which takes care of
+ the memory barrier requirements for lock-free dereference.
+ The fdtable pointer must be read within the read-side
+ critical section.
+
+2. Reading of the fdtable as described above must be protected
+ by rcu_read_lock()/rcu_read_unlock().
+
+3. For any update to the fd table, files->file_lock must
+ be held.
+
+4. To look up the file structure given an fd, a reader
+ must use either fcheck() or fcheck_files() APIs. These
+ take care of barrier requirements due to lock-free lookup.
+ An example :
+
+ struct file *file;
+
+ rcu_read_lock();
+ file = fcheck(fd);
+ if (file) {
+ ...
+ }
+ ....
+ rcu_read_unlock();
+
+5. Handling of the file structures is special. Since the look-up
+ of the fd (fget()/fget_light()) is lock-free, it is possible
+ that look-up may race with the last put() operation on the
+ file structure. This is avoided using atomic_long_inc_not_zero()
+ on ->f_count :
+
+ rcu_read_lock();
+ file = fcheck_files(files, fd);
+ if (file) {
+ if (atomic_long_inc_not_zero(&file->f_count))
+ *fput_needed = 1;
+ else
+ /* Didn't get the reference, someone's freed */
+ file = NULL;
+ }
+ rcu_read_unlock();
+ ....
+ return file;
+
+ atomic_long_inc_not_zero() detects if the refcount is already zero or
+ goes to zero during increment. If it does, we fail
+ fget()/fget_light().
+
+6. Since both fdtable and file structures can be looked up
+ lock-free, they must be installed using rcu_assign_pointer()
+ API. If they are looked up lock-free, rcu_dereference()
+ must be used. However it is advisable to use files_fdtable()
+ and fcheck()/fcheck_files() which take care of these issues.
+
+7. While updating, the fdtable pointer must be looked up while
+ holding files->file_lock. If ->file_lock is dropped, then
+ another thread may expand the files thereby creating a new
+ fdtable and making the earlier fdtable pointer stale.
+ For example :
+
+ spin_lock(&files->file_lock);
+ fd = locate_fd(files, file, start);
+ if (fd >= 0) {
+ /* locate_fd() may have expanded fdtable, load the ptr */
+ fdt = files_fdtable(files);
+ FD_SET(fd, fdt->open_fds);
+ FD_CLR(fd, fdt->close_on_exec);
+ spin_unlock(&files->file_lock);
+ .....
+
+ Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
+ the fdtable pointer (fdt) must be loaded after locate_fd().
+
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt
new file mode 100644
index 00000000000..13af4a49e7d
--- /dev/null
+++ b/Documentation/filesystems/fuse.txt
@@ -0,0 +1,423 @@
+Definitions
+~~~~~~~~~~~
+
+Userspace filesystem:
+
+ A filesystem in which data and metadata are provided by an ordinary
+ userspace process. The filesystem can be accessed normally through
+ the kernel interface.
+
+Filesystem daemon:
+
+ The process(es) providing the data and metadata of the filesystem.
+
+Non-privileged mount (or user mount):
+
+ A userspace filesystem mounted by a non-privileged (non-root) user.
+ The filesystem daemon is running with the privileges of the mounting
+ user. NOTE: this is not the same as mounts allowed with the "user"
+ option in /etc/fstab, which is not discussed here.
+
+Filesystem connection:
+
+ A connection between the filesystem daemon and the kernel. The
+ connection exists until either the daemon dies, or the filesystem is
+ umounted. Note that detaching (or lazy umounting) the filesystem
+ does _not_ break the connection, in this case it will exist until
+ the last reference to the filesystem is released.
+
+Mount owner:
+
+ The user who does the mounting.
+
+User:
+
+ The user who is performing filesystem operations.
+
+What is FUSE?
+~~~~~~~~~~~~~
+
+FUSE is a userspace filesystem framework. It consists of a kernel
+module (fuse.ko), a userspace library (libfuse.*) and a mount utility
+(fusermount).
+
+One of the most important features of FUSE is allowing secure,
+non-privileged mounts. This opens up new possibilities for the use of
+filesystems. A good example is sshfs: a secure network filesystem
+using the sftp protocol.
+
+The userspace library and utilities are available from the FUSE
+homepage:
+
+ http://fuse.sourceforge.net/
+
+Filesystem type
+~~~~~~~~~~~~~~~
+
+The filesystem type given to mount(2) can be one of the following:
+
+'fuse'
+
+ This is the usual way to mount a FUSE filesystem. The first
+ argument of the mount system call may contain an arbitrary string,
+ which is not interpreted by the kernel.
+
+'fuseblk'
+
+ The filesystem is block device based. The first argument of the
+ mount system call is interpreted as the name of the device.
+
+Mount options
+~~~~~~~~~~~~~
+
+'fd=N'
+
+ The file descriptor to use for communication between the userspace
+ filesystem and the kernel. The file descriptor must have been
+ obtained by opening the FUSE device ('/dev/fuse').
+
+'rootmode=M'
+
+ The file mode of the filesystem's root in octal representation.
+
+'user_id=N'
+
+ The numeric user id of the mount owner.
+
+'group_id=N'
+
+ The numeric group id of the mount owner.
+
+'default_permissions'
+
+ By default FUSE doesn't check file access permissions, the
+ filesystem is free to implement its access policy or leave it to
+ the underlying file access mechanism (e.g. in case of network
+ filesystems). This option enables permission checking, restricting
+ access based on file mode. It is usually useful together with the
+ 'allow_other' mount option.
+
+'allow_other'
+
+ This option overrides the security measure restricting file access
+ to the user mounting the filesystem. This option is by default only
+ allowed to root, but this restriction can be removed with a
+ (userspace) configuration option.
+
+'max_read=N'
+
+ With this option the maximum size of read operations can be set.
+ The default is infinite. Note that the size of read requests is
+ limited anyway to 32 pages (which is 128kbyte on i386).
+
+'blksize=N'
+
+ Set the block size for the filesystem. The default is 512. This
+ option is only valid for 'fuseblk' type mounts.
+
+Control filesystem
+~~~~~~~~~~~~~~~~~~
+
+There's a control filesystem for FUSE, which can be mounted by:
+
+ mount -t fusectl none /sys/fs/fuse/connections
+
+Mounting it under the '/sys/fs/fuse/connections' directory makes it
+backwards compatible with earlier versions.
+
+Under the fuse control filesystem each connection has a directory
+named by a unique number.
+
+For each connection the following files exist within this directory:
+
+ 'waiting'
+
+ The number of requests which are waiting to be transferred to
+ userspace or being processed by the filesystem daemon. If there is
+ no filesystem activity and 'waiting' is non-zero, then the
+ filesystem is hung or deadlocked.
+
+ 'abort'
+
+ Writing anything into this file will abort the filesystem
+ connection. This means that all waiting requests will be aborted and an
+ error returned for all aborted and new requests.
+
+Only the owner of the mount may read or write these files.
+
+Interrupting filesystem operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a process issuing a FUSE filesystem request is interrupted, the
+following will happen:
+
+ 1) If the request is not yet sent to userspace AND the signal is
+ fatal (SIGKILL or unhandled fatal signal), then the request is
+ dequeued and returns immediately.
+
+ 2) If the request is not yet sent to userspace AND the signal is not
+ fatal, then an 'interrupted' flag is set for the request. When
+ the request has been successfully transferred to userspace and
+ this flag is set, an INTERRUPT request is queued.
+
+ 3) If the request is already sent to userspace, then an INTERRUPT
+ request is queued.
+
+INTERRUPT requests take precedence over other requests, so the
+userspace filesystem will receive queued INTERRUPTs before any others.
+
+The userspace filesystem may ignore the INTERRUPT requests entirely,
+or may honor them by sending a reply to the _original_ request, with
+the error set to EINTR.
+
+It is also possible that there's a race between processing the
+original request and its INTERRUPT request. There are two possibilities:
+
+ 1) The INTERRUPT request is processed before the original request is
+ processed
+
+ 2) The INTERRUPT request is processed after the original request has
+ been answered
+
+If the filesystem cannot find the original request, it should wait for
+some timeout and/or a number of new requests to arrive, after which it
+should reply to the INTERRUPT request with an EAGAIN error. In case
+1) the INTERRUPT request will be requeued. In case 2) the INTERRUPT
+reply will be ignored.
+
+Aborting a filesystem connection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to get into certain situations where the filesystem is
+not responding. Reasons for this may be:
+
+ a) Broken userspace filesystem implementation
+
+ b) Network connection down
+
+ c) Accidental deadlock
+
+ d) Malicious deadlock
+
+(For more on c) and d) see later sections)
+
+In any of these cases it may be useful to abort the connection to
+the filesystem. There are several ways to do this:
+
+ - Kill the filesystem daemon. Works in case of a) and b)
+
+ - Kill the filesystem daemon and all users of the filesystem. Works
+ in all cases except some malicious deadlocks
+
+ - Use forced umount (umount -f). Works in all cases but only if
+ filesystem is still attached (it hasn't been lazy unmounted)
+
+ - Abort filesystem through the FUSE control filesystem. Most
+ powerful method, always works.
+
+How do non-privileged mounts work?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since the mount() system call is a privileged operation, a helper
+program (fusermount) is needed, which is installed setuid root.
+
+The implication of providing non-privileged mounts is that the mount
+owner must not be able to use this capability to compromise the
+system. Obvious requirements arising from this are:
+
+ A) mount owner should not be able to get elevated privileges with the
+ help of the mounted filesystem
+
+ B) mount owner should not get illegitimate access to information from
+ other users' and the super user's processes
+
+ C) mount owner should not be able to induce undesired behavior in
+ other users' or the super user's processes
+
+How are requirements fulfilled?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ A) The mount owner could gain elevated privileges by either:
+
+ 1) creating a filesystem containing a device file, then opening
+ this device
+
+ 2) creating a filesystem containing a suid or sgid application,
+ then executing this application
+
+ The solution is not to allow opening device files and ignore
+ setuid and setgid bits when executing programs. To ensure this
+ fusermount always adds "nosuid" and "nodev" to the mount options
+ for non-privileged mounts.
+
+ B) If another user is accessing files or directories in the
+ filesystem, the filesystem daemon serving requests can record the
+ exact sequence and timing of operations performed. This
+ information is otherwise inaccessible to the mount owner, so this
+ counts as an information leak.
+
+ The solution to this problem will be presented in point 2) of C).
+
+ C) There are several ways in which the mount owner can induce
+ undesired behavior in other users' processes, such as:
+
+ 1) mounting a filesystem over a file or directory which the mount
+ owner could otherwise not be able to modify (or could only
+ make limited modifications).
+
+ This is solved in fusermount, by checking the access
+ permissions on the mountpoint and only allowing the mount if
+ the mount owner can do unlimited modification (has write
+ access to the mountpoint, and mountpoint is not a "sticky"
+ directory)
+
+ 2) Even if 1) is solved the mount owner can change the behavior
+ of other users' processes.
+
+ i) It can slow down or indefinitely delay the execution of a
+ filesystem operation creating a DoS against the user or the
+ whole system. For example a suid application locking a
+ system file, and then accessing a file on the mount owner's
+ filesystem could be stopped, and thus causing the system
+ file to be locked forever.
+
+ ii) It can present files or directories of unlimited length, or
+ directory structures of unlimited depth, possibly causing a
+ system process to eat up diskspace, memory or other
+ resources, again causing DoS.
+
+ The solution to this as well as B) is not to allow processes
+ to access the filesystem, which could otherwise not be
+ monitored or manipulated by the mount owner. Since if the
+ mount owner can ptrace a process, it can do all of the above
+ without using a FUSE mount, the same criteria as used in
+ ptrace can be used to check if a process is allowed to access
+ the filesystem or not.
+
+ Note that the ptrace check is not strictly necessary to
+ prevent C/2/i, it is enough to check if mount owner has enough
+ privilege to send signal to the process accessing the
+ filesystem, since SIGSTOP can be used to get a similar effect.
+
+I think these limitations are unacceptable?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a sysadmin trusts the users enough, or can ensure through other
+measures, that system processes will never enter non-privileged
+mounts, it can relax the last limitation with a "user_allow_other"
+config option. If this config option is set, the mounting user can
+add the "allow_other" mount option which disables the check for other
+users' processes.
+
+Kernel - userspace interface
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following diagram shows how a filesystem operation (in this
+example unlink) is performed in FUSE.
+
+NOTE: everything in this description is greatly simplified
+
+ | "rm /mnt/fuse/file" | FUSE filesystem daemon
+ | |
+ | | >sys_read()
+ | | >fuse_dev_read()
+ | | >request_wait()
+ | | [sleep on fc->waitq]
+ | |
+ | >sys_unlink() |
+ | >fuse_unlink() |
+ | [get request from |
+ | fc->unused_list] |
+ | >request_send() |
+ | [queue req on fc->pending] |
+ | [wake up fc->waitq] | [woken up]
+ | >request_wait_answer() |
+ | [sleep on req->waitq] |
+ | | <request_wait()
+ | | [remove req from fc->pending]
+ | | [copy req to read buffer]
+ | | [add req to fc->processing]
+ | | <fuse_dev_read()
+ | | <sys_read()
+ | |
+ | | [perform unlink]
+ | |
+ | | >sys_write()
+ | | >fuse_dev_write()
+ | | [look up req in fc->processing]
+ | | [remove from fc->processing]
+ | | [copy write buffer to req]
+ | [woken up] | [wake up req->waitq]
+ | | <fuse_dev_write()
+ | | <sys_write()
+ | <request_wait_answer() |
+ | <request_send() |
+ | [add request to |
+ | fc->unused_list] |
+ | <fuse_unlink() |
+ | <sys_unlink() |
+
+There are a couple of ways in which to deadlock a FUSE filesystem.
+Since we are talking about unprivileged userspace programs,
+something must be done about these.
+
+Scenario 1 - Simple deadlock
+-----------------------------
+
+ | "rm /mnt/fuse/file" | FUSE filesystem daemon
+ | |
+ | >sys_unlink("/mnt/fuse/file") |
+ | [acquire inode semaphore |
+ | for "file"] |
+ | >fuse_unlink() |
+ | [sleep on req->waitq] |
+ | | <sys_read()
+ | | >sys_unlink("/mnt/fuse/file")
+ | | [acquire inode semaphore
+ | | for "file"]
+ | | *DEADLOCK*
+
+The solution for this is to allow the filesystem to be aborted.
+
+Scenario 2 - Tricky deadlock
+----------------------------
+
+This one needs a carefully crafted filesystem. It's a variation on
+the above, only the call back to the filesystem is not explicit,
+but is caused by a pagefault.
+
+ | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2
+ | |
+ | [fd = open("/mnt/fuse/file")] | [request served normally]
+ | [mmap fd to 'addr'] |
+ | [close fd] | [FLUSH triggers 'magic' flag]
+ | [read a byte from addr] |
+ | >do_page_fault() |
+ | [find or create page] |
+ | [lock page] |
+ | >fuse_readpage() |
+ | [queue READ request] |
+ | [sleep on req->waitq] |
+ | | [read request to buffer]
+ | | [create reply header before addr]
+ | | >sys_write(addr - headerlength)
+ | | >fuse_dev_write()
+ | | [look up req in fc->processing]
+ | | [remove from fc->processing]
+ | | [copy write buffer to req]
+ | | >do_page_fault()
+ | | [find or create page]
+ | | [lock page]
+ | | * DEADLOCK *
+
+Solution is basically the same as above.
+
+An additional problem is that while the write buffer is being copied
+to the request, the request must not be interrupted/aborted. This is
+because the destination address of the copy may not be valid after the
+request has returned.
+
+This is solved by doing the copy atomically, and allowing abort
+while the page(s) belonging to the write buffer are faulted with
+get_user_pages(). The 'req->locked' flag indicates when the copy is
+taking place, and abort is delayed until this flag is unset.
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
new file mode 100644
index 00000000000..0494f78d87e
--- /dev/null
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -0,0 +1,114 @@
+ Glock internal locking rules
+ ------------------------------
+
+This documents the basic principles of the glock state machine
+internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h)
+has two main (internal) locks:
+
+ 1. A spinlock (gl_spin) which protects the internal state such
+ as gl_state, gl_target and the list of holders (gl_holders)
+ 2. A non-blocking bit lock, GLF_LOCK, which is used to prevent other
+ threads from making calls to the DLM, etc. at the same time. If a
+ thread takes this lock, it must then call run_queue (usually via the
+ workqueue) when it releases it in order to ensure any pending tasks
+ are completed.
+
+The gl_holders list contains all the queued lock requests (not
+just the holders) associated with the glock. If there are any
+held locks, then they will be contiguous entries at the head
+of the list. Locks are granted in strictly the order that they
+are queued, except for those marked LM_FLAG_PRIORITY which are
+used only during recovery, and even then only for journal locks.
+
+There are three lock states that users of the glock layer can request,
+namely shared (SH), deferred (DF) and exclusive (EX). Those translate
+to the following DLM lock modes:
+
+Glock mode | DLM lock mode
+------------------------------
+ UN | IV/NL Unlocked (no DLM lock associated with glock) or NL
+ SH | PR (Protected read)
+ DF | CW (Concurrent write)
+ EX | EX (Exclusive)
+
+Thus DF is basically a shared mode which is incompatible with the "normal"
+shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O
+operations. The glocks are basically a lock plus some routines which deal
+with cache management. The following rules apply for the cache:
+
+Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata
+--------------------------------------------------------------------------
+ UN | No | No | No | No
+ SH | Yes | Yes | No | No
+ DF | No | Yes | No | No
+ EX | Yes | Yes | Yes | Yes
+
+These rules are implemented using the various glock operations which
+are defined for each type of glock. Not all types of glocks use
+all the modes. Only inode glocks use the DF mode for example.
+
+Table of glock operations and per type constants:
+
+Field | Purpose
+----------------------------------------------------------------------------
+go_xmote_th | Called before remote state change (e.g. to sync dirty data)
+go_xmote_bh | Called after remote state change (e.g. to refill cache)
+go_inval | Called if remote state change requires invalidating the cache
+go_demote_ok | Returns boolean value of whether it's ok to demote a glock
+ | (e.g. checks timeout, and that there is no cached data)
+go_lock | Called for the first local holder of a lock
+go_unlock | Called on the final local unlock of a lock
+go_dump | Called to print content of object for debugfs file, or on
+ | error to dump glock to the log.
+go_type | The type of the glock, LM_TYPE_.....
+go_min_hold_time | The minimum hold time
+
+The minimum hold time for each lock is the time after a remote lock
+grant for which we ignore remote demote requests. This is in order to
+prevent a situation where locks are being bounced around the cluster
+from node to node with none of the nodes making any progress. This
+tends to show up most with shared mmaped files which are being written
+to by multiple nodes. By delaying the demotion in response to a
+remote callback, that gives the userspace program time to make
+some progress before the pages are unmapped.
+
+There is a plan to try and remove the go_lock and go_unlock callbacks
+if possible, in order to try and speed up the fast path through the locking.
+Also, eventually we hope to make the glock "EX" mode locally shared
+such that any local locking will be done with the i_mutex as required
+rather than via the glock.
+
+Locking rules for glock operations:
+
+Operation | GLF_LOCK bit lock held | gl_spin spinlock held
+-----------------------------------------------------------------
+go_xmote_th | Yes | No
+go_xmote_bh | Yes | No
+go_inval | Yes | No
+go_demote_ok | Sometimes | Yes
+go_lock | Yes | No
+go_unlock | Yes | No
+go_dump | Sometimes | Yes
+
+N.B. Operations must not drop either the bit lock or the spinlock
+if it's held on entry. go_dump and go_demote_ok must never block.
+Note that go_dump will only be called if the glock's state
+indicates that it is caching uptodate data.
+
+Glock locking order within GFS2:
+
+ 1. i_mutex (if required)
+ 2. Rename glock (for rename only)
+ 3. Inode glock(s)
+ (Parents before children, inodes at "same level" with same parent in
+ lock number order)
+ 4. Rgrp glock(s) (for (de)allocation operations)
+ 5. Transaction glock (via gfs2_trans_begin) for non-read operations
+ 6. Page lock (always last, very important!)
+
+There are two glocks per inode. One deals with access to the inode
+itself (locking order as above), and the other, known as the iopen
+glock is used in conjunction with the i_nlink field in the inode to
+determine the lifetime of the inode in question. Locking of inodes
+is on a per-inode basis. Locking of rgrps is on a per rgrp basis.
+
diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt
new file mode 100644
index 00000000000..d8188966929
--- /dev/null
+++ b/Documentation/filesystems/gfs2-uevents.txt
@@ -0,0 +1,100 @@
+ uevents and GFS2
+ ==================
+
+During the lifetime of a GFS2 mount, a number of uevents are generated.
+This document explains what the events are and what they are used
+for (by gfs_controld in gfs2-utils).
+
+A list of GFS2 uevents
+-----------------------
+
+1. ADD
+
+The ADD event occurs at mount time. It will always be the first
+uevent generated by the newly created filesystem. If the mount
+is successful, an ONLINE uevent will follow. If it is not successful
+then a REMOVE uevent will follow.
+
+The ADD uevent has two environment variables: SPECTATOR=[0|1]
+and RDONLY=[0|1] that specify the spectator status (a read-only mount
+with no journal assigned), and read-only (with journal assigned) status
+of the filesystem respectively.
+
+2. ONLINE
+
+The ONLINE uevent is generated after a successful mount or remount. It
+has the same environment variables as the ADD uevent. The ONLINE
+uevent, along with the two environment variables for spectator and
+RDONLY are a relatively recent addition (2.6.32-rc+) and will not
+be generated by older kernels.
+
+3. CHANGE
+
+The CHANGE uevent is used in two places. One is when reporting the
+successful mount of the filesystem by the first node (FIRSTMOUNT=Done).
+This is used as a signal by gfs_controld that it is then ok for other
+nodes in the cluster to mount the filesystem.
+
+The other CHANGE uevent is used to inform of the completion
+of journal recovery for one of the filesystem's journals. It has
+two environment variables, JID= which specifies the journal id which
+has just been recovered, and RECOVERY=[Done|Failed] to indicate the
+success (or otherwise) of the operation. These uevents are generated
+for every journal recovered, whether it is during the initial mount
+process or as the result of gfs_controld requesting a specific journal
+recovery via the /sys/fs/gfs2/<fsname>/lock_module/recovery file.
+
+Because the CHANGE uevent was used (in early versions of gfs_controld)
+without checking the environment variables to discover the state, we
+cannot add any more functions to it without running the risk of
+someone using an older version of the user tools and breaking their
+cluster. For this reason the ONLINE uevent was used when adding a new
+uevent for a successful mount or remount.
+
+4. OFFLINE
+
+The OFFLINE uevent is only generated due to filesystem errors and is used
+as part of the "withdraw" mechanism. Currently this doesn't give any
+information about what the error is, which is something that needs to
+be fixed.
+
+5. REMOVE
+
+The REMOVE uevent is generated at the end of an unsuccessful mount
+or at the end of a umount of the filesystem. All REMOVE uevents will
+have been preceded by at least an ADD uevent for the same filesystem,
+and unlike the other uevents is generated automatically by the kernel's
+kobject subsystem.
+
+
+Information common to all GFS2 uevents (uevent environment variables)
+----------------------------------------------------------------------
+
+1. LOCKTABLE=
+
+The LOCKTABLE is a string, as supplied on the mount command
+line (locktable=) or via fstab. It is used as a filesystem label
+as well as providing the information for a lock_dlm mount to be
+able to join the cluster.
+
+2. LOCKPROTO=
+
+The LOCKPROTO is a string, and its value depends on what is set
+on the mount command line, or via fstab. It will be either
+lock_nolock or lock_dlm. In the future other lock managers
+may be supported.
+
+3. JOURNALID=
+
+If a journal is in use by the filesystem (journals are not
+assigned for spectator mounts) then this will give the
+numeric journal id in all GFS2 uevents.
+
+4. UUID=
+
+With recent versions of gfs2-utils, mkfs.gfs2 writes a UUID
+into the filesystem superblock. If it exists, this will
+be included in every uevent relating to the filesystem.
+
+
+
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 00000000000..4cda926628a
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,46 @@
+Global File System
+------------------
+
+http://sources.redhat.com/cluster/wiki/
+
+GFS is a cluster file system. It allows a cluster of computers to
+simultaneously use a block device that is shared between them (with FC,
+iSCSI, NBD, etc). GFS reads and writes to the block device like a local
+file system, but also uses a lock module to allow the computers to coordinate
+their I/O so file system consistency is maintained. One of the nifty
+features of GFS is perfect consistency -- changes made to the file system
+on one machine show up immediately on all other machines in the cluster.
+
+GFS uses interchangeable inter-node locking mechanisms, the currently
+supported mechanisms are:
+
+ lock_nolock -- allows gfs to be used as a local file system
+
+ lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
+ The dlm is found at linux/fs/dlm/
+
+Lock_dlm depends on user space cluster management systems found
+at the URL above.
+
+To use gfs as a local file system, no external clustering systems are
+needed, simply:
+
+ $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
+ $ mount -t gfs2 /dev/block_device /dir
+
+If you are using Fedora, you need to install the gfs2-utils package
+and, for lock_dlm, you will also need to install the cman package
+and write a cluster.conf as per the documentation.
+
+GFS2 is not on-disk compatible with previous versions of GFS, but it
+is pretty close.
+
+The following man pages can be found at the URL above:
+ fsck.gfs2 to repair a filesystem
+ gfs2_grow to expand a filesystem online
+ gfs2_jadd to add journals to a filesystem online
+ gfs2_tool to manipulate, examine and tune a filesystem
+ gfs2_quota to examine and change quota values in a filesystem
+ gfs2_convert to convert a gfs filesystem to gfs2 in-place
+ mount.gfs2 to help mount(8) mount a filesystem
+ mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/hfs.txt b/Documentation/filesystems/hfs.txt
new file mode 100644
index 00000000000..d096df6db07
--- /dev/null
+++ b/Documentation/filesystems/hfs.txt
@@ -0,0 +1,82 @@
+Note: This filesystem doesn't have a maintainer.
+
+Macintosh HFS Filesystem for Linux
+==================================
+
+HFS stands for ``Hierarchical File System'' and is the filesystem used
+by the Mac Plus and all later Macintosh models. Earlier Macintosh
+models used MFS (``Macintosh File System''), which is not supported.
+MacOS 8.1 and newer support a filesystem called HFS+ that's similar to
+HFS but is extended in various areas. Use the hfsplus filesystem driver
+to access such filesystems from Linux.
+
+
+Mount options
+=============
+
+When mounting an HFS filesystem, the following options are accepted:
+
+ creator=cccc, type=cccc
+ Specifies the creator/type values as shown by the MacOS finder
+ used for creating new files. Default values: '????'.
+
+ uid=n, gid=n
+ Specifies the user/group that owns all files on the filesystems.
+ Default: user/group id of the mounting process.
+
+ dir_umask=n, file_umask=n, umask=n
+ Specifies the umask used for all files, all directories or all
+ files and directories. Defaults to the umask of the mounting process.
+
+ session=n
+ Select the CDROM session to mount as HFS filesystem. Defaults to
+ leaving that decision to the CDROM driver. This option will fail
+ with anything but a CDROM as underlying devices.
+
+ part=n
+ Select partition number n from the devices. This option only makes
+ sense for CDROMs because they can't be partitioned under Linux.
+ For disk devices the generic partition parsing code does this
+ for us. Defaults to not parsing the partition table at all.
+
+ quiet
+ Ignore invalid mount options instead of complaining.
+
+
+Writing to HFS Filesystems
+==========================
+
+HFS is not a UNIX filesystem, thus it does not have the usual features you'd
+expect:
+
+ o You can't modify the set-uid, set-gid, sticky or executable bits or the uid
+ and gid of files.
+ o You can't create hard- or symlinks, device files, sockets or FIFOs.
+
+HFS does on the other hand have the concept of multiple forks per file. These
+non-standard forks are represented as hidden additional files in the normal
+filesystem's namespace which is kind of a kludge and makes the semantics for
+them a little strange:
+
+ o You can't create, delete or rename resource forks of files or the
+ Finder's metadata.
+ o They are however created (with default values), deleted and renamed
+ along with the corresponding data fork or directory.
+ o Copying files to a different filesystem will lose those attributes
+ that are essential for MacOS to work.
+
+
+Creating HFS filesystems
+========================
+
+The hfsutils package from Robert Leslie contains a program called
+hformat that can be used to create HFS filesystems. See
+<http://www.mars.org/home/rob/proj/hfs/> for details.
+
+
+Credits
+=======
+
+The HFS driver was written by Paul H. Hargrove (hargrove@sccm.Stanford.EDU).
+Roman Zippel (roman@ardistech.com) rewrote large parts of the code and brought
+in btree routines derived from Brad Boyer's hfsplus driver.
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt
new file mode 100644
index 00000000000..af1628a1061
--- /dev/null
+++ b/Documentation/filesystems/hfsplus.txt
@@ -0,0 +1,59 @@
+
+Macintosh HFSPlus Filesystem for Linux
+======================================
+
+HFSPlus is a filesystem first introduced in MacOS 8.1.
+HFSPlus has several extensions to HFS, including 32-bit allocation
+blocks, 255-character unicode filenames, and file sizes of 2^63 bytes.
+
+
+Mount options
+=============
+
+When mounting an HFSPlus filesystem, the following options are accepted:
+
+ creator=cccc, type=cccc
+ Specifies the creator/type values as shown by the MacOS finder
+ used for creating new files. Default values: '????'.
+
+ uid=n, gid=n
+ Specifies the user/group that owns all files on the filesystem
+ that have uninitialized permissions structures.
+ Default: user/group id of the mounting process.
+
+ umask=n
+ Specifies the umask (in octal) used for files and directories
+ that have uninitialized permissions structures.
+ Default: umask of the mounting process.
+
+ session=n
+ Select the CDROM session to mount as HFSPlus filesystem. Defaults to
+ leaving that decision to the CDROM driver. This option will fail
+ with anything but a CDROM as underlying devices.
+
+ part=n
+ Select partition number n from the devices. This option only makes
+ sense for CDROMs because they can't be partitioned under Linux.
+ For disk devices the generic partition parsing code does this
+ for us. Defaults to not parsing the partition table at all.
+
+ decompose
+ Decompose file name characters.
+
+ nodecompose
+ Do not decompose file name characters.
+
+ force
+ Used to force write access to volumes that are marked as journalled
+ or locked. Use at your own risk.
+
+ nls=cccc
+ Encoding to use when presenting file names.
+
+
+References
+==========
+
+kernel source: <file:fs/hfsplus>
+
+Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html
diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt
new file mode 100644
index 00000000000..74630bd504f
--- /dev/null
+++ b/Documentation/filesystems/hpfs.txt
@@ -0,0 +1,296 @@
+Read/Write HPFS 2.09
+1998-2004, Mikulas Patocka
+
+email: mikulas@artax.karlin.mff.cuni.cz
+homepage: http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi
+
+CREDITS:
+Chris Smith, 1993, original read-only HPFS, some code and hpfs structures file
+ is taken from it
+Jacques Gelinas, MSDos mmap, Inspired by fs/nfs/mmap.c (Jon Tombs 15 Aug 1993)
+Werner Almesberger, 1992, 1993, MSDos option parser & CR/LF conversion
+
+Mount options
+
+uid=xxx,gid=xxx,umask=xxx (default uid=gid=0 umask=default_system_umask)
+ Set owner/group/mode for files that do not have it specified in extended
+ attributes. Mode is inverted umask - for example umask 027 gives owner
+ all permission, group read permission and anybody else no access. Note
+ that for files mode is anded with 0666. If you want files to have 'x'
+ rights, you must use extended attributes.
+case=lower,asis (default asis)
+ File name lowercasing in readdir.
+conv=binary,text,auto (default binary)
+ CR/LF -> LF conversion, if auto, decision is made according to extension
+ - there is a list of text extensions (I think it's better to not convert
+ text file than to damage binary file). If you want to change that list,
+ change it in the source. Original readonly HPFS contained some strange
+ heuristic algorithm that I removed. I think it's dangerous to let the
+ computer decide whether file is text or binary. For example, DJGPP
+ binaries contain small text message at the beginning and they could be
+ misidentified and damaged under some circumstances.
+check=none,normal,strict (default normal)
+ Check level. Selecting none will cause only little speedup and big
+ danger. I tried to write it so that it won't crash if check=normal on
+ corrupted filesystems. check=strict means many superfluous checks -
+ used for debugging (for example it checks if file is allocated in
+ bitmaps when accessing it).
+errors=continue,remount-ro,panic (default remount-ro)
+ Behaviour when filesystem errors found.
+chkdsk=no,errors,always (default errors)
+ When to mark filesystem dirty so that OS/2 checks it.
+eas=no,ro,rw (default rw)
+ What to do with extended attributes. 'no' - ignore them and use always
+ values specified in uid/gid/mode options. 'ro' - read extended
+ attributes but do not create them. 'rw' - create extended attributes
+ when you use chmod/chown/chgrp/mknod/ln -s on the filesystem.
+timeshift=(-)nnn (default 0)
+ Shifts the time by nnn seconds. For example, if you see under linux
+ one hour more than under os/2, use timeshift=-3600.
+
+
+File names
+
+As in OS/2, filenames are case insensitive. However, shell thinks that names
+are case sensitive, so for example when you create a file FOO, you can use
+'cat FOO', 'cat Foo', 'cat foo' or 'cat F*' but not 'cat f*'. Note, that you
+also won't be able to compile linux kernel (and maybe other things) on HPFS
+because kernel creates different files with names like bootsect.S and
+bootsect.s. When searching for file whose name has characters >= 128, codepages
+are used - see below.
+OS/2 ignores dots and spaces at the end of file name, so this driver does as
+well. If you create 'a. ...', the file 'a' will be created, but you can still
+access it under names 'a.', 'a..', 'a . . . ' etc.
+
+
+Extended attributes
+
+On HPFS partitions, OS/2 can associate to each file a special information called
+extended attributes. Extended attributes are pairs of (key,value) where key is
+an ascii string identifying that attribute and value is any string of bytes of
+variable length. OS/2 stores window and icon positions and file types there. So
+why not use it for unix-specific info like file owner or access rights? This
+driver can do it. If you chown/chgrp/chmod on a hpfs partition, extended
+attributes with keys "UID", "GID" or "MODE" and 2-byte values are created. Only
+those extended attributes whose values differ from defaults specified in mount
+options are created. Once created, the extended attributes are never deleted,
+they're just changed. It means that when your default uid=0 and you type
+something like 'chown luser file; chown root file' the file will contain
+extended attribute UID=0. And when you umount the fs and mount it again with
+uid=luser_uid, the file will be still owned by root! If you chmod file to 444,
+extended attribute "MODE" will not be set, this special case is done by setting
+read-only flag. When you mknod a block or char device, besides "MODE", the
+special 4-byte extended attribute "DEV" will be created containing the device
+number. Currently this driver cannot resize extended attributes - it means
+that if somebody (I don't know who?) has set "UID", "GID", "MODE" or "DEV"
+attributes with different sizes, they won't be rewritten and changing these
+values doesn't work.
+
+
+Symlinks
+
+You can do symlinks on HPFS partition, symlinks are achieved by setting extended
+attribute named "SYMLINK" with symlink value. Like on ext2, you can chown and
+chgrp symlinks but I don't know what is it good for. chmoding symlink results
+in chmoding file where symlink points. These symlinks are just for Linux use and
+incompatible with OS/2. OS/2 PmShell symlinks are not supported because they are
+stored in very crazy way. They tried to do it so that link changes when file is
+moved ... sometimes it works. But the link is partly stored in directory
+extended attributes and partly in OS2SYS.INI. I don't want (and don't know how)
+to analyze or change OS2SYS.INI.
+
+
+Codepages
+
+HPFS can contain several uppercasing tables for several codepages and each
+file has a pointer to codepage its name is in. However OS/2 was created in
+America where people don't care much about codepages and so multiple codepages
+support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk.
+Once I booted English OS/2 working in cp 850 and I created a file on my 852
+partition. It marked file name codepage as 850 - good. But when I again booted
+Czech OS/2, the file was completely inaccessible under any name. It seems that
+OS/2 uppercases the search pattern with its system code page (852) and file
+name it's comparing to with its code page (850). These could never match. Is it
+really what IBM developers wanted? But problems continued. When I created in
+Czech OS/2 another file in that directory, that file was inaccessible too. OS/2
+probably uses different uppercasing method when searching where to place a file
+(note, that files in HPFS directory must be sorted) and when searching for
+a file. Finally when I opened this directory in PmShell, PmShell crashed (the
+funny thing was that, when rebooted, PmShell tried to reopen this directory
+again :-). chkdsk happily ignores these errors and only low-level disk
+modification saved me. Never mix different language versions of OS/2 on one
+system although HPFS was designed to allow that.
+OK, I could implement complex codepage support to this driver but I think it
+would cause more problems than benefit with such buggy implementation in OS/2.
+So this driver simply uses first codepage it finds for uppercasing and
+lowercasing no matter what's file codepage index. Usually all file names are in
+this codepage - if you don't try to do what I described above :-)
+
+
+Known bugs
+
+HPFS386 on OS/2 server is not supported. HPFS386 installed on normal OS/2 client
+should work. If you have OS/2 server, use only read-only mode. I don't know how
+to handle some HPFS386 structures like access control list or extended perm
+list, I don't know how to delete them when file is deleted and how to not
+overwrite them with extended attributes. Send me some info on these structures
+and I'll make it. However, this driver should detect presence of HPFS386
+structures, remount read-only and not destroy them (I hope).
+
+When there's not enough space for extended attributes, they will be truncated
+and no error is returned.
+
+OS/2 can't access files if the path is longer than about 256 chars but this
+driver allows you to do it. chkdsk ignores such errors.
+
+Sometimes you won't be able to delete some files on a very full filesystem
+(returning error ENOSPC). That's because file in non-leaf node in directory tree
+(one directory, if it's large, has dirents in tree on HPFS) must be replaced
+with another node when deleted. And that new file might have larger name than
+the old one so the new name doesn't fit in directory node (dnode). And that
+would result in directory tree splitting, that takes disk space. Workaround is
+to delete other files that are leaf (probability that the file is non-leaf is
+about 1/50) or to truncate file first to make some space.
+You encounter this problem only if you have many directories so that
+preallocated directory band is full i.e.
+ number_of_directories / size_of_filesystem_in_mb > 4.
+
+You can't delete open directories.
+
+You can't rename over directories (what is it good for?).
+
+Renaming files so that only case changes doesn't work. This driver supports it
+but vfs doesn't. Something like 'mv file FILE' won't work.
+
+All atimes and directory mtimes are not updated. That's because of performance
+reasons. If you extremely wish to update them, let me know, I'll write it (but
+it will be slow).
+
+When the system is out of memory and swap, it may slightly corrupt filesystem
+(lost files, unbalanced directories). (I guess all filesystems may do it).
+
+When compiled, you get warning: function declaration isn't a prototype. Does
+anybody know what does it mean?
+
+
+What does "unbalanced tree" message mean?
+
+Old versions of this driver created sometimes unbalanced dnode trees. OS/2
+chkdsk doesn't scream if the tree is unbalanced (and sometimes creates
+unbalanced trees too :-) but both HPFS and HPFS386 contain bug that it rarely
+crashes when the tree is not balanced. This driver handles unbalanced trees
+correctly and writes warning if it finds them. If you see this message, this is
+probably because of directories created with old version of this driver.
+Workaround is to move all files from that directory to another and then back
+again. Do it in Linux, not OS/2! If you see this message in directory that is
+wholly created by this driver, it is a BUG - let me know about it.
+
+
+Bugs in OS/2
+
+When you have two (or more) lost directories pointing each to other, chkdsk
+locks up when repairing filesystem.
+
+Sometimes (I think it's random) when you create a file with one-char name under
+OS/2, OS/2 marks it as 'long'. chkdsk then removes this flag saying "Minor fs
+error corrected".
+
+File names like "a .b" are marked as 'long' by OS/2 but chkdsk "corrects" it and
+marks them as short (and writes "minor fs error corrected"). This bug is not in
+HPFS386.
+
+Codepage bugs described above.
+
+If you don't install fixpacks, there are many, many more...
+
+
+History
+
+0.90 First public release
+0.91 Fixed bug that caused shooting to memory when write_inode was called on
+ open inode (rarely happened)
+0.92 Fixed a little memory leak in freeing directory inodes
+0.93 Fixed bug that locked up the machine when there were too many filenames
+ with first 15 characters same
+ Fixed write_file to zero file when writing behind file end
+0.94 Fixed a little memory leak when trying to delete busy file or directory
+0.95 Fixed a bug that i_hpfs_parent_dir was not updated when moving files
+1.90 First version for 2.1.1xx kernels
+1.91 Fixed a bug that chk_sectors failed when sectors were at the end of disk
+ Fixed a race-condition when write_inode is called while deleting file
+ Fixed a bug that could possibly happen (with very low probability) when
+ using 0xff in filenames
+ Rewritten locking to avoid race-conditions
+ Mount option 'eas' now works
+ Fsync no longer returns error
+ Files beginning with '.' are marked hidden
+ Remount support added
+ Alloc is not so slow when filesystem becomes full
+ Atimes are no more updated because it slows down operation
+ Code cleanup (removed all commented debug prints)
+1.92 Corrected a bug when sync was called just before closing file
+1.93 Modified, so that it works with kernels >= 2.1.131, I don't know if it
+ works with previous versions
+ Fixed a possible problem with disks > 64G (but I don't have one, so I can't
+ test it)
+ Fixed a file overflow at 2G
+ Added new option 'timeshift'
+ Changed behaviour on HPFS386: It is now possible to operate on HPFS386 in
+ read-only mode
+ Fixed a bug that slowed down alloc and prevented allocating 100% space
+ (this bug was not destructive)
+1.94 Added workaround for one bug in Linux
+ Fixed one buffer leak
+ Fixed some incompatibilities with large extended attributes (but it's still
+ not 100% ok, I have no info on it and OS/2 doesn't want to create them)
+ Rewritten allocation
+ Fixed a bug with i_blocks (du sometimes didn't display correct values)
+ Directories have no longer archive attribute set (some programs don't like
+ it)
+ Fixed a bug that it set badly one flag in large anode tree (it was not
+ destructive)
+1.95 Fixed one buffer leak, that could happen on corrupted filesystem
+ Fixed one bug in allocation in 1.94
+1.96 Added workaround for one bug in OS/2 (HPFS locked up, HPFS386 reported
+ error sometimes when opening directories in PMSHELL)
+ Fixed a possible bitmap race
+ Fixed possible problem on large disks
+ You can now delete open files
+ Fixed a nondestructive race in rename
+1.97 Support for HPFS v3 (on large partitions)
+ Fixed a bug that it didn't allow creation of files > 128M (it should be 2G)
+1.97.1 Changed names of global symbols
+ Fixed a bug when chmoding or chowning root directory
+1.98 Fixed a deadlock when using old_readdir
+ Better directory handling; workaround for "unbalanced tree" bug in OS/2
+1.99 Corrected a possible problem when there's not enough space while deleting
+ file
+ Now it tries to truncate the file if there's not enough space when deleting
+ Removed a lot of redundant code
+2.00 Fixed a bug in rename (it was there since 1.96)
+ Better anti-fragmentation strategy
+2.01 Fixed problem with directory listing over NFS
+ Directory lseek now checks for proper parameters
+ Fixed race-condition in buffer code - it is in all filesystems in Linux;
+ when reading device (cat /dev/hda) while creating files on it, files
+ could be damaged
+2.02 Workaround for bug in breada in Linux. breada could cause accesses beyond
+ end of partition
+2.03 Char, block devices and pipes are correctly created
+ Fixed non-crashing race in unlink (Alexander Viro)
+ Now it works with Japanese version of OS/2
+2.04 Fixed error when ftruncate used to extend file
+2.05 Fixed crash when got mount parameters without =
+ Fixed crash when allocation of anode failed due to full disk
+ Fixed some crashes when block io or inode allocation failed
+2.06 Fixed some crash on corrupted disk structures
+ Better allocation strategy
+ Reschedule points added so that it doesn't lock CPU long time
+ It should work in read-only mode on Warp Server
+2.07 More fixes for Warp Server. Now it really works
+2.08 Creating new files is not so slow on large disks
+ An attempt to sync deleted file does not generate filesystem error
+2.09 Fixed error on extremely fragmented files
+
+
+ vim: set textwidth=80:
diff --git a/Documentation/filesystems/inotify.txt b/Documentation/filesystems/inotify.txt
new file mode 100644
index 00000000000..cfd02712b83
--- /dev/null
+++ b/Documentation/filesystems/inotify.txt
@@ -0,0 +1,270 @@
+ inotify
+ a powerful yet simple file change notification system
+
+
+
+Document started 15 Mar 2005 by Robert Love <rml@novell.com>
+
+
+(i) User Interface
+
+Inotify is controlled by a set of three system calls and normal file I/O on a
+returned file descriptor.
+
+First step in using inotify is to initialise an inotify instance:
+
+ int fd = inotify_init ();
+
+Each instance is associated with a unique, ordered queue.
+
+Change events are managed by "watches". A watch is an (object,mask) pair where
+the object is a file or directory and the mask is a bit mask of one or more
+inotify events that the application wishes to receive. See <linux/inotify.h>
+for valid events. A watch is referenced by a watch descriptor, or wd.
+
+Watches are added via a path to the file.
+
+Watches on a directory will return events on any files inside of the directory.
+
+Adding a watch is simple:
+
+ int wd = inotify_add_watch (fd, path, mask);
+
+Where "fd" is the return value from inotify_init(), path is the path to the
+object to watch, and mask is the watch mask (see <linux/inotify.h>).
+
+You can update an existing watch in the same manner, by passing in a new mask.
+
+An existing watch is removed via
+
+ int ret = inotify_rm_watch (fd, wd);
+
+Events are provided in the form of an inotify_event structure that is read(2)
+from a given inotify instance. The filename is of dynamic length and follows
+the struct. It is of size len. The filename is padded with null bytes to
+ensure proper alignment. This padding is reflected in len.
+
+You can slurp multiple events by passing a large buffer, for example
+
+ ssize_t len = read (fd, buf, BUF_LEN);
+
+Where "buf" is a pointer to an array of "inotify_event" structures at least
+BUF_LEN bytes in size. The above example will return as many events as are
+available and fit in BUF_LEN.
+
+Each inotify instance fd is also select()- and poll()-able.
+
+You can find the size of the current event queue via the standard FIONREAD
+ioctl on the fd returned by inotify_init().
+
+All watches are destroyed and cleaned up on close.
+
+
+(ii)
+
+Prototypes:
+
+ int inotify_init (void);
+ int inotify_add_watch (int fd, const char *path, __u32 mask);
+ int inotify_rm_watch (int fd, __u32 wd);
+
+
+(iii) Kernel Interface
+
+Inotify's kernel API consists of a set of functions for managing watches and an
+event callback.
+
+To use the kernel API, you must first initialize an inotify instance with a set
+of inotify_operations. You are given an opaque inotify_handle, which you use
+for any further calls to inotify.
+
+ struct inotify_handle *ih = inotify_init(my_event_handler);
+
+You must provide a function for processing events and a function for destroying
+the inotify watch.
+
+ void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
+ u32 cookie, const char *name, struct inode *inode)
+
+ watch - the pointer to the inotify_watch that triggered this call
+ wd - the watch descriptor
+ mask - describes the event that occurred
+ cookie - an identifier for synchronizing events
+ name - the dentry name for affected files in a directory-based event
+ inode - the affected inode in a directory-based event
+
+ void destroy_watch(struct inotify_watch *watch)
+
+You may add watches by providing a pre-allocated and initialized inotify_watch
+structure and specifying the inode to watch along with an inotify event mask.
+You must pin the inode during the call. You will likely wish to embed the
+inotify_watch structure in a structure of your own which contains other
+information about the watch. Once you add an inotify watch, it is immediately
+subject to removal depending on filesystem events. You must grab a reference if
+you depend on the watch hanging around after the call.
+
+ inotify_init_watch(&my_watch->iwatch);
+ get_inotify_watch(&my_watch->iwatch); // optional
+ s32 wd = inotify_add_watch(ih, &my_watch->iwatch, inode, mask);
+ put_inotify_watch(&my_watch->iwatch); // optional
+
+You may use the watch descriptor (wd) or the address of the inotify_watch for
+other inotify operations. You must not directly read or manipulate data in the
+inotify_watch. Additionally, you must not call inotify_add_watch() more than
+once for a given inotify_watch structure, unless you have first called either
+inotify_rm_watch() or inotify_rm_wd().
+
+To determine if you have already registered a watch for a given inode, you may
+call inotify_find_watch(), which gives you both the wd and the watch pointer for
+the inotify_watch, or an error if the watch does not exist.
+
+ wd = inotify_find_watch(ih, inode, &watchp);
+
+You may use container_of() on the watch pointer to access your own data
+associated with a given watch. When an existing watch is found,
+inotify_find_watch() bumps the refcount before releasing its locks. You must
+put that reference with:
+
+ put_inotify_watch(watchp);
+
+Call inotify_find_update_watch() to update the event mask for an existing watch.
+inotify_find_update_watch() returns the wd of the updated watch, or an error if
+the watch does not exist.
+
+ wd = inotify_find_update_watch(ih, inode, mask);
+
+An existing watch may be removed by calling either inotify_rm_watch() or
+inotify_rm_wd().
+
+ int ret = inotify_rm_watch(ih, &my_watch->iwatch);
+ int ret = inotify_rm_wd(ih, wd);
+
+A watch may be removed while executing your event handler with the following:
+
+ inotify_remove_watch_locked(ih, iwatch);
+
+Call inotify_destroy() to remove all watches from your inotify instance and
+release it. If there are no outstanding references, inotify_destroy() will call
+your destroy_watch op for each watch.
+
+ inotify_destroy(ih);
+
+When inotify removes a watch, it sends an IN_IGNORED event to your callback.
+You may use this event as an indication to free the watch memory. Note that
+inotify may remove a watch due to filesystem events, as well as by your request.
+If you use IN_ONESHOT, inotify will remove the watch after the first event, at
+which point you may call the final put_inotify_watch.
+
+(iv) Kernel Interface Prototypes
+
+ struct inotify_handle *inotify_init(struct inotify_operations *ops);
+
+ inotify_init_watch(struct inotify_watch *watch);
+
+ s32 inotify_add_watch(struct inotify_handle *ih,
+ struct inotify_watch *watch,
+ struct inode *inode, u32 mask);
+
+ s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+ struct inotify_watch **watchp);
+
+ s32 inotify_find_update_watch(struct inotify_handle *ih,
+ struct inode *inode, u32 mask);
+
+ int inotify_rm_wd(struct inotify_handle *ih, u32 wd);
+
+ int inotify_rm_watch(struct inotify_handle *ih,
+ struct inotify_watch *watch);
+
+ void inotify_remove_watch_locked(struct inotify_handle *ih,
+ struct inotify_watch *watch);
+
+ void inotify_destroy(struct inotify_handle *ih);
+
+ void get_inotify_watch(struct inotify_watch *watch);
+ void put_inotify_watch(struct inotify_watch *watch);
+
+
+(v) Internal Kernel Implementation
+
+Each inotify instance is represented by an inotify_handle structure.
+Inotify's userspace consumers also have an inotify_device which is
+associated with the inotify_handle, and on which events are queued.
+
+Each watch is associated with an inotify_watch structure. Watches are chained
+off of each associated inotify_handle and each associated inode.
+
+See fs/notify/inotify/inotify_fsnotify.c and fs/notify/inotify/inotify_user.c
+for the locking and lifetime rules.
+
+
+(vi) Rationale
+
+Q: What is the design decision behind not tying the watch to the open fd of
+ the watched object?
+
+A: Watches are associated with an open inotify device, not an open file.
+ This solves the primary problem with dnotify: keeping the file open pins
+ the file and thus, worse, pins the mount. Dnotify is therefore infeasible
+ for use on a desktop system with removable media as the media cannot be
+ unmounted. Watching a file should not require that it be open.
+
+Q: What is the design decision behind using an-fd-per-instance as opposed to
+ an fd-per-watch?
+
+A: An fd-per-watch quickly consumes more file descriptors than are allowed,
+ more fd's than are feasible to manage, and more fd's than are optimally
+ select()-able. Yes, root can bump the per-process fd limit and yes, users
+ can use epoll, but requiring both is a silly and extraneous requirement.
+ A watch consumes less memory than an open file, separating the number
+ spaces is thus sensible. The current design is what user-space developers
+ want: Users initialize inotify, once, and add n watches, requiring but one
+ fd and no twiddling with fd limits. Initializing an inotify instance two
+ thousand times is silly. If we can implement user-space's preferences
+ cleanly--and we can, the idr layer makes stuff like this trivial--then we
+ should.
+
+ There are other good arguments. With a single fd, there is a single
+ item to block on, which is mapped to a single queue of events. The single
+ fd returns all watch events and also any potential out-of-band data. If
+ every fd was a separate watch,
+
+ - There would be no way to get event ordering. Events on file foo and
+ file bar would pop poll() on both fd's, but there would be no way to tell
+ which happened first. A single queue trivially gives you ordering. Such
+ ordering is crucial to existing applications such as Beagle. Imagine
+ "mv a b ; mv b a" events without ordering.
+
+ - We'd have to maintain n fd's and n internal queues with state,
+ versus just one. It is a lot messier in the kernel. A single, linear
+ queue is the data structure that makes sense.
+
+ - User-space developers prefer the current API. The Beagle guys, for
+ example, love it. Trust me, I asked. It is not a surprise: Who'd want
+ to manage and block on 1000 fd's via select?
+
+ - No way to get out of band data.
+
+ - 1024 is still too low. ;-)
+
+ When you talk about designing a file change notification system that
+ scales to 1000s of directories, juggling 1000s of fd's just does not seem
+ the right interface. It is too heavy.
+
+ Additionally, it _is_ possible to have more than one instance and
+ juggle more than one queue and thus more than one associated fd. There
+ need not be a one-fd-per-process mapping; it is one-fd-per-queue and a
+ process can easily want more than one queue.
+
+Q: Why the system call approach?
+
+A: The poor user-space interface is the second biggest problem with dnotify.
+ Signals are a terrible, terrible interface for file notification. Or for
+ anything, for that matter. The ideal solution, from all perspectives, is a
+ file descriptor-based one that allows basic file I/O and poll/select.
+ Obtaining the fd and managing the watches could have been done either via a
+ device file or a family of new system calls. We decided to implement a
+ family of system calls because that is the preferred approach for new kernel
+ interfaces. The only real difference was whether we wanted to use open(2)
+ and ioctl(2) or a couple of new system calls. System calls beat ioctls.
+
diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt
new file mode 100644
index 00000000000..ba0a93384de
--- /dev/null
+++ b/Documentation/filesystems/isofs.txt
@@ -0,0 +1,48 @@
+Mount options that are the same as for msdos and vfat partitions.
+
+ gid=nnn All files in the partition will be in group nnn.
+ uid=nnn All files in the partition will be owned by user id nnn.
+ umask=nnn The permission mask (see umask(1)) for the partition.
+
+Mount options that are the same as vfat partitions. These are only useful
+when using discs encoded using Microsoft's Joliet extensions.
+ iocharset=name Character set to use for converting from Unicode to
+ ASCII. Joliet filenames are stored in Unicode format, but
+ Unix for the most part doesn't know how to deal with Unicode.
+ There is also an option of doing UTF-8 translations with the
+ utf8 option.
+ utf8 Encode Unicode names in UTF-8 format. Default is no.
+
+Mount options unique to the isofs filesystem.
+ block=512 Set the block size for the disk to 512 bytes
+ block=1024 Set the block size for the disk to 1024 bytes
+ block=2048 Set the block size for the disk to 2048 bytes
+ check=relaxed Matches filenames with different cases
+ check=strict Matches only filenames with the exact same case
+ cruft Try to handle badly formatted CDs.
+ map=off Do not map non-Rock Ridge filenames to lower case
+ map=normal Map non-Rock Ridge filenames to lower case
+ map=acorn As map=normal but also apply Acorn extensions if present
+ mode=xxx Sets the permissions on files to xxx unless Rock Ridge
+ extensions set the permissions otherwise
+ dmode=xxx Sets the permissions on directories to xxx unless Rock Ridge
+ extensions set the permissions otherwise
+ overriderockperm Set permissions on files and directories according to
+ 'mode' and 'dmode' even though Rock Ridge extensions are
+ present.
+ nojoliet Ignore Joliet extensions if they are present.
+ norock Ignore Rock Ridge extensions if they are present.
+ hide Completely strip hidden files from the file system.
+ showassoc Show files marked with the 'associated' bit
+ unhide Deprecated; showing hidden files is now default;
+ If given, it is a synonym for 'showassoc' which will
+ recreate previous unhide behavior
+ session=x Select number of session on multisession CD
+ sbsector=xxx Session begins from sector xxx
+
+Recommended documents about ISO 9660 standard are located at:
+http://www.y-adagio.com/
+ftp://ftp.ecma.ch/ecma-st/Ecma-119.pdf
+Quoting from the PDF "This 2nd Edition of Standard ECMA-119 is technically
+identical with ISO 9660.", so it is a valid and gratis substitute for the
+official ISO specification.
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
new file mode 100644
index 00000000000..26ebde77e82
--- /dev/null
+++ b/Documentation/filesystems/jfs.txt
@@ -0,0 +1,41 @@
+IBM's Journaled File System (JFS) for Linux
+
+JFS Homepage: http://jfs.sourceforge.net/
+
+The following mount options are supported:
+
+iocharset=name Character set to use for converting from Unicode to
+ ASCII. The default is to do no conversion. Use
+ iocharset=utf8 for UTF-8 translations. This requires
+ CONFIG_NLS_UTF8 to be set in the kernel .config file.
+ iocharset=none specifies the default behavior explicitly.
+
+resize=value Resize the volume to <value> blocks. JFS only supports
+ growing a volume, not shrinking it. This option is only
+ valid during a remount, when the volume is mounted
+ read-write. The resize keyword with no value will grow
+ the volume to the full size of the partition.
+
+nointegrity Do not write to the journal. The primary use of this option
+ is to allow for higher performance when restoring a volume
+ from backup media. The integrity of the volume is not
+ guaranteed if the system abnormally abends.
+
+integrity Default. Commit metadata changes to the journal. Use this
+ option to remount a volume where the nointegrity option was
+ previously specified in order to restore normal behavior.
+
+errors=continue Keep going on a filesystem error.
+errors=remount-ro Default. Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+
+uid=value Override on-disk uid with specified value
+gid=value Override on-disk gid with specified value
+umask=value Override on-disk umask with specified octal value. For
+ directories, the execute bit will be set if the corresponding
+ read bit is set.
+
+Please send bugs, comments, cards and letters to shaggy@linux.vnet.ibm.com.
+
+The JFS mailing list can be subscribed to by using the link labeled
+"Mail list Subscribe" at our web page http://jfs.sourceforge.net/
diff --git a/Documentation/filesystems/locks.txt b/Documentation/filesystems/locks.txt
new file mode 100644
index 00000000000..2cf81082581
--- /dev/null
+++ b/Documentation/filesystems/locks.txt
@@ -0,0 +1,68 @@
+ File Locking Release Notes
+
+ Andy Walker <andy@lysaker.kvaerner.no>
+
+ 12 May 1997
+
+
+1. What's New?
+--------------
+
+1.1 Broken Flock Emulation
+--------------------------
+
+The old flock(2) emulation in the kernel was swapped for proper BSD
+compatible flock(2) support in the 1.3.x series of kernels. With the
+release of the 2.1.x kernel series, support for the old emulation has
+been totally removed, so that we don't need to carry this baggage
+forever.
+
+This should not cause problems for anybody, since everybody using a
+2.1.x kernel should have updated their C library to a suitable version
+anyway (see the file "Documentation/Changes".)
+
+1.2 Allow Mixed Locks Again
+---------------------------
+
+1.2.1 Typical Problems - Sendmail
+---------------------------------
+Because sendmail was unable to use the old flock() emulation, many sendmail
+installations use fcntl() instead of flock(). This is true of Slackware 3.0
+for example. This gave rise to some other subtle problems if sendmail was
+configured to rebuild the alias file. Sendmail tried to lock the aliases.dir
+file with fcntl() at the same time as the GDBM routines tried to lock this
+file with flock(). With pre 1.3.96 kernels this could result in deadlocks that,
+over time, or under a very heavy mail load, would eventually cause the kernel
+to lock solid with deadlocked processes.
+
+
+1.2.2 The Solution
+------------------
+The solution I have chosen, after much experimentation and discussion,
+is to make flock() and fcntl() locks oblivious to each other. Both can
+exist, and neither will have any effect on the other.
+
+I wanted the two lock styles to be cooperative, but there were so many
+race and deadlock conditions that the current solution was the only
+practical one. It puts us in the same position as, for example, SunOS
+4.1.x and several other commercial Unices. The only OS's that support
+cooperative flock()/fcntl() are those that emulate flock() using
+fcntl(), with all the problems that implies.
+
+
+1.3 Mandatory Locking As A Mount Option
+---------------------------------------
+
+Mandatory locking, as described in
+'Documentation/filesystems/mandatory-locking.txt' was prior to this release a
+general configuration option that was valid for all mounted filesystems. This
+had a number of inherent dangers, not the least of which was the ability to
+freeze an NFS server by asking it to read a file for which a mandatory lock
+existed.
+
+From this release of the kernel, mandatory locking can be turned on and off
+on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
+The default is to disallow mandatory locking. The intention is that
+mandatory locking only be enabled on a local filesystem as the specific need
+arises.
+
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 00000000000..bca42c22a14
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
+
+The LogFS Flash Filesystem
+==========================
+
+Specification
+=============
+
+Superblocks
+-----------
+
+Two superblocks exist at the beginning and end of the filesystem.
+Each superblock is 256 Bytes large, with another 3840 Bytes reserved
+for future purposes, making a total of 4096 Bytes.
+
+Superblock locations may differ for MTD and block devices. On MTD the
+first non-bad block contains a superblock in the first 4096 Bytes and
+the last non-bad block contains a superblock in the last 4096 Bytes.
+On block devices, the first 4096 Bytes of the device contain the first
+superblock and the last aligned 4096 Byte-block contains the second
+superblock.
+
+For the most part, the superblocks can be considered read-only. They
+are written only to correct errors detected within the superblocks,
+move the journal and change the filesystem parameters through tunefs.
+As a result, the superblock does not contain any fields that require
+constant updates, like the amount of free space, etc.
+
+Segments
+--------
+
+The space in the device is split up into equal-sized segments.
+Segments are the primary write unit of LogFS. Within each segment,
+writes happen from front (low addresses) to back (high addresses). If
+only a partial segment has been written, the segment number, the
+current position within and optionally a write buffer are stored in
+the journal.
+
+Segments are erased as a whole. Therefore Garbage Collection may be
+required to completely free a segment before doing so.
+
+Journal
+--------
+
+The journal contains all global information about the filesystem that
+is subject to frequent change. At mount time, it has to be scanned
+for the most recent commit entry, which contains a list of pointers to
+all currently valid entries.
+
+Object Store
+------------
+
+All space except for the superblocks and journal is part of the object
+store. Each segment contains a segment header and a number of
+objects, each consisting of the object header and the payload.
+Objects are either inodes, directory entries (dentries), file data
+blocks or indirect blocks.
+
+Levels
+------
+
+Garbage collection (GC) may fail if all data is written
+indiscriminately. One requirement of GC is that data is separated
+roughly according to the distance between the tree root and the data.
+Effectively that means all file data is on level 0, indirect blocks
+are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
+respectively. Inode file data is on level 6 for the inodes and 7-11
+for indirect blocks.
+
+Each segment contains objects of a single level only. As a result,
+each level requires its own separate segment to be open for writing.
+
+Inode File
+----------
+
+All inodes are stored in a special file, the inode file. Single
+exception is the inode file's inode (master inode) which for obvious
+reasons is stored in the journal instead. Instead of data blocks, the
+leaf nodes of the inode files are inodes.
+
+Aliases
+-------
+
+Writes in LogFS are done by means of a wandering tree. A naïve
+implementation would require that for each write of a block, all
+parent blocks are written as well, since the block pointers have
+changed. Such an implementation would not be very efficient.
+
+In LogFS, the block pointer changes are cached in the journal by means
+of alias entries. Each alias consists of its logical address - inode
+number, block index, level and child number (index into block) - and
+the changed data. Any 8-byte word can be changed in this manner.
+
+Currently aliases are used for block pointers, file size, file used
+bytes and the height of an inode's indirect tree.
+
+Segment Aliases
+---------------
+
+Related to regular aliases, these are used to handle bad blocks.
+Initially, bad blocks are handled by moving the affected segment
+content to a spare segment and noting this move in the journal with a
+segment alias, a simple (to, from) tuple. GC will later empty this
+segment and the alias can be removed again. This is used on MTD only.
+
+Vim
+---
+
+By cleverly predicting the life time of data, it is possible to
+separate long-living data from short-living data and thereby reduce
+the GC overhead later. Each type of distinct life expectancy (vim) can
+have a separate segment open for writing. Each (level, vim) tuple can
+be open just once. If an open segment with unknown vim is encountered
+at mount time, it is closed and ignored henceforth.
+
+Indirect Tree
+-------------
+
+Inodes in LogFS are similar to FFS-style filesystems with direct and
+indirect block pointers. One difference is that LogFS uses a single
+indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
+A height field in the inode defines the height of the indirect tree
+and thereby the indirection of the pointer.
+
+Another difference is the addressing of indirect blocks. In LogFS,
+the first 16 pointers in the first indirect block are left empty,
+corresponding to the 16 direct pointers in the inode. In ext2 (maybe
+others as well) the first pointer in the first indirect block
+corresponds to logical block 12, skipping the 12 direct pointers.
+So where ext2 is using arithmetic to better utilize space, LogFS keeps
+arithmetic simple and uses compression to save space.
+
+Compression
+-----------
+
+Both file data and metadata can be compressed. Compression for file
+data can be enabled with chattr +c and disabled with chattr -c. Doing
+so has no effect on existing data, but new data will be stored
+accordingly. New inodes will inherit the compression flag of the
+parent directory.
+
+Metadata is always compressed. However, the space accounting ignores
+this and charges for the uncompressed size. Failing to do so could
+result in GC failures when, after moving some data, indirect blocks
+compress worse than previously. Even on a 100% full medium, GC may
+not consume any extra space, so the compression gains are lost space
+to the user.
+
+However, they are not lost space to the filesystem internals. By
+cheating the user for those bytes, the filesystem gained some slack
+space and GC will run less often and faster.
+
+Garbage Collection and Wear Leveling
+------------------------------------
+
+Garbage collection is invoked whenever the number of free segments
+falls below a threshold. The best (known) candidate is picked based
+on the least amount of valid data contained in the segment. All
+remaining valid data is copied elsewhere, thereby invalidating it.
+
+The GC code also checks for aliases and writes them back if their
+number gets too large.
+
+Wear leveling is done by occasionally picking a suboptimal segment for
+garbage collection. If a stale segment's erase count is significantly
+lower than the active segments' erase counts, it will be picked. Wear
+leveling is rate limited, so it will never monopolize the device for
+more than one segment worth at a time.
+
+Values for "occasionally", "significantly lower" are compile time
+constants.
+
+Hashed directories
+------------------
+
+To satisfy efficient lookup(), directory entries are hashed and
+located based on the hash. In order to both support large directories
+and not be overly inefficient for small directories, several hash
+tables of increasing size are used. For each table, the hash value
+modulo the table size gives the table index.
+
+Table sizes are chosen to limit the number of indirect blocks with a
+fully populated table to 0, 1, 2 or 3 respectively. So the first
+table contains 16 entries, the second 512-16, etc.
+
+The last table is special in several ways. First its size depends on
+the effective 32bit limit on telldir/seekdir cookies. Since logfs
+uses the upper half of the address space for indirect blocks, the size
+is limited to 2^31. Secondly the table contains hash buckets with 16
+entries each.
+
+Using single-entry buckets would result in birthday "attacks". At
+just 2^16 used entries, hash collisions would be likely (P >= 0.5).
+My math skills are insufficient to do the combinatorics for the 17x
+collisions necessary to overflow a bucket, but testing showed that in
+10,000 runs the lowest directory fill before a bucket overflow was
+188,057,130 entries with an average of 315,149,915 entries. So for
+directory sizes of up to a million, bucket overflows should be
+virtually impossible under normal circumstances.
+
+With carefully chosen filenames, it is obviously possible to cause an
+overflow with just 21 entries (4 higher tables + 16 entries + 1). So
+there may be a security concern if a malicious user has write access
+to a directory.
+
+Open For Discussion
+===================
+
+Device Address Space
+--------------------
+
+A device address space is used for caching. Both block devices and
+MTD provide functions to either read a single page or write a segment.
+Partial segments may be written for data integrity, but where possible
+complete segments are written for performance on simple block device
+flash media.
+
+Meta Inodes
+-----------
+
+Inodes are stored in the inode file, which is just a regular file for
+most purposes. At umount time, however, the inode file needs to
+remain open until all dirty inodes are written. So
+generic_shutdown_super() may not close this inode, but shouldn't
+complain about remaining inodes due to the inode file either. Same
+goes for the mapping inode of the device address space.
+
+Currently logfs uses a hack that essentially copies part of fs/inode.c
+code over. A general solution would be preferred.
+
+Indirect block mapping
+----------------------
+
+With compression, the block device (or mapping inode) cannot be used
+to cache indirect blocks. Some other place is required. Currently
+logfs uses the top half of each inode's address space. The low 8TB
+(on 32bit) are filled with file data, the high 8TB are used for
+indirect blocks.
+
+One problem is that 16TB files created on 64bit systems actually have
+data in the top 8TB. But files >16TB would cause problems anyway, so
+only the limit has changed.
diff --git a/Documentation/filesystems/mandatory-locking.txt b/Documentation/filesystems/mandatory-locking.txt
new file mode 100644
index 00000000000..0979d1d2ca8
--- /dev/null
+++ b/Documentation/filesystems/mandatory-locking.txt
@@ -0,0 +1,171 @@
+ Mandatory File Locking For The Linux Operating System
+
+ Andy Walker <andy@lysaker.kvaerner.no>
+
+ 15 April 1996
+ (Updated September 2007)
+
+0. Why you should avoid mandatory locking
+-----------------------------------------
+
+The Linux implementation is prey to a number of difficult-to-fix race
+conditions which in practice make it not dependable:
+
+ - The write system call checks for a mandatory lock only once
+ at its start. It is therefore possible for a lock request to
+ be granted after this check but before the data is modified.
+ A process may then see file data change even while a mandatory
+ lock was held.
+ - Similarly, an exclusive lock may be granted on a file after
+ the kernel has decided to proceed with a read, but before the
+ read has actually completed, and the reading process may see
+ the file data in a state which should not have been visible
+ to it.
+ - Similar races make the claimed mutual exclusion between lock
+ and mmap similarly unreliable.
+
+1. What is mandatory locking?
+------------------------------
+
+Mandatory locking is kernel enforced file locking, as opposed to the more usual
+cooperative file locking used to guarantee sequential access to files among
+processes. File locks are applied using the flock() and fcntl() system calls
+(and the lockf() library routine which is a wrapper around fcntl().) It is
+normally a process' responsibility to check for locks on a file it wishes to
+update, before applying its own lock, updating the file and unlocking it again.
+The most commonly used example of this (and in the case of sendmail, the most
+troublesome) is access to a user's mailbox. The mail user agent and the mail
+transfer agent must guard against updating the mailbox at the same time, and
+prevent reading the mailbox while it is being updated.
+
+In a perfect world all processes would use and honour a cooperative, or
+"advisory" locking scheme. However, the world isn't perfect, and there's
+a lot of poorly written code out there.
+
+In trying to address this problem, the designers of System V UNIX came up
+with a "mandatory" locking scheme, whereby the operating system kernel would
+block attempts by a process to write to a file that another process holds a
+"read" -or- "shared" lock on, and block attempts to both read and write to a
+file that a process holds a "write" -or- "exclusive" lock on.
+
+The System V mandatory locking scheme was intended to have as little impact as
+possible on existing user code. The scheme is based on marking individual files
+as candidates for mandatory locking, and using the existing fcntl()/lockf()
+interface for applying locks just as if they were normal, advisory locks.
+
+Note 1: In saying "file" in the paragraphs above I am actually not telling
+the whole truth. System V locking is based on fcntl(). The granularity of
+fcntl() is such that it allows the locking of byte ranges in files, in addition
+to entire files, so the mandatory locking rules also have byte level
+granularity.
+
+Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite
+borrowing the fcntl() locking scheme from System V. The mandatory locking
+scheme is defined by the System V Interface Definition (SVID) Version 3.
+
+2. Marking a file for mandatory locking
+---------------------------------------
+
+A file is marked as a candidate for mandatory locking by setting the group-id
+bit in its file mode but removing the group-execute bit. This is an otherwise
+meaningless combination, and was chosen by the System V implementors so as not
+to break existing user programs.
+
+Note that the group-id bit is usually automatically cleared by the kernel when
+a setgid file is written to. This is a security measure. The kernel has been
+modified to recognize the special case of a mandatory lock candidate and to
+refrain from clearing this bit. Similarly the kernel has been modified not
+to run mandatory lock candidates with setgid privileges.
+
+3. Available implementations
+----------------------------
+
+I have considered the implementations of mandatory locking available with
+SunOS 4.1.x, Solaris 2.x and HP-UX 9.x.
+
+Generally I have tried to make the most sense out of the behaviour exhibited
+by these three reference systems. There are many anomalies.
+
+All the reference systems reject all calls to open() for a file on which
+another process has outstanding mandatory locks. This is in direct
+contravention of SVID 3, which states that only calls to open() with the
+O_TRUNC flag set should be rejected. The Linux implementation follows the SVID
+definition, which is the "Right Thing", since only calls with O_TRUNC can
+modify the contents of the file.
+
+HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not
+just mandatory locks. That would appear to contravene POSIX.1.
+
+mmap() is another interesting case. All the operating systems mentioned
+prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX
+also disallows advisory locks for such a file. SVID actually specifies the
+paranoid HP-UX behaviour.
+
+In my opinion only MAP_SHARED mappings should be immune from locking, and then
+only from mandatory locks - that is what is currently implemented.
+
+SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for
+mandatory locks, so reads and writes to locked files always block when they
+should return EAGAIN.
+
+I'm afraid that this is such an esoteric area that the semantics described
+below are just as valid as any others, so long as the main points seem to
+agree.
+
+4. Semantics
+------------
+
+1. Mandatory locks can only be applied via the fcntl()/lockf() locking
+ interface - in other words the System V/POSIX interface. BSD style
+ locks using flock() never result in a mandatory lock.
+
+2. If a process has locked a region of a file with a mandatory read lock, then
+ other processes are permitted to read from that region. If any of these
+ processes attempts to write to the region it will block until the lock is
+ released, unless the process has opened the file with the O_NONBLOCK
+ flag in which case the system call will return immediately with the error
+ status EAGAIN.
+
+3. If a process has locked a region of a file with a mandatory write lock, all
+ attempts to read or write to that region block until the lock is released,
+ unless a process has opened the file with the O_NONBLOCK flag in which case
+ the system call will return immediately with the error status EAGAIN.
+
+4. Calls to open() with O_TRUNC, or to creat(), on an existing file that has
+ any mandatory locks owned by other processes will be rejected with the
+ error status EAGAIN.
+
+5. Attempts to apply a mandatory lock to a file that is memory mapped and
+ shared (via mmap() with MAP_SHARED) will be rejected with the error status
+ EAGAIN.
+
+6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED)
+ that has any mandatory locks in effect will be rejected with the error status
+ EAGAIN.
+
+5. Which system calls are affected?
+-----------------------------------
+
+Those which modify a file's contents, not just the inode. That gives read(),
+write(), readv(), writev(), open(), creat(), mmap(), truncate() and
+ftruncate(). truncate() and ftruncate() are considered to be "write" actions
+for the purposes of mandatory locking.
+
+The affected region is usually defined as stretching from the current position
+for the total number of bytes read or written. For the truncate calls it is
+defined as the bytes of a file removed or added (we must also consider bytes
+added, as a lock can specify just "the whole file", rather than a specific
+range of bytes.)
+
+Note 3: I may have overlooked some system calls that need mandatory lock
+checking in my eagerness to get this code out the door. Please let me know, or
+better still fix the system calls yourself and submit a patch to me or Linus.
+
+6. Warning!
+-----------
+
+Not even root can override a mandatory lock, so runaway processes can wreak
+havoc if they lock crucial files. The way around it is to change the file
+permissions (remove the setgid bit) before trying to read or write to it.
+Of course, that might be a bit tricky if the system is hung :-(
+
diff --git a/Documentation/filesystems/ncpfs.txt b/Documentation/filesystems/ncpfs.txt
new file mode 100644
index 00000000000..5af164f4b37
--- /dev/null
+++ b/Documentation/filesystems/ncpfs.txt
@@ -0,0 +1,12 @@
+The ncpfs filesystem understands the NCP protocol, designed by the
+Novell Corporation for their NetWare(tm) product. NCP is functionally
+similar to the NFS used in the TCP/IP community.
+To mount a NetWare filesystem, you need a special mount program, which
+can be found in the ncpfs package. The home site for ncpfs is
+ftp.gwdg.de/pub/linux/misc/ncpfs, but sunsite and its many mirrors
+will have it as well.
+
+Related products are linware and mars_nwe, which will give Linux partial
+NetWare server functionality.
+
+mars_nwe can be found on ftp.gwdg.de/pub/linux/misc/ncpfs.
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
new file mode 100644
index 00000000000..1716874a651
--- /dev/null
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -0,0 +1,22 @@
+00-INDEX
+ - this file (nfs-related documentation).
+Exporting
+ - explanation of how to make filesystems exportable.
+fault_injection.txt
+ - information for using fault injection on the server
+knfsd-stats.txt
+ - statistics which the NFS server makes available to user space.
+nfs.txt
+ - nfs client, and DNS resolution for fs_locations.
+nfs41-server.txt
+ - info on the Linux server implementation of NFSv4 minor version 1.
+nfs-rdma.txt
+ - how to install and set up the Linux NFS/RDMA client and server software
+nfsroot.txt
+ - short guide on setting up a diskless box with NFS root filesystem.
+pnfs.txt
+ - short explanation of some of the internals of the pnfs client code
+rpc-cache.txt
+ - introduction to the caching mechanisms in the sunrpc layer.
+idmapper.txt
+ - information for configuring request-keys to be used by idmapper
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
new file mode 100644
index 00000000000..09994c24728
--- /dev/null
+++ b/Documentation/filesystems/nfs/Exporting
@@ -0,0 +1,154 @@
+
+Making Filesystems Exportable
+=============================
+
+Overview
+--------
+
+All filesystem operations require a dentry (or two) as a starting
+point. Local applications have a reference-counted hold on suitable
+dentries via open file descriptors or cwd/root. However remote
+applications that access a filesystem via a remote filesystem protocol
+such as NFS may not be able to hold such a reference, and so need a
+different way to refer to a particular dentry. As the alternative
+form of reference needs to be stable across renames, truncates, and
+server-reboot (among other things, though these tend to be the most
+problematic), there is no simple answer like 'filename'.
+
+The mechanism discussed here allows each filesystem implementation to
+specify how to generate an opaque (outside of the filesystem) byte
+string for any dentry, and how to find an appropriate dentry for any
+given opaque byte string.
+This byte string will be called a "filehandle fragment" as it
+corresponds to part of an NFS filehandle.
+
+A filesystem which supports the mapping between filehandle fragments
+and dentries will be termed "exportable".
+
+
+
+Dcache Issues
+-------------
+
+The dcache normally contains a proper prefix of any given filesystem
+tree. This means that if any filesystem object is in the dcache, then
+all of the ancestors of that filesystem object are also in the dcache.
+As normal access is by filename this prefix is created naturally and
+maintained easily (by each object maintaining a reference count on
+its parent).
+
+However when objects are included into the dcache by interpreting a
+filehandle fragment, there is no automatic creation of a path prefix
+for the object. This leads to two related but distinct features of
+the dcache that are not needed for normal filesystem access.
+
+1/ The dcache must sometimes contain objects that are not part of the
+ proper prefix. i.e. that are not connected to the root.
+2/ The dcache must be prepared for a newly found (via ->lookup) directory
+ to already have a (non-connected) dentry, and must be able to move
+ that dentry into place (based on the parent and name in the
+ ->lookup). This is particularly needed for directories as
+ it is a dcache invariant that directories only have one dentry.
+
+To implement these features, the dcache has:
+
+a/ A dentry flag DCACHE_DISCONNECTED which is set on
+ any dentry that might not be part of the proper prefix.
+ This is set when anonymous dentries are created, and cleared when a
+ dentry is noticed to be a child of a dentry which is in the proper
+ prefix.
+
+b/ A per-superblock list "s_anon" of dentries which are the roots of
+ subtrees that are not in the proper prefix. These dentries, as
+ well as the proper prefix, need to be released at unmount time. As
+ these dentries will not be hashed, they are linked together on the
+ d_hash list_head.
+
+c/ Helper routines to allocate anonymous dentries, and to help attach
+ loose directory dentries at lookup time. They are:
+ d_alloc_anon(inode) will return a dentry for the given inode.
+ If the inode already has a dentry, one of those is returned.
+ If it doesn't, a new anonymous (IS_ROOT and
+ DCACHE_DISCONNECTED) dentry is allocated and attached.
+ In the case of a directory, care is taken that only one dentry
+ can ever be attached.
+ d_splice_alias(inode, dentry) will make sure that there is a
+ dentry with the same name and parent as the given dentry, and
+ which refers to the given inode.
+ If the inode is a directory and already has a dentry, then that
+ dentry is d_moved over the given dentry.
+ If the passed dentry gets attached, care is taken that this is
+ mutually exclusive to a d_alloc_anon operation.
+ If the passed dentry is used, NULL is returned, else the used
+ dentry is returned. This corresponds to the calling pattern of
+ ->lookup.
+
+
+Filesystem Issues
+-----------------
+
+For a filesystem to be exportable it must:
+
+ 1/ provide the filehandle fragment routines described below.
+ 2/ make sure that d_splice_alias is used rather than d_add
+ when ->lookup finds an inode for a given parent and name.
+
+ If inode is NULL, d_splice_alias(inode, dentry) is equivalent to
+
+ d_add(dentry, inode), NULL
+
+ Similarly, d_splice_alias(ERR_PTR(err), dentry) = ERR_PTR(err)
+
+ Typically the ->lookup routine will simply end with a:
+
+ return d_splice_alias(inode, dentry);
+ }
+
+
+
+ A file system implementation declares that instances of the filesystem
+are exportable by setting the s_export_op field in the struct
+super_block. This field must point to a "struct export_operations"
+struct which has the following members:
+
+ encode_fh (optional)
+ Takes a dentry and creates a filehandle fragment which can later be used
+ to find or create a dentry for the same object. The default
+ implementation creates a filehandle fragment that encodes a 32bit inode
+ and generation number for the inode encoded, and if necessary the
+ same information for the parent.
+
+ fh_to_dentry (mandatory)
+ Given a filehandle fragment, this should find the implied object and
+ create a dentry for it (possibly with d_alloc_anon).
+
+ fh_to_parent (optional but strongly recommended)
+ Given a filehandle fragment, this should find the parent of the
+ implied object and create a dentry for it (possibly with d_alloc_anon).
+ May fail if the filehandle fragment is too small.
+
+ get_parent (optional but strongly recommended)
+ When given a dentry for a directory, this should return a dentry for
+ the parent. Quite possibly the parent dentry will have been allocated
+ by d_alloc_anon. The default get_parent function just returns an error
+ so any filehandle lookup that requires finding a parent will fail.
+ ->lookup("..") is *not* used as a default as it can leave ".." entries
+ in the dcache which are too messy to work with.
+
+ get_name (optional)
+ When given a parent dentry and a child dentry, this should find a name
+ in the directory identified by the parent dentry, which leads to the
+ object identified by the child dentry. If no get_name function is
+ supplied, a default implementation is provided which uses vfs_readdir
+ to find potential names, and matches inode numbers to find the correct
+ match.
+
+
+A filehandle fragment consists of an array of 1 or more 4byte words,
+together with a one byte "type".
+The decode_fh routine should not depend on the stated size that is
+passed to it. This size may be larger than the original filehandle
+generated by encode_fh, in which case it will have been padded with
+nuls. Rather, the encode_fh routine should choose a "type" which
+indicates to decode_fh how much of the filehandle is valid, and how
+it should be interpreted.
diff --git a/Documentation/filesystems/nfs/fault_injection.txt b/Documentation/filesystems/nfs/fault_injection.txt
new file mode 100644
index 00000000000..426d166089a
--- /dev/null
+++ b/Documentation/filesystems/nfs/fault_injection.txt
@@ -0,0 +1,69 @@
+
+Fault Injection
+===============
+Fault injection is a method for forcing errors that may not normally occur, or
+may be difficult to reproduce. Forcing these errors in a controlled environment
+can help the developer find and fix bugs before their code is shipped in a
+production system. Injecting an error on the Linux NFS server will allow us to
+observe how the client reacts and if it manages to recover its state correctly.
+
+NFSD_FAULT_INJECTION must be selected when configuring the kernel to use this
+feature.
+
+
+Using Fault Injection
+=====================
+On the client, mount the fault injection server through NFS v4.0+ and do some
+work over NFS (open files, take locks, ...).
+
+On the server, mount the debugfs filesystem to <debug_dir> and ls
+<debug_dir>/nfsd. This will show a list of files that will be used for
+injecting faults on the NFS server. As root, write a number n to the file
+corresponding to the action you want the server to take. The server will then
+process the first n items it finds. So if you want to forget 5 locks, echo '5'
+to <debug_dir>/nfsd/forget_locks. A value of 0 will tell the server to forget
+all corresponding items. A log message will be created containing the number
+of items forgotten (check dmesg).
+
+Go back to work on the client and check if the client recovered from the error
+correctly.
+
+
+Available Faults
+================
+forget_clients:
+ The NFS server keeps a list of clients that have placed a mount call. If
+ this list is cleared, the server will have no knowledge of who the client
+ is, forcing the client to reauthenticate with the server.
+
+forget_openowners:
+ The NFS server keeps a list of what files are currently opened and who
+ they were opened by. Clearing this list will force the client to reopen
+ its files.
+
+forget_locks:
+ The NFS server keeps a list of what files are currently locked in the VFS.
+ Clearing this list will force the client to reclaim its locks (files are
+ unlocked through the VFS as they are cleared from this list).
+
+forget_delegations:
+ A delegation is used to assure the client that a file, or part of a file,
+ has not changed since the delegation was awarded. Clearing this list will
+ force the client to reacquire its delegation before accessing the file
+ again.
+
+recall_delegations:
+ Delegations can be recalled by the server when another client attempts to
+ access a file. This test will notify the client that its delegation has
+ been revoked, forcing the client to reacquire the delegation before using
+ the file again.
+
+
+tools/nfs/inject_faults.sh script
+=================================
+This script has been created to ease the fault injection process. This script
+will detect the mounted debugfs directory and write to the files located there
+based on the arguments passed by the user. For example, running
+`inject_faults.sh forget_locks 1` as root will instruct the server to forget
+one lock. Running `inject_faults.sh forget_locks` will instruct the server to
+forget all locks.
diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt
new file mode 100644
index 00000000000..fe03d10bb79
--- /dev/null
+++ b/Documentation/filesystems/nfs/idmapper.txt
@@ -0,0 +1,75 @@
+
+=========
+ID Mapper
+=========
+Id mapper is used by NFS to translate user and group ids into names, and to
+translate user and group names into ids. Part of this translation involves
+performing an upcall to userspace to request the information. There are two
+ways NFS could obtain this information: placing a call to /sbin/request-key
+or by placing a call to the rpc.idmap daemon.
+
+NFS will attempt to call /sbin/request-key first. If this succeeds, the
+result will be cached using the generic request-key cache. This call should
+only fail if /etc/request-key.conf is not configured for the id_resolver key
+type, see the "Configuring" section below if you wish to use the request-key
+method.
+
+If the call to /sbin/request-key fails (if /etc/request-key.conf is not
+configured with the id_resolver key type), then the idmapper will ask the
+legacy rpc.idmap daemon for the id mapping. This result will be stored
+in a custom NFS idmap cache.
+
+
+===========
+Configuring
+===========
+The file /etc/request-key.conf will need to be modified so /sbin/request-key can
+direct the upcall. The following line should be added:
+
+#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...
+#====== ======= =============== =============== ===============================
+create id_resolver * * /usr/sbin/nfs.idmap %k %d 600
+
+This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
+The last parameter, 600, defines how many seconds into the future the key will
+expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout
+is not specified, nfs.idmap will default to 600 seconds.
+
+id mapper uses four key descriptions:
+ uid: Find the UID for the given user
+ gid: Find the GID for the given group
+ user: Find the user name for the given UID
+ group: Find the group name for the given GID
+
+You can handle any of these individually, rather than using the generic upcall
+program. If you would like to use your own program for a uid lookup then you
+would edit your request-key.conf so it looks similar to this:
+
+#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...
+#====== ======= =============== =============== ===============================
+create id_resolver uid:* * /some/other/program %k %d 600
+create id_resolver * * /usr/sbin/nfs.idmap %k %d 600
+
+Notice that the new line was added above the line for the generic program.
+request-key will find the first matching line and corresponding program. In
+this case, /some/other/program will handle all uid lookups and
+/usr/sbin/nfs.idmap will handle gid, user, and group lookups.
+
+See <file:Documentation/security/keys-request-key.txt> for more information
+about the request-key function.
+
+
+=========
+nfs.idmap
+=========
+nfs.idmap is designed to be called by request-key, and should not be run "by
+hand". This program takes two arguments, a serialized key and a key
+description. The serialized key is first converted into a key_serial_t, and
+then passed as an argument to keyctl_instantiate (both are part of keyutils.h).
+
+The actual lookups are performed by functions found in nfsidmap.h. nfs.idmap
+determines the correct function to call by looking at the first part of the
+description string. For example, a uid lookup description will appear as
+"uid:user@domain".
+
+nfs.idmap will return 0 if the key was instantiated, and non-zero otherwise.
diff --git a/Documentation/filesystems/nfs/knfsd-stats.txt b/Documentation/filesystems/nfs/knfsd-stats.txt
new file mode 100644
index 00000000000..64ced5149d3
--- /dev/null
+++ b/Documentation/filesystems/nfs/knfsd-stats.txt
@@ -0,0 +1,159 @@
+
+Kernel NFS Server Statistics
+============================
+
+This document describes the format and semantics of the statistics
+which the kernel NFS server makes available to userspace. These
+statistics are available in several text form pseudo files, each of
+which is described separately below.
+
+In most cases you don't need to know these formats, as the nfsstat(8)
+program from the nfs-utils distribution provides a helpful command-line
+interface for extracting and printing them.
+
+All the files described here are formatted as a sequence of text lines,
+separated by newline '\n' characters. Lines beginning with a hash
+'#' character are comments intended for humans and should be ignored
+by parsing routines. All other lines contain a sequence of fields
+separated by whitespace.
+
+/proc/fs/nfsd/pool_stats
+------------------------
+
+This file is available in kernels from 2.6.30 onwards, if the
+/proc/fs/nfsd filesystem is mounted (it almost always should be).
+
+The first line is a comment which describes the fields present in
+all the other lines. The other lines present the following data as
+a sequence of unsigned decimal numeric fields. One line is shown
+for each NFS thread pool.
+
+All counters are 64 bits wide and wrap naturally. There is no way
+to zero these counters; instead, applications should do their own
+rate conversion.
+
+pool
+ The id number of the NFS thread pool to which this line applies.
+ This number does not change.
+
+ Thread pool ids are a contiguous set of small integers starting
+ at zero. The maximum value depends on the thread pool mode, but
+ currently cannot be larger than the number of CPUs in the system.
+ Note that in the default case there will be a single thread pool
+ which contains all the nfsd threads and all the CPUs in the system,
+ and thus this file will have a single line with a pool id of "0".
+
+packets-arrived
+ Counts how many NFS packets have arrived. More precisely, this
+ is the number of times that the network stack has notified the
+ sunrpc server layer that new data may be available on a transport
+ (e.g. an NFS or UDP socket or an NFS/RDMA endpoint).
+
+ Depending on the NFS workload patterns and various network stack
+ effects (such as Large Receive Offload) which can combine packets
+ on the wire, this may be either more or less than the number
+ of NFS calls received (which statistic is available elsewhere).
+ However this is a more accurate and less workload-dependent measure
+ of how much CPU load is being placed on the sunrpc server layer
+ due to NFS network traffic.
+
+sockets-enqueued
+ Counts how many times an NFS transport is enqueued to wait for
+ an nfsd thread to service it, i.e. no nfsd thread was considered
+ available.
+
+ The circumstance this statistic tracks indicates that there was NFS
+ network-facing work to be done but it couldn't be done immediately,
+ thus introducing a small delay in servicing NFS calls. The ideal
+ rate of change for this counter is zero; significantly non-zero
+ values may indicate a performance limitation.
+
+ This can happen either because there are too few nfsd threads in the
+ thread pool for the NFS workload (the workload is thread-limited),
+ or because the NFS workload needs more CPU time than is available in
+ the thread pool (the workload is CPU-limited). In the former case,
+ configuring more nfsd threads will probably improve the performance
+ of the NFS workload. In the latter case, the sunrpc server layer is
+ already choosing not to wake idle nfsd threads because there are too
+ many nfsd threads which want to run but cannot, so configuring more
+ nfsd threads will make no difference whatsoever. The overloads-avoided
+ statistic (see below) can be used to distinguish these cases.
+
+threads-woken
+ Counts how many times an idle nfsd thread is woken to try to
+ receive some data from an NFS transport.
+
+ This statistic tracks the circumstance where incoming
+ network-facing NFS work is being handled quickly, which is a good
+ thing. The ideal rate of change for this counter will be close
+ to but less than the rate of change of the packets-arrived counter.
+
+overloads-avoided
+ Counts how many times the sunrpc server layer chose not to wake an
+ nfsd thread, despite the presence of idle nfsd threads, because
+ too many nfsd threads had been recently woken but could not get
+ enough CPU time to actually run.
+
+ This statistic counts a circumstance where the sunrpc layer
+ heuristically avoids overloading the CPU scheduler with too many
+ runnable nfsd threads. The ideal rate of change for this counter
+ is zero. Significant non-zero values indicate that the workload
+ is CPU limited. Usually this is associated with heavy CPU usage
+ on all the CPUs in the nfsd thread pool.
+
+ If a sustained large overloads-avoided rate is detected on a pool,
+ the top(1) utility should be used to check for the following
+ pattern of CPU usage on all the CPUs associated with the given
+ nfsd thread pool.
+
+ - %us ~= 0 (as you're *NOT* running applications on your NFS server)
+
+ - %wa ~= 0
+
+ - %id ~= 0
+
+ - %sy + %hi + %si ~= 100
+
+ If this pattern is seen, configuring more nfsd threads will *not*
+ improve the performance of the workload. If this pattern is not
+ seen, then something more subtle is wrong.
+
+threads-timedout
+ Counts how many times an nfsd thread triggered an idle timeout,
+ i.e. was not woken to handle any incoming network packets for
+ some time.
+
+ This statistic counts a circumstance where there are more nfsd
+ threads configured than can be used by the NFS workload. This is
+ a clue that the number of nfsd threads can be reduced without
+ affecting performance. Unfortunately, it's only a clue and not
+ a strong indication, for a couple of reasons:
+
+ - Currently the rate at which the counter is incremented is quite
+ slow; the idle timeout is 60 minutes. Unless the NFS workload
+ remains constant for hours at a time, this counter is unlikely
+ to be providing information that is still useful.
+
+ - It is usually a wise policy to provide some slack,
+ i.e. configure a few more nfsds than are currently needed,
+ to allow for future spikes in load.
+
+
+Note that incoming packets on NFS transports will be dealt with in
+one of three ways. An nfsd thread can be woken (threads-woken counts
+this case), or the transport can be enqueued for later attention
+(sockets-enqueued counts this case), or the packet can be temporarily
+deferred because the transport is currently being used by an nfsd
+thread. This last case is not very interesting and is not explicitly
+counted, but can be inferred from the other counters thus:
+
+packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken )
+
+
+More
+----
+Descriptions of the other statistics files should go here.
+
+
+Greg Banks <gnb@sgi.com>
+26 Mar 2009
diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
new file mode 100644
index 00000000000..e386f7e4bce
--- /dev/null
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
@@ -0,0 +1,271 @@
+################################################################################
+# #
+# NFS/RDMA README #
+# #
+################################################################################
+
+ Author: NetApp and Open Grid Computing
+ Date: May 29, 2008
+
+Table of Contents
+~~~~~~~~~~~~~~~~~
+ - Overview
+ - Getting Help
+ - Installation
+ - Check RDMA and NFS Setup
+ - NFS/RDMA Setup
+
+Overview
+~~~~~~~~
+
+ This document describes how to install and setup the Linux NFS/RDMA client
+ and server software.
+
+ The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server
+ was first included in the following release, Linux 2.6.25.
+
+ In our testing, we have obtained excellent performance results (full 10Gbit
+ wire bandwidth at minimal client CPU) under many workloads. The code passes
+ the full Connectathon test suite and operates over both Infiniband and iWARP
+ RDMA adapters.
+
+Getting Help
+~~~~~~~~~~~~
+
+ If you get stuck, you can ask questions on the
+
+ nfs-rdma-devel@lists.sourceforge.net
+
+ mailing list.
+
+Installation
+~~~~~~~~~~~~
+
+ These instructions are a step by step guide to building a machine for
+ use with NFS/RDMA.
+
+ - Install an RDMA device
+
+ Any device supported by the drivers in drivers/infiniband/hw is acceptable.
+
+ Testing has been performed using several Mellanox-based IB cards, the
+ Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter.
+
+ - Install a Linux distribution and tools
+
+ The first kernel release to contain both the NFS/RDMA client and server was
+ Linux 2.6.25. Therefore, a distribution compatible with this and subsequent
+ Linux kernel releases should be installed.
+
+ The procedures described in this document have been tested with
+ distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
+
+ - Install nfs-utils-1.1.2 or greater on the client
+
+ An NFS/RDMA mount point can be obtained by using the mount.nfs command in
+ nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
+ version with support for NFS/RDMA mounts, but for various reasons we
+ recommend using nfs-utils-1.1.2 or greater). To see which version of
+ mount.nfs you are using, type:
+
+ $ /sbin/mount.nfs -V
+
+ If the version is less than 1.1.2 or the command does not exist,
+ you should install the latest version of nfs-utils.
+
+ Download the latest package from:
+
+ http://www.kernel.org/pub/linux/utils/nfs
+
+ Uncompress the package and follow the installation instructions.
+
+ If you will not need the idmapper and gssd executables (you do not need
+ these to create an NFS/RDMA enabled mount command), the installation
+ process can be simplified by disabling these features when running
+ configure:
+
+ $ ./configure --disable-gss --disable-nfsv4
+
+ To build nfs-utils you will need the tcp_wrappers package installed. For
+ more information on this see the package's README and INSTALL files.
+
+ After building the nfs-utils package, there will be a mount.nfs binary in
+ the utils/mount directory. This binary can be used to initiate NFS v2, v3,
+ or v4 mounts. To initiate a v4 mount, the binary must be called
+ mount.nfs4. The standard technique is to create a symlink called
+ mount.nfs4 to mount.nfs.
+
+ This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
+
+ $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
+
+ In this location, mount.nfs will be invoked automatically for NFS mounts
+ by the system mount command.
+
+ NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
+ on the NFS client machine. You do not need this specific version of
+ nfs-utils on the server. Furthermore, only the mount.nfs command from
+ nfs-utils-1.1.2 is needed on the client.
+
+ - Install a Linux kernel with NFS/RDMA
+
+ The NFS/RDMA client and server are both included in the mainline Linux
+ kernel version 2.6.25 and later. This and other versions of the 2.6 Linux
+ kernel can be found at:
+
+ ftp://ftp.kernel.org/pub/linux/kernel/v2.6/
+
+ Download the sources and place them in an appropriate location.
+
+ - Configure the RDMA stack
+
+ Make sure your kernel configuration has RDMA support enabled. Under
+ Device Drivers -> InfiniBand support, update the kernel configuration
+ to enable InfiniBand support [NOTE: the option name is misleading. Enabling
+ InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)].
+
+ Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or
+ iWARP adapter support (amso, cxgb3, etc.).
+
+ If you are using InfiniBand, be sure to enable IP-over-InfiniBand support.
+
+ - Configure the NFS client and server
+
+ Your kernel configuration must also have NFS file system support and/or
+ NFS server support enabled. These and other NFS related configuration
+ options can be found under File Systems -> Network File Systems.
+
+ - Build, install, reboot
+
+ The NFS/RDMA code will be enabled automatically if NFS and RDMA
+ are turned on. The NFS/RDMA client and server are configured via the hidden
+ SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
+ value of SUNRPC_XPRT_RDMA will be:
+
+ - N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
+ and server will not be built
+ - M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M,
+ in this case the NFS/RDMA client and server will be built as modules
+ - Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client
+ and server will be built into the kernel
+
+ Therefore, if you have followed the steps above and turned on NFS and RDMA,
+ the NFS/RDMA client and server will be built.
+
+ Build a new kernel, install it, boot it.
+
+Check RDMA and NFS Setup
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Before configuring the NFS/RDMA software, it is a good idea to test
+ your new kernel to ensure that the kernel is working correctly.
+ In particular, it is a good idea to verify that the RDMA stack
+ is functioning as expected and standard NFS over TCP/IP and/or UDP/IP
+ is working properly.
+
+ - Check RDMA Setup
+
+ If you built the RDMA components as modules, load them at
+ this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
+ card:
+
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+
+ If you are using InfiniBand, make sure there is a Subnet Manager (SM)
+ running on the network. If your IB switch has an embedded SM, you can
+ use it. Otherwise, you will need to run an SM, such as OpenSM, on one
+ of your end nodes.
+
+ If an SM is running on your network, you should see the following:
+
+ $ cat /sys/class/infiniband/driverX/ports/1/state
+ 4: ACTIVE
+
+ where driverX is mthca0, ipath5, ehca3, etc.
+
+ To further test the InfiniBand software stack, use IPoIB (this
+ assumes you have two IB hosts named host1 and host2):
+
+ host1$ ifconfig ib0 a.b.c.x
+ host2$ ifconfig ib0 a.b.c.y
+ host1$ ping a.b.c.y
+ host2$ ping a.b.c.x
+
+ For other device types, follow the appropriate procedures.
+
+ - Check NFS Setup
+
+ For the NFS components enabled above (client and/or server),
+ test their functionality over standard Ethernet using TCP/IP or UDP/IP.
+
+NFS/RDMA Setup
+~~~~~~~~~~~~~~
+
+ We recommend that you use two machines, one to act as the client and
+ one to act as the server.
+
+ One time configuration:
+
+ - On the server system, configure the /etc/exports file and
+ start the NFS/RDMA server.
+
+ Exports entries with the following formats have been tested:
+
+ /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
+ /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
+
+ The IP address(es) is(are) the client's IPoIB address for an InfiniBand
+ HCA or the client's iWARP address(es) for an RNIC.
+
+ NOTE: The "insecure" option must be used because the NFS/RDMA client does
+ not use a reserved port.
+
+ Each time a machine boots:
+
+ - Load and configure the RDMA drivers
+
+ For InfiniBand using a Mellanox adapter:
+
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+ $ ifconfig ib0 a.b.c.d
+
+ NOTE: use unique addresses for the client and server
+
+ - Start the NFS server
+
+ If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA transport module:
+
+ $ modprobe svcrdma
+
+ Regardless of how the server was built (module or built-in), start the
+ server:
+
+ $ /etc/init.d/nfs start
+
+ or
+
+ $ service nfs start
+
+ Instruct the server to listen on the RDMA transport:
+
+ $ echo rdma 20049 > /proc/fs/nfsd/portlist
+
+ - On the client system
+
+ If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA client module:
+
+ $ modprobe xprtrdma
+
+ Regardless of how the client was built (module or built-in), use this
+ command to mount the NFS/RDMA server:
+
+ $ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt
+
+ To verify that the mount is using RDMA, run "cat /proc/mounts" and check
+ the "proto" field for the given mount.
+
+ Congratulations! You're using NFS/RDMA!
diff --git a/Documentation/filesystems/nfs/nfs.txt b/Documentation/filesystems/nfs/nfs.txt
new file mode 100644
index 00000000000..f50f26ce6cd
--- /dev/null
+++ b/Documentation/filesystems/nfs/nfs.txt
@@ -0,0 +1,98 @@
+
+The NFS client
+==============
+
+The NFS version 2 protocol was first documented in RFC1094 (March 1989).
+Since then two more major releases of NFS have been published, with NFSv3
+being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April
+2003).
+
+The Linux NFS client currently supports all the above published versions,
+and work is in progress on adding support for minor version 1 of the NFSv4
+protocol.
+
+The purpose of this document is to provide information on some of the
+upcall interfaces that are used in order to provide the NFS client with
+some of the information that it requires in order to fully comply with
+the NFS spec.
+
+The DNS resolver
+================
+
+NFSv4 allows for one server to refer the NFS client to data that has been
+migrated onto another server by means of the special "fs_locations"
+attribute. See
+ http://tools.ietf.org/html/rfc3530#section-6
+and
+ http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
+
+The fs_locations information can take the form of either an ip address and
+a path, or a DNS hostname and a path. The latter requires the NFS client to
+do a DNS lookup in order to mount the new volume, and hence the need for an
+upcall to allow userland to provide this service.
+
+Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
+/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps:
+
+ (1) The process checks the dns_resolve cache to see if it contains a
+ valid entry. If so, it returns that entry and exits.
+
+ (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
+ (may be changed using the 'nfs.cache_getent' kernel boot parameter)
+ is run, with two arguments:
+ - the cache name, "dns_resolve"
+ - the hostname to resolve
+
+ (3) After looking up the corresponding ip address, the helper script
+ writes the result into the rpc_pipefs pseudo-file
+ '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel'
+ in the following (text) format:
+
+ "<ip address> <hostname> <ttl>\n"
+
+ Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6
+ (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format.
+ <hostname> is identical to the second argument of the helper
+ script, and <ttl> is the 'time to live' of this cache entry (in
+ units of seconds).
+
+ Note: If <ip address> is invalid, say the string "0", then a negative
+ entry is created, which will cause the kernel to treat the hostname
+ as having no valid DNS translation.
+
+
+
+
+A basic sample /sbin/nfs_cache_getent
+=====================================
+
+#!/bin/bash
+#
+ttl=600
+#
+cut=/usr/bin/cut
+getent=/usr/bin/getent
+rpc_pipefs=/var/lib/nfs/rpc_pipefs
+#
+die()
+{
+ echo "Usage: $0 cache_name entry_name"
+ exit 1
+}
+
+[ $# -lt 2 ] && die
+cachename="$1"
+cache_path=${rpc_pipefs}/cache/${cachename}/channel
+
+case "${cachename}" in
+ dns_resolve)
+ name="$2"
+ result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
+ [ -z "${result}" ] && result="0"
+ ;;
+ *)
+ die
+ ;;
+esac
+echo "${result} ${name} ${ttl}" >${cache_path}
+
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
new file mode 100644
index 00000000000..092fad92a3f
--- /dev/null
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -0,0 +1,208 @@
+NFSv4.1 Server Implementation
+
+Server support for minorversion 1 can be controlled using the
+/proc/fs/nfsd/versions control file. The string output returned
+by reading this file will contain either "+4.1" or "-4.1"
+correspondingly.
+
+Currently, server support for minorversion 1 is disabled by default.
+It can be enabled at run time by writing the string "+4.1" to
+the /proc/fs/nfsd/versions control file. Note that to write this
+control file, the nfsd service must be taken down. Use your user-mode
+nfs-utils to set this up; see rpc.nfsd(8)
+
+(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and
+"-4", respectively. Therefore, code meant to work on both new and old
+kernels must turn 4.1 on or off *before* turning support for version 4
+on or off; rpc.nfsd does this correctly.)
+
+The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
+on RFC 5661.
+
+From the many new features in NFSv4.1 the current implementation
+focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
+"exactly once" semantics and better control and throttling of the
+resources allocated for each client.
+
+Other NFSv4.1 features, Parallel NFS operations in particular,
+are still under development out of tree.
+See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
+for more information.
+
+The current implementation is intended for developers only: while it
+does support ordinary file operations on clients we have tested against
+(including the linux client), it is incomplete in ways which may limit
+features unexpectedly, cause known bugs in rare cases, or cause
+interoperability problems with future clients. Known issues:
+
+ - gss support is questionable: currently mounts with kerberos
+ from a linux client are possible, but we aren't really
+ conformant with the spec (for example, we don't use kerberos
+ on the backchannel correctly).
+ - Incomplete backchannel support: incomplete backchannel gss
+ support and no support for BACKCHANNEL_CTL mean that
+ callbacks (hence delegations and layouts) may not be
+ available and clients confused by the incomplete
+ implementation may fail.
+ - We do not support SSV, which provides security for shared
+ client-server state (thus preventing unauthorized tampering
+ with locks and opens, for example). It is mandatory for
+ servers to support this, though no clients use it yet.
+ - Mandatory operations which we do not support, such as
+ DESTROY_CLIENTID, are not currently used by clients, but will be
+ (and the spec recommends their use in common cases), and
+ clients should not be expected to know how to recover from the
+ case where they are not supported. This will eventually cause
+ interoperability failures.
+
+In addition, some limitations are inherited from the current NFSv4
+implementation:
+
+ - Incomplete delegation enforcement: if a file is renamed or
+ unlinked by a local process, a client holding a delegation may
+ continue to indefinitely allow opens of the file under the old
+ name.
+
+The table below, taken from the NFSv4.1 document, lists
+the operations that are mandatory to implement (REQ), optional
+(OPT), and NFSv4.0 operations that MUST NOT be implemented (MNI)
+in minor version 1. The first column indicates the operations that
+are not supported yet by the linux server implementation.
+
+The OPTIONAL features identified and their abbreviations are as follows:
+ pNFS Parallel NFS
+ FDELG File Delegations
+ DDELG Directory Delegations
+
+The following abbreviations indicate the linux server implementation status.
+ I Implemented NFSv4.1 operations.
+ NS Not Supported.
+ NS* unimplemented optional feature.
+ P pNFS features implemented out of tree.
+ PNS pNFS features that are not supported yet (out of tree).
+
+Operations
+
+ +----------------------+------------+--------------+----------------+
+ | Operation | REQ, REC, | Feature | Definition |
+ | | OPT, or | (REQ, REC, | |
+ | | MNI | or OPT) | |
+ +----------------------+------------+--------------+----------------+
+ | ACCESS | REQ | | Section 18.1 |
+NS | BACKCHANNEL_CTL | REQ | | Section 18.33 |
+I | BIND_CONN_TO_SESSION | REQ | | Section 18.34 |
+ | CLOSE | REQ | | Section 18.2 |
+ | COMMIT | REQ | | Section 18.3 |
+ | CREATE | REQ | | Section 18.4 |
+I | CREATE_SESSION | REQ | | Section 18.36 |
+NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 |
+ | DELEGRETURN | OPT | FDELG, | Section 18.6 |
+ | | | DDELG, pNFS | |
+ | | | (REQ) | |
+NS | DESTROY_CLIENTID | REQ | | Section 18.50 |
+I | DESTROY_SESSION | REQ | | Section 18.37 |
+I | EXCHANGE_ID | REQ | | Section 18.35 |
+I | FREE_STATEID | REQ | | Section 18.38 |
+ | GETATTR | REQ | | Section 18.7 |
+P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
+P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
+ | GETFH | REQ | | Section 18.8 |
+NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 |
+P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
+P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
+P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
+ | LINK | OPT | | Section 18.9 |
+ | LOCK | REQ | | Section 18.10 |
+ | LOCKT | REQ | | Section 18.11 |
+ | LOCKU | REQ | | Section 18.12 |
+ | LOOKUP | REQ | | Section 18.13 |
+ | LOOKUPP | REQ | | Section 18.14 |
+ | NVERIFY | REQ | | Section 18.15 |
+ | OPEN | REQ | | Section 18.16 |
+NS*| OPENATTR | OPT | | Section 18.17 |
+ | OPEN_CONFIRM | MNI | | N/A |
+ | OPEN_DOWNGRADE | REQ | | Section 18.18 |
+ | PUTFH | REQ | | Section 18.19 |
+ | PUTPUBFH | REQ | | Section 18.20 |
+ | PUTROOTFH | REQ | | Section 18.21 |
+ | READ | REQ | | Section 18.22 |
+ | READDIR | REQ | | Section 18.23 |
+ | READLINK | OPT | | Section 18.24 |
+ | RECLAIM_COMPLETE | REQ | | Section 18.51 |
+ | RELEASE_LOCKOWNER | MNI | | N/A |
+ | REMOVE | REQ | | Section 18.25 |
+ | RENAME | REQ | | Section 18.26 |
+ | RENEW | MNI | | N/A |
+ | RESTOREFH | REQ | | Section 18.27 |
+ | SAVEFH | REQ | | Section 18.28 |
+ | SECINFO | REQ | | Section 18.29 |
+I | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, |
+ | | | layout (REQ) | Section 13.12 |
+I | SEQUENCE | REQ | | Section 18.46 |
+ | SETATTR | REQ | | Section 18.30 |
+ | SETCLIENTID | MNI | | N/A |
+ | SETCLIENTID_CONFIRM | MNI | | N/A |
+NS | SET_SSV | REQ | | Section 18.47 |
+I | TEST_STATEID | REQ | | Section 18.48 |
+ | VERIFY | REQ | | Section 18.31 |
+NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 |
+ | WRITE | REQ | | Section 18.32 |
+
+Callback Operations
+
+ +-------------------------+-----------+-------------+---------------+
+ | Operation | REQ, REC, | Feature | Definition |
+ | | OPT, or | (REQ, REC, | |
+ | | MNI | or OPT) | |
+ +-------------------------+-----------+-------------+---------------+
+ | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 |
+P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
+NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 |
+P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
+NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 |
+NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 |
+ | CB_RECALL | OPT | FDELG, | Section 20.2 |
+ | | | DDELG, pNFS | |
+ | | | (REQ) | |
+NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 |
+ | | | DDELG, pNFS | |
+ | | | (REQ) | |
+NS | CB_RECALL_SLOT | REQ | | Section 20.8 |
+NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 |
+ | | | (REQ) | |
+I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 |
+ | | | DDELG, pNFS | |
+ | | | (REQ) | |
+NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
+ | | | DDELG, pNFS | |
+ | | | (REQ) | |
+ +-------------------------+-----------+-------------+---------------+
+
+Implementation notes:
+
+DELEGPURGE:
+* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or
+ CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that
+ persist across client reboots). Thus we need not implement this for
+ now.
+
+EXCHANGE_ID:
+* only SP4_NONE state protection supported
+* implementation ids are ignored
+
+CREATE_SESSION:
+* backchannel attributes are ignored
+* backchannel security parameters are ignored
+
+SEQUENCE:
+* no support for dynamic slot table renegotiation (optional)
+
+Nonstandard compound limitations:
+* No support for a sessions fore channel RPC compound that requires both a
+ ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
+ fail to live up to the promise we made in CREATE_SESSION fore channel
+ negotiation.
+* No more than one IO operation (read, write, readdir) allowed per
+ compound.
+
+See also http://wiki.linux-nfs.org/wiki/index.php/Server_4.0_and_4.1_issues.
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
new file mode 100644
index 00000000000..ffdd9d866ad
--- /dev/null
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -0,0 +1,294 @@
+Mounting the root filesystem via NFS (nfsroot)
+===============================================
+
+Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
+Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
+Updated 2006 by Horms <horms@verge.net.au>
+
+
+
+In order to use a diskless system, such as an X-terminal or printer server
+for example, it is necessary for the root filesystem to be present on a
+non-disk device. This may be an initramfs (see Documentation/filesystems/
+ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/initrd.txt) or a
+filesystem mounted via NFS. The following text describes how to use NFS
+for the root filesystem. For the rest of this text 'client' means the
+diskless system, and 'server' means the NFS server.
+
+
+
+
+1.) Enabling nfsroot capabilities
+ -----------------------------
+
+In order to use nfsroot, NFS client support needs to be selected as
+built-in during configuration. Once this has been selected, the nfsroot
+option will become available, which should also be selected.
+
+In the networking options, kernel level autoconfiguration can be selected,
+along with the types of autoconfiguration to support. Selecting all of
+DHCP, BOOTP and RARP is safe.
+
+
+
+
+2.) Kernel command line
+ -------------------
+
+When the kernel has been loaded by a boot loader (see below) it needs to be
+told what root fs device to use. And in the case of nfsroot, where to find
+both the server and the name of the directory on the server to mount as root.
+This can be established using the following kernel command line parameters:
+
+
+root=/dev/nfs
+
+ This is necessary to enable the pseudo-NFS-device. Note that it's not a
+ real device but just a synonym to tell the kernel to use NFS instead of
+ a real device.
+
+
+nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+
+ If the `nfsroot' parameter is NOT given on the command line,
+ the default "/tftpboot/%s" will be used.
+
+ <server-ip> Specifies the IP address of the NFS server.
+ The default address is determined by the `ip' parameter
+ (see below). This parameter allows the use of different
+ servers for IP autoconfiguration and NFS.
+
+ <root-dir> Name of the directory on the server to mount as root.
+ If there is a "%s" token in the string, it will be
+ replaced by the ASCII-representation of the client's
+ IP address.
+
+ <nfs-options> Standard NFS options. All options are separated by commas.
+ The following defaults are used:
+ port = as given by server portmap daemon
+ rsize = 4096
+ wsize = 4096
+ timeo = 7
+ retrans = 3
+ acregmin = 3
+ acregmax = 60
+ acdirmin = 30
+ acdirmax = 60
+ flags = hard, nointr, noposix, cto, ac
+
+
+ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
+
+ This parameter tells the kernel how to configure IP addresses of devices
+ and also how to set up the IP routing table. It was originally called
+ `nfsaddrs', but now the boot-time IP configuration works independently of
+ NFS, so it was renamed to `ip' and the old name remained as an alias for
+ compatibility reasons.
+
+ If this parameter is missing from the kernel command line, all fields are
+ assumed to be empty, and the defaults mentioned below apply. In general
+ this means that the kernel tries to configure everything using
+ autoconfiguration.
+
+ The <autoconf> parameter can appear alone as the value to the `ip'
+ parameter (without all the ':' characters before). If the value is
+ "ip=off" or "ip=none", no autoconfiguration will take place, otherwise
+ autoconfiguration will take place. The most common way to use this
+ is "ip=dhcp".
+
+ <client-ip> IP address of the client.
+
+ Default: Determined using autoconfiguration.
+
+ <server-ip> IP address of the NFS server. If RARP is used to determine
+ the client address and this parameter is NOT empty only
+ replies from the specified server are accepted.
+
+ Only required for NFS root. That is autoconfiguration
+ will not be triggered if it is missing and NFS root is not
+ in operation.
+
+ Default: Determined using autoconfiguration.
+ The address of the autoconfiguration server is used.
+
+ <gw-ip> IP address of a gateway if the server is on a different subnet.
+
+ Default: Determined using autoconfiguration.
+
+ <netmask> Netmask for local network interface. If unspecified
+ the netmask is derived from the client IP address assuming
+ classful addressing.
+
+ Default: Determined using autoconfiguration.
+
+ <hostname> Name of the client. May be supplied by autoconfiguration,
+ but its absence will not trigger autoconfiguration.
+ If specified and DHCP is used, the user provided hostname will
+ be carried in the DHCP request to hopefully update DNS record.
+
+ Default: Client IP address is used in ASCII notation.
+
+ <device> Name of network device to use.
+
+ Default: If the host only has one device, it is used.
+ Otherwise the device is determined using
+ autoconfiguration. This is done by sending
+ autoconfiguration requests out of all devices,
+ and using the device that received the first reply.
+
+ <autoconf> Method to use for autoconfiguration. In the case of options
+ which specify multiple autoconfiguration protocols,
+ requests are sent using all protocols, and the first one
+ to reply is used.
+
+ Only autoconfiguration protocols that have been compiled
+ into the kernel will be used, regardless of the value of
+ this option.
+
+ off or none: don't use autoconfiguration
+ (do static IP assignment instead)
+ on or any: use any protocol available in the kernel
+ (default)
+ dhcp: use DHCP
+ bootp: use BOOTP
+ rarp: use RARP
+ both: use both BOOTP and RARP but not DHCP
+ (old option kept for backwards compatibility)
+
+ Default: any
+
+
+nfsrootdebug
+
+ This parameter enables debugging messages to appear in the kernel
+ log at boot time so that administrators can verify that the correct
+ NFS mount options, server address, and root path are passed to the
+ NFS client.
+
+
+rdinit=<executable file>
+
+ To specify which file contains the program that starts system
+ initialization, administrators can use this command line parameter.
+ The default value of this parameter is "/init". If the specified
+ file exists and the kernel can execute it, root filesystem related
+ kernel command line parameters, including `nfsroot=', are ignored.
+
+ A description of the process of mounting the root file system can be
+ found in:
+
+ Documentation/early-userspace/README
+
+
+
+
+3.) Boot Loader
+ ----------
+
+To get the kernel into memory different approaches can be used.
+They depend on various facilities being available:
+
+
+3.1) Booting from a floppy using syslinux
+
+ When building kernels, an easy way to create a boot floppy that uses
+ syslinux is to use the zdisk or bzdisk make targets which use zimage
+ and bzimage images respectively. Both targets accept the
+ FDARGS parameter which can be used to set the kernel command line.
+
+ e.g.
+ make bzdisk FDARGS="root=/dev/nfs"
+
+ Note that the user running this command will need to have
+ access to the floppy drive device, /dev/fd0
+
+ For more information on syslinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+ N.B: Previously it was possible to write a kernel directly to
+ a floppy using dd, configure the boot device using rdev, and
+ boot using the resulting floppy. Linux no longer supports this
+ method of booting.
+
+3.2) Booting from a cdrom using isolinux
+
+ When building kernels, an easy way to create a bootable cdrom that
+ uses isolinux is to use the isoimage target which uses a bzimage
+ image. Like zdisk and bzdisk, this target accepts the FDARGS
+ parameter which can be used to set the kernel command line.
+
+ e.g.
+ make isoimage FDARGS="root=/dev/nfs"
+
+ The resulting iso image will be arch/<ARCH>/boot/image.iso
+ This can be written to a cdrom using a variety of tools including
+ cdrecord.
+
+ e.g.
+ cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso
+
+ For more information on isolinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+3.2) Using LILO
+ When using LILO all the necessary command line parameters may be
+ specified using the 'append=' directive in the LILO configuration
+ file.
+
+ However, to use the 'root=' directive you also need to create
+ a dummy root device, which may be removed after LILO is run.
+
+ mknod /dev/boot255 c 0 255
+
+ For information on configuring LILO, please refer to its documentation.
+
+3.3) Using GRUB
+ When using GRUB, kernel parameters are simply appended after the kernel
+ specification: kernel <kernel> <parameters>
+
+3.4) Using loadlin
+ loadlin may be used to boot Linux from a DOS command prompt without
+ requiring a local hard disk to mount as root. This has not been
+ thoroughly tested by the authors of this document, but in general
+ it should be possible to configure the kernel command line similarly
+ to the configuration of LILO.
+
+ Please refer to the loadlin documentation for further information.
+
+3.5) Using a boot ROM
+ This is probably the most elegant way of booting a diskless client.
+ With a boot ROM the kernel is loaded using the TFTP protocol. The
+ authors of this document are not aware of any commercial boot
+ ROMs that support booting Linux over the network. However, there
+ are two free implementations of a boot ROM, netboot-nfs and
+ etherboot, both of which are available on sunsite.unc.edu, and both
+ of which contain everything you need to boot a diskless Linux client.
+
+3.6) Using pxelinux
+ Pxelinux may be used to boot linux using the PXE boot loader
+ which is present on many modern network cards.
+
+ When using pxelinux, the kernel image is specified using
+ "kernel <relative-path-below /tftpboot>". The nfsroot parameters
+ are passed to the kernel by adding them to the "append" line.
+ It is common to use serial console in conjunction with pxelinux,
+ see Documentation/serial-console.txt for more information.
+
+ For more information on pxelinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+
+
+
+4.) Credits
+ -------
+
+ The nfsroot code in the kernel and the RARP support have been written
+ by Gero Kuhlmann <gero@gkminix.han.de>.
+
+ The rest of the IP layer autoconfiguration code has been written
+ by Martin Mares <mj@atrey.karlin.mff.cuni.cz>.
+
+ In order to write the initial version of nfsroot I would like to thank
+ Jens-Uwe Mager <jum@anubis.han.de> for his help.
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
new file mode 100644
index 00000000000..983e14abe7e
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -0,0 +1,55 @@
+Reference counting in pnfs:
+==========================
+
+There are several inter-related caches. We have layouts which can
+reference multiple devices, each of which can reference multiple data servers.
+Each data server can be referenced by multiple devices. Each device
+can be referenced by multiple layouts. To keep all of this straight,
+we need to reference count.
+
+
+struct pnfs_layout_hdr
+----------------------
+The on-the-wire command LAYOUTGET corresponds to struct
+pnfs_layout_segment, usually referred to by the variable name lseg.
+Each nfs_inode may hold a pointer to a cache of these layout
+segments in nfsi->layout, of type struct pnfs_layout_hdr.
+
+We reference the header for the inode pointing to it, across each
+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN,
+LAYOUTCOMMIT), and for each lseg held within.
+
+Each header is also (when non-empty) put on a list associated with
+struct nfs_client (cl_layouts). Being put on this list does not bump
+the reference count, as the layout is kept around by the lseg that
+keeps it in the list.
+
+deviceid_cache
+--------------
+lsegs reference device ids, which are resolved per nfs_client and
+layout driver type. The device ids are held in a RCU cache (struct
+nfs4_deviceid_cache). The cache itself is referenced across each
+mount. The entries (struct nfs4_deviceid) themselves are held across
+the lifetime of each lseg referencing them.
+
+RCU is used because the deviceid is basically a write once, read many
+data structure. The hlist size of 32 buckets needs better
+justification, but seems reasonable given that we can have multiple
+deviceid's per filesystem, and multiple filesystems per nfs_client.
+
+The hash code is copied from the nfsd code base. A discussion of
+hashing and variations of this algorithm can be found at:
+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809
+
+data server cache
+-----------------
+file driver devices refer to data servers, which are kept in a module
+level cache. Its reference is held over the lifetime of the deviceid
+pointing to it.
+
+lseg
+----
+lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
+bit which holds it in the pnfs_layout_hdr's list. When the final lseg
+is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
+bit is set, preventing any new lsegs from being added.
diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
new file mode 100644
index 00000000000..ebcaaee2161
--- /dev/null
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
@@ -0,0 +1,202 @@
+ This document gives a brief introduction to the caching
+mechanisms in the sunrpc layer that is used, in particular,
+for NFS authentication.
+
+CACHES
+======
+The caching replaces the old exports table and allows for
+a wide variety of values to be cached.
+
+There are a number of caches that are similar in structure though
+quite possibly very different in content and use. There is a corpus
+of common code for managing these caches.
+
+Examples of caches that are likely to be needed are:
+ - mapping from IP address to client name
+ - mapping from client name and filesystem to export options
+ - mapping from UID to list of GIDs, to work around NFS's limitation
+ of 16 gids.
+ - mappings between local UID/GID and remote UID/GID for sites that
+ do not have uniform uid assignment
+ - mapping from network identity to public key for crypto authentication.
+
+The common code handles such things as:
+ - general cache lookup with correct locking
+ - supporting 'NEGATIVE' as well as positive entries
+ - allowing an EXPIRED time on cache items, and removing
+ items after they expire, and are no longer in-use.
+ - making requests to user-space to fill in cache entries
+ - allowing user-space to directly set entries in the cache
+ - delaying RPC requests that depend on as-yet incomplete
+ cache entries, and replaying those requests when the cache entry
+ is complete.
+ - clean out old entries as they expire.
+
+Creating a Cache
+----------------
+
+1/ A cache needs a datum to store. This is in the form of a
+ structure definition that must contain a
+ struct cache_head
+ as an element, usually the first.
+ It will also contain a key and some content.
+ Each cache element is reference counted and contains
+ expiry and update times for use in cache management.
+2/ A cache needs a "cache_detail" structure that
+ describes the cache. This stores the hash table, some
+ parameters for cache management, and some operations detailing how
+ to work with particular cache items.
+ The operations required are:
+ struct cache_head *alloc(void)
+ This simply allocates appropriate memory and returns
+ a pointer to the cache_head embedded within the
+ structure
+ void cache_put(struct kref *)
+ This is called when the last reference to an item is
+ dropped. The pointer passed is to the 'ref' field
+ in the cache_head. cache_put should release any
+ references created by 'cache_init' and, if CACHE_VALID
+ is set, any references created by cache_update.
+ It should then release the memory allocated by
+ 'alloc'.
+ int match(struct cache_head *orig, struct cache_head *new)
+ test if the keys in the two structures match. Return
+ 1 if they do, 0 if they don't.
+ void init(struct cache_head *orig, struct cache_head *new)
+ Set the 'key' fields in 'new' from 'orig'. This may
+ include taking references to shared objects.
+ void update(struct cache_head *orig, struct cache_head *new)
+ Set the 'content' fields in 'new' from 'orig'.
+ int cache_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+ Optional. Used to provide a /proc file that lists the
+ contents of a cache. This should show one item,
+ usually on just one line.
+ int cache_request(struct cache_detail *cd, struct cache_head *h,
+ char **bpp, int *blen)
+ Format a request to be sent to user-space for an item
+ to be instantiated. *bpp is a buffer of size *blen.
+ bpp should be moved forward over the encoded message,
+ and *blen should be reduced to show how much free
+ space remains. Return 0 on success or <0 if not
+ enough room or other problem.
+ int cache_parse(struct cache_detail *cd, char *buf, int len)
+ A message from user space has arrived to fill out a
+ cache entry. It is in 'buf' of length 'len'.
+ cache_parse should parse this, find the item in the
+ cache with sunrpc_cache_lookup, and update the item
+ with sunrpc_cache_update.
+
+
+3/ A cache needs to be registered using cache_register(). This
+ includes it on a list of caches that will be regularly
+ cleaned to discard old data.
+
+Using a cache
+-------------
+
+To find a value in a cache, call sunrpc_cache_lookup passing a pointer
+to the cache_head in a sample item with the 'key' fields filled in.
+This will be passed to ->match to identify the target entry. If no
+entry is found, a new entry will be created, added to the cache, and
+marked as not containing valid data.
+
+The item returned is typically passed to cache_check which will check
+if the data is valid, and may initiate an up-call to get fresh data.
+cache_check will return -ENOENT if the entry is negative or if an up
+call is needed but not possible, -EAGAIN if an upcall is pending,
+or 0 if the data is valid.
+
+cache_check can be passed a "struct cache_req *". This structure is
+typically embedded in the actual request and can be used to create a
+deferred copy of the request (struct cache_deferred_req). This is
+done when the found cache item is not uptodate, but there is reason to
+believe that userspace might provide information soon. When the cache
+item does become valid, the deferred copy of the request will be
+revisited (->revisit). It is expected that this method will
+reschedule the request for processing.
+
+The value returned by sunrpc_cache_lookup can also be passed to
+sunrpc_cache_update to set the content for the item. A second item is
+passed which should hold the content. If the item found by _lookup
+has valid data, then it is discarded and a new item is created. This
+saves any user of an item from worrying about content changing while
+it is being inspected. If the item found by _lookup does not contain
+valid data, then the content is copied across and CACHE_VALID is set.
+
+Populating a cache
+------------------
+
+Each cache has a name, and when the cache is registered, a directory
+with that name is created in /proc/net/rpc
+
+This directory contains a file called 'channel' which is a channel
+for communicating between kernel and user for populating the cache.
+This directory may later contain other files for interacting
+with the cache.
+
+The 'channel' works a bit like a datagram socket. Each 'write' is
+passed as a whole to the cache for parsing and interpretation.
+Each cache can treat the write requests differently, but it is
+expected that a message written will contain:
+ - a key
+ - an expiry time
+ - a content.
+with the intention that an item in the cache with the given key
+should be created or updated to have the given content, and the
+expiry time should be set on that item.
+
+Reading from a channel is a bit more interesting. When a cache
+lookup fails, or when it succeeds but finds an entry that may soon
+expire, a request is lodged for that cache item to be updated by
+user-space. These requests appear in the channel file.
+
+Successive reads will return successive requests.
+If there are no more requests to return, read will return EOF, but a
+select or poll for read will block waiting for another request to be
+added.
+
+Thus a user-space helper is likely to:
+ open the channel.
+ select for readable
+ read a request
+ write a response
+ loop.
+
+If it dies and needs to be restarted, any requests that have not been
+answered will still appear in the file and will be read by the new
+instance of the helper.
+
+Each cache should define a "cache_parse" method which takes a message
+written from user-space and processes it. It should return an error
+(which propagates back to the write syscall) or 0.
+
+Each cache should also define a "cache_request" method which
+takes a cache item and encodes a request into the buffer
+provided.
+
+Note: If a cache has no active readers on the channel, and has had no
+active readers for more than 60 seconds, further requests will not be
+added to the channel but instead all lookups that do not find a valid
+entry will fail. This is partly for backward compatibility: The
+previous nfs exports table was deemed to be authoritative and a
+failed lookup meant a definite 'no'.
+
+request/response format
+-----------------------
+
+While each cache is free to use its own format for requests
+and responses over channel, the following is recommended as
+appropriate and support routines are available to help:
+Each request or response record should be printable ASCII
+with precisely one newline character which should be at the end.
+Fields within the record should be separated by spaces, normally one.
+If spaces, newlines, or nul characters are needed in a field they
+must be quoted. Two mechanisms are available:
+1/ If a field begins '\x' then it must contain an even number of
+ hex digits, and pairs of these digits provide the bytes in the
+ field.
+2/ otherwise a \ in the field must be followed by 3 octal digits
+ which give the code for a byte. Other characters are treated
+ as themselves. At the very least, space, newline, nul, and
+ '\' must be quoted in this way.
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
new file mode 100644
index 00000000000..873a2ab2e9f
--- /dev/null
+++ b/Documentation/filesystems/nilfs2.txt
@@ -0,0 +1,208 @@
+NILFS2
+------
+
+NILFS2 is a log-structured file system (LFS) supporting continuous
+snapshotting. In addition to versioning capability of the entire file
+system, users can even restore files mistakenly overwritten or
+destroyed just a few seconds ago. Since NILFS2 can keep consistency
+like conventional LFS, it achieves quick recovery after system
+crashes.
+
+NILFS2 creates a number of checkpoints every few seconds or per
+synchronous write basis (unless there is no change). Users can select
+significant versions among continuously created checkpoints, and can
+change them into snapshots which will be preserved until they are
+changed back to checkpoints.
+
+There is no limit on the number of snapshots until the volume gets
+full. Each snapshot is mountable as a read-only file system
+concurrently with its writable mount, and this feature is convenient
+for online backup.
+
+The userland tools are included in nilfs-utils package, which is
+available from the following download page. At least "mkfs.nilfs2",
+"mount.nilfs2", "umount.nilfs2", and "nilfs_cleanerd" (so called
+cleaner or garbage collector) are required. Details on the tools are
+described in the man pages included in the package.
+
+Project web page: http://www.nilfs.org/en/
+Download page: http://www.nilfs.org/en/download.html
+Git tree web page: http://www.nilfs.org/git/
+List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
+
+Caveats
+=======
+
+Features which NILFS2 does not support yet:
+
+ - atime
+ - extended attributes
+ - POSIX ACLs
+ - quotas
+ - fsck
+ - defragmentation
+
+Mount options
+=============
+
+NILFS2 supports the following mount options:
+(*) == default
+
+barrier(*) This enables/disables the use of write barriers. This
+nobarrier requires an IO stack which can support barriers, and
+ if nilfs gets an error on a barrier write, it will
+ disable again with a warning.
+errors=continue Keep going on a filesystem error.
+errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+cp=n Specify the checkpoint-number of the snapshot to be
+ mounted. Checkpoints and snapshots are listed by lscp
+ user command. Only the checkpoints marked as snapshot
+ are mountable with this option. Snapshot is read-only,
+ so a read-only mount option must be specified together.
+order=relaxed(*) Apply relaxed order semantics that allows modified data
+ blocks to be written to disk without making a
+ checkpoint if no metadata update is going. This mode
+ is equivalent to the ordered data mode of the ext3
+ filesystem except for the updates on data blocks still
+ conserve atomicity. This will improve synchronous
+ write performance for overwriting.
+order=strict Apply strict in-order semantics that preserves sequence
+ of all file operations including overwriting of data
+ blocks. That means, it is guaranteed that no
+ overtaking of events occurs in the recovered file
+ system after a crash.
+norecovery Disable recovery of the filesystem on mount.
+ This disables every write access on the device for
+ read-only mounts or snapshots. This option will fail
+ for r/w mounts on an unclean volume.
+discard This enables/disables the use of discard/TRIM commands.
+nodiscard(*) The discard/TRIM commands are sent to the underlying
+ block device when blocks are freed. This is useful
+ for SSD devices and sparse/thinly-provisioned LUNs.
+
+NILFS2 usage
+============
+
+To use nilfs2 as a local file system, simply:
+
+ # mkfs -t nilfs2 /dev/block_device
+ # mount -t nilfs2 /dev/block_device /dir
+
+This will also invoke the cleaner through the mount helper program
+(mount.nilfs2).
+
+Checkpoints and snapshots are managed by the following commands.
+Their manpages are included in the nilfs-utils package above.
+
+ lscp list checkpoints or snapshots.
+ mkcp make a checkpoint or a snapshot.
+ chcp change an existing checkpoint to a snapshot or vice versa.
+ rmcp invalidate specified checkpoint(s).
+
+To mount a snapshot,
+
+ # mount -t nilfs2 -r -o cp=<cno> /dev/block_device /snap_dir
+
+where <cno> is the checkpoint number of the snapshot.
+
+To unmount the NILFS2 mount point or snapshot, simply:
+
+ # umount /dir
+
+Then, the cleaner daemon is automatically shut down by the umount
+helper program (umount.nilfs2).
+
+Disk format
+===========
+
+A nilfs2 volume is equally divided into a number of segments except
+for the super block (SB) and segment #0. A segment is the container
+of logs. Each log is composed of summary information blocks, payload
+blocks, and an optional super root block (SR):
+
+ ______________________________________________________
+ | |SB| | Segment | Segment | Segment | ... | Segment | |
+ |_|__|_|____0____|____1____|____2____|_____|____N____|_|
+ 0 +1K +4K +8M +16M +24M +(8MB x N)
+ . . (Typical offsets for 4KB-block)
+ . .
+ .______________________.
+ | log | log |... | log |
+ |__1__|__2__|____|__m__|
+ . .
+ . .
+ . .
+ .______________________________.
+ | Summary | Payload blocks |SR|
+ |_blocks__|_________________|__|
+
+The payload blocks are organized per file, and each file consists of
+data blocks and B-tree node blocks:
+
+ |<--- File-A --->|<--- File-B --->|
+ _______________________________________________________________
+ | Data blocks | B-tree blocks | Data blocks | B-tree blocks | ...
+ _|_____________|_______________|_____________|_______________|_
+
+
+Since only the modified blocks are written in the log, it may have
+files without data blocks or B-tree node blocks.
+
+The organization of the blocks is recorded in the summary information
+blocks, which contains a header structure (nilfs_segment_summary), per
+file structures (nilfs_finfo), and per block structures (nilfs_binfo):
+
+ _________________________________________________________________________
+ | Summary | finfo | binfo | ... | binfo | finfo | binfo | ... | binfo |...
+ |_blocks__|___A___|_(A,1)_|_____|(A,Na)_|___B___|_(B,1)_|_____|(B,Nb)_|___
+
+
+The logs include regular files, directory files, symbolic link files
+and several meta data files. The meta data files are the files used
+to maintain file system meta data. The current version of NILFS2 uses
+the following meta data files:
+
+ 1) Inode file (ifile) -- Stores on-disk inodes
+ 2) Checkpoint file (cpfile) -- Stores checkpoints
+ 3) Segment usage file (sufile) -- Stores allocation state of segments
+ 4) Data address translation file -- Maps virtual block numbers to usual
+ (DAT) block numbers. This file serves to
+ make on-disk blocks relocatable.
+
+The following figure shows a typical organization of the logs:
+
+ _________________________________________________________________________
+ | Summary | regular file | file | ... | ifile | cpfile | sufile | DAT |SR|
+ |_blocks__|_or_directory_|_______|_____|_______|________|________|_____|__|
+
+
+To stride over segment boundaries, this sequence of files may be split
+into multiple logs. The sequence of logs that should be treated as
+logically one log, is delimited with flags marked in the segment
+summary. The recovery code of nilfs2 looks at this boundary information
+to ensure atomicity of updates.
+
+The super root block is inserted for every checkpoint. It includes
+three special inodes, inodes for the DAT, cpfile, and sufile. Inodes
+of regular files, directories, symlinks and other special files, are
+included in the ifile. The inode of ifile itself is included in the
+corresponding checkpoint entry in the cpfile. Thus, the hierarchy
+among NILFS2 files can be depicted as follows:
+
+ Super block (SB)
+ |
+ v
+ Super root block (the latest cno=xx)
+ |-- DAT
+ |-- sufile
+ `-- cpfile
+ |-- ifile (cno=c1)
+ |-- ifile (cno=c2) ---- file (ino=i1)
+ : : |-- file (ino=i2)
+ `-- ifile (cno=xx) |-- file (ino=i3)
+ : :
+ `-- file (ino=yy)
+ ( regular file, directory, or symlink )
+
+For detail on the format of each file, please see include/linux/nilfs2_fs.h.
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
new file mode 100644
index 00000000000..791af8dac06
--- /dev/null
+++ b/Documentation/filesystems/ntfs.txt
@@ -0,0 +1,721 @@
+The Linux NTFS filesystem driver
+================================
+
+
+Table of contents
+=================
+
+- Overview
+- Web site
+- Features
+- Supported mount options
+- Known bugs and (mis-)features
+- Using NTFS volume and stripe sets
+ - The Device-Mapper driver
+ - The Software RAID / MD driver
+ - Limitations when using the MD driver
+- ChangeLog
+
+
+Overview
+========
+
+Linux-NTFS comes with a number of user-space programs known as ntfsprogs.
+These include mkntfs, a full-featured ntfs filesystem format utility,
+ntfsundelete used for recovering files that were unintentionally deleted
+from an NTFS volume and ntfsresize which is used to resize an NTFS partition.
+See the web site for more information.
+
+To mount an NTFS 1.2/3.x (Windows NT4/2000/XP/2003) volume, use the file
+system type 'ntfs'. The driver currently supports read-only mode (with no
+fault-tolerance, encryption or journalling) and very limited, but safe, write
+support.
+
+For fault tolerance and raid support (i.e. volume and stripe sets), you can
+use the kernel's Software RAID / MD driver. See section "Using NTFS volume
+and stripe sets" for details.
+
+
+Web site
+========
+
+There is plenty of additional information on the linux-ntfs web site
+at http://www.linux-ntfs.org/
+
+The web site has a lot of additional information, such as a comprehensive
+FAQ, documentation on the NTFS on-disk format, information on the Linux-NTFS
+userspace utilities, etc.
+
+
+Features
+========
+
+- This is a complete rewrite of the NTFS driver that used to be in the 2.4 and
+ earlier kernels. This new driver implements NTFS read support and is
+ functionally equivalent to the old ntfs driver and it also implements limited
+ write support. The biggest limitation at present is that files/directories
+ cannot be created or deleted. See below for the list of write features that
+ are so far supported. Another limitation is that writing to compressed files
+ is not implemented at all. Also, neither read nor write access to encrypted
+ files is so far implemented.
+- The new driver has full support for sparse files on NTFS 3.x volumes which
+ the old driver isn't happy with.
+- The new driver supports execution of binaries due to mmap() now being
+ supported.
+- The new driver supports loopback mounting of files on NTFS which is used by
+ some Linux distributions to enable the user to run Linux from an NTFS
+ partition by creating a large file while in Windows and then loopback
+ mounting the file while in Linux and creating a Linux filesystem on it that
+ is used to install Linux on it.
+- A comparison of the two drivers using:
+ time find . -type f -exec md5sum "{}" \;
+ run three times in sequence with each driver (after a reboot) on a 1.4GiB
+ NTFS partition, showed the new driver to be 20% faster in total time elapsed
+ (from 9:43 minutes on average down to 7:53). The time spent in user space
+ was unchanged but the time spent in the kernel was decreased by a factor of
+ 2.5 (from 85 CPU seconds down to 33).
+- The driver does not support short file names in general. For backwards
+ compatibility, we implement access to files using their short file names if
+ they exist. The driver will not create short file names however, and a
+ rename will discard any existing short file name.
+- The new driver supports exporting of mounted NTFS volumes via NFS.
+- The new driver supports async io (aio).
+- The new driver supports fsync(2), fdatasync(2), and msync(2).
+- The new driver supports readv(2) and writev(2).
+- The new driver supports access time updates (including mtime and ctime).
+- The new driver supports truncate(2) and open(2) with O_TRUNC. But at present
+ only very limited support for highly fragmented files, i.e. ones which have
+ their data attribute split across multiple extents, is included. Another
+ limitation is that at present truncate(2) will never create sparse files,
+ since to mark a file sparse we need to modify the directory entry for the
+ file and we do not implement directory modifications yet.
+- The new driver supports write(2) which can both overwrite existing data and
+ extend the file size so that you can write beyond the existing data. Also,
+ writing into sparse regions is supported and the holes are filled in with
+ clusters. But at present only limited support for highly fragmented files,
+ i.e. ones which have their data attribute split across multiple extents, is
+ included. Another limitation is that write(2) will never create sparse
+ files, since to mark a file sparse we need to modify the directory entry for
+ the file and we do not implement directory modifications yet.
+
+Supported mount options
+=======================
+
+In addition to the generic mount options described by the manual page for the
+mount command (man 8 mount, also see man 5 fstab), the NTFS driver supports the
+following mount options:
+
+iocharset=name Deprecated option. Still supported but please use
+ nls=name in the future. See description for nls=name.
+
+nls=name Character set to use when returning file names.
+ Unlike VFAT, NTFS suppresses names that contain
+ unconvertible characters. Note that most character
+ sets contain insufficient characters to represent all
+ possible Unicode characters that can exist on NTFS.
+ To be sure you are not missing any files, you are
+ advised to use nls=utf8 which is capable of
+ representing all Unicode characters.
+
+utf8=<bool> Option no longer supported. Currently mapped to
+ nls=utf8 but please use nls=utf8 in the future and
+ make sure utf8 is compiled either as module or into
+ the kernel. See description for nls=name.
+
+uid=
+gid=
+umask= Provide default owner, group, and access mode mask.
+ These options work as documented in mount(8). By
+ default, the files/directories are owned by root and
+ he/she has read and write permissions, as well as
+ browse permission for directories. No one else has any
+ access permissions. I.e. the mode on all files is by
+ default rw------- and for directories rwx------, a
+ consequence of the default fmask=0177 and dmask=0077.
+ Using a umask of zero will grant all permissions to
+ everyone, i.e. all files and directories will have mode
+ rwxrwxrwx.
+
+fmask=
+dmask= Instead of specifying umask which applies both to
+ files and directories, fmask applies only to files and
+ dmask only to directories.
+
+sloppy=<BOOL> If sloppy is specified, ignore unknown mount options.
+ Otherwise the default behaviour is to abort mount if
+ any unknown options are found.
+
+show_sys_files=<BOOL> If show_sys_files is specified, show the system files
+ in directory listings. Otherwise the default behaviour
+ is to hide the system files.
+ Note that even when show_sys_files is specified, "$MFT"
+ will not be visible due to bugs/mis-features in glibc.
+ Further, note that irrespective of show_sys_files, all
+ files are accessible by name, i.e. you can always do
+ "ls -l \$UpCase" for example to specifically show the
+ system file containing the Unicode upcase table.
+
+case_sensitive=<BOOL> If case_sensitive is specified, treat all file names as
+ case sensitive and create file names in the POSIX
+ namespace. Otherwise the default behaviour is to treat
+ file names as case insensitive and to create file names
+ in the WIN32/LONG name space. Note, the Linux NTFS
+ driver will never create short file names and will
+ remove them on rename/delete of the corresponding long
+ file name.
+ Note that files remain accessible via their short file
+ name, if it exists. If case_sensitive, you will need
+ to provide the correct case of the short file name.
+
+disable_sparse=<BOOL> If disable_sparse is specified, creation of sparse
+ regions, i.e. holes, inside files is disabled for the
+ volume (for the duration of this mount only). By
+ default, creation of sparse regions is enabled, which
+ is consistent with the behaviour of traditional Unix
+ filesystems.
+
+errors=opt What to do when critical filesystem errors are found.
+ Following values can be used for "opt":
+ continue: DEFAULT, try to clean-up as much as
+ possible, e.g. marking a corrupt inode as
+ bad so it is no longer accessed, and then
+ continue.
+ recover: At present only supported is recovery of
+ the boot sector from the backup copy.
+ If read-only mount, the recovery is done
+ in memory only and not written to disk.
+ Note that the options are additive, i.e. specifying:
+ errors=continue,errors=recover
+ means the driver will attempt to recover and if that
+ fails it will clean-up as much as possible and
+ continue.
+
+mft_zone_multiplier= Set the MFT zone multiplier for the volume (this
+ setting is not persistent across mounts and can be
+ changed from mount to mount but cannot be changed on
+ remount). Values of 1 to 4 are allowed, 1 being the
+ default. The MFT zone multiplier determines how much
+ space is reserved for the MFT on the volume. If all
+ other space is used up, then the MFT zone will be
+ shrunk dynamically, so this has no impact on the
+ amount of free space. However, it can have an impact
+ on performance by affecting fragmentation of the MFT.
+ In general use the default. If you have a lot of small
+ files then use a higher value. The values have the
+ following meaning:
+ Value MFT zone size (% of volume size)
+ 1 12.5%
+ 2 25%
+ 3 37.5%
+ 4 50%
+ Note this option is irrelevant for read-only mounts.
+
+
+Known bugs and (mis-)features
+=============================
+
+- The link count on each directory inode entry is set to 1, due to Linux not
+ supporting directory hard links. This may well confuse some user space
+ applications, since the directory names will have the same inode numbers.
+ This also speeds up ntfs_read_inode() immensely. And we haven't found any
+ problems with this approach so far. If you find a problem with this, please
+ let us know.
+
+
+Please send bug reports/comments/feedback/abuse to the Linux-NTFS development
+list at sourceforge: linux-ntfs-dev@lists.sourceforge.net
+
+
+Using NTFS volume and stripe sets
+=================================
+
+For support of volume and stripe sets, you can either use the kernel's
+Device-Mapper driver or the kernel's Software RAID / MD driver. The former is
+the recommended one to use for linear raid. But the latter is required for
+raid level 5. For striping and mirroring, either driver should work fine.
+
+
+The Device-Mapper driver
+------------------------
+
+You will need to create a table of the components of the volume/stripe set and
+how they fit together and load this into the kernel using the dmsetup utility
+(see man 8 dmsetup).
+
+Linear volume sets, i.e. linear raid, have been tested and work fine. Even
+though untested, there is no reason why stripe sets, i.e. raid level 0, and
+mirrors, i.e. raid level 1 should not work, too. Stripes with parity, i.e.
+raid level 5, unfortunately cannot work yet because the current version of the
+Device-Mapper driver does not support raid level 5. You may be able to use the
+Software RAID / MD driver for raid level 5, see the next section for details.
+
+To create the table describing your volume you will need to know each of its
+components and their sizes in sectors, i.e. multiples of 512-byte blocks.
+
+For NT4 fault tolerant volumes you can obtain the sizes using fdisk. So for
+example if one of your partitions is /dev/hda2 you would do:
+
+$ fdisk -ul /dev/hda
+
+Disk /dev/hda: 81.9 GB, 81964302336 bytes
+255 heads, 63 sectors/track, 9964 cylinders, total 160086528 sectors
+Units = sectors of 1 * 512 = 512 bytes
+
+ Device Boot Start End Blocks Id System
+ /dev/hda1 * 63 4209029 2104483+ 83 Linux
+ /dev/hda2 4209030 37768814 16779892+ 86 NTFS
+ /dev/hda3 37768815 46170809 4200997+ 83 Linux
+
+And you would know that /dev/hda2 has a size of 37768814 - 4209030 + 1 =
+33559785 sectors.
+
+For Win2k and later dynamic disks, you can for example use the ldminfo utility
+which is part of the Linux LDM tools (the latest version at the time of
+writing is linux-ldm-0.0.8.tar.bz2). You can download it from:
+ http://www.linux-ntfs.org/
+Simply extract the downloaded archive (tar xvjf linux-ldm-0.0.8.tar.bz2), go
+into it (cd linux-ldm-0.0.8) and change to the test directory (cd test). You
+will find the precompiled (i386) ldminfo utility there. NOTE: You will not be
+able to compile this yourself easily so use the binary version!
+
+Then you would use ldminfo in dump mode to obtain the necessary information:
+
+$ ./ldminfo --dump /dev/hda
+
+This would dump the LDM database found on /dev/hda which describes all of your
+dynamic disks and all the volumes on them. At the bottom you will see the
+VOLUME DEFINITIONS section which is all you really need. You may need to look
+further above to determine which of the disks in the volume definitions is
+which device in Linux. Hint: Run ldminfo on each of your dynamic disks and
+look at the Disk Id close to the top of the output for each (the PRIVATE HEADER
+section). You can then find these Disk Ids in the VBLK DATABASE section in the
+<Disk> components where you will get the LDM Name for the disk that is found in
+the VOLUME DEFINITIONS section.
+
+Note you will also need to enable the LDM driver in the Linux kernel. If your
+distribution did not enable it, you will need to recompile the kernel with it
+enabled. This will create the LDM partitions on each device at boot time. You
+would then use those devices (for /dev/hda they would be /dev/hda1, 2, 3, etc)
+in the Device-Mapper table.
+
+You can also bypass using the LDM driver by using the main device (e.g.
+/dev/hda) and then using the offsets of the LDM partitions into this device as
+the "Start sector of device" when creating the table. Once again ldminfo would
+give you the correct information to do this.
+
+Assuming you know all your devices and their sizes things are easy.
+
+For a linear raid the table would look like this (note all values are in
+512-byte sectors):
+
+--- cut here ---
+# Offset into Size of this Raid type Device Start sector
+# volume device of device
+0 1028161 linear /dev/hda1 0
+1028161 3903762 linear /dev/hdb2 0
+4931923 2103211 linear /dev/hdc1 0
+--- cut here ---
+
+For a striped volume, i.e. raid level 0, you will need to know the chunk size
+you used when creating the volume. Windows uses 64kiB as the default, so it
+will probably be this unless you changed the defaults when creating the array.
+
+For a raid level 0 the table would look like this (note all values are in
+512-byte sectors):
+
+--- cut here ---
+# Offset Size Raid Number Chunk 1st Start 2nd Start
+# into of the type of size Device in Device in
+# volume volume stripes device device
+0 2056320 striped 2 128 /dev/hda1 0 /dev/hdb1 0
+--- cut here ---
+
+If there are more than two devices, just add each of them to the end of the
+line.
+
+Finally, for a mirrored volume, i.e. raid level 1, the table would look like
+this (note all values are in 512-byte sectors):
+
+--- cut here ---
+# Ofs Size Raid Log Number Region Should Number Source Start Target Start
+# in of the type type of log size sync? of Device in Device in
+# vol volume params mirrors Device Device
+0 2056320 mirror core 2 16 nosync 2 /dev/hda1 0 /dev/hdb1 0
+--- cut here ---
+
+If you are mirroring to multiple devices you can specify further targets at the
+end of the line.
+
+Note the "Should sync?" parameter "nosync" means that the two mirrors are
+already in sync which will be the case on a clean shutdown of Windows. If the
+mirrors are not clean, you can specify the "sync" option instead of "nosync"
+and the Device-Mapper driver will then copy the entirety of the "Source Device"
+to the "Target Device" or if you specified multiple target devices to all of
+them.
+
+Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1),
+and hand it over to dmsetup to work with, like so:
+
+$ dmsetup create myvolume1 /etc/ntfsvolume1
+
+You can obviously replace "myvolume1" with whatever name you like.
+
+If it all worked, you will now have the device /dev/device-mapper/myvolume1
+which you can then just use as an argument to the mount command as usual to
+mount the ntfs volume. For example:
+
+$ mount -t ntfs -o ro /dev/device-mapper/myvolume1 /mnt/myvol1
+
+(You need to create the directory /mnt/myvol1 first and of course you can use
+anything you like instead of /mnt/myvol1 as long as it is an existing
+directory.)
+
+It is advisable to do the mount read-only to see if the volume has been set up
+correctly to avoid the possibility of causing damage to the data on the ntfs
+volume.
+
+
+The Software RAID / MD driver
+-----------------------------
+
+An alternative to using the Device-Mapper driver is to use the kernel's
+Software RAID / MD driver, for which you need to set up your /etc/raidtab
+appropriately (see man 5 raidtab).
+
+Linear volume sets, i.e. linear raid, as well as stripe sets, i.e. raid level
+0, have been tested and work fine (though see section "Limitations when using
+the Software RAID / MD driver" especially if you want to use linear raid).
+Even though untested, there is no reason why mirrors, i.e. raid level 1, and
+stripes with parity, i.e. raid level 5, should not work, too.
+
+You have to use the "persistent-superblock 0" option for each raid-disk in the
+NTFS volume/stripe you are configuring in /etc/raidtab as the persistent
+superblock used by the MD driver would damage the NTFS volume.
+
+Windows by default uses a stripe chunk size of 64k, so you probably want the
+"chunk-size 64k" option for each raid-disk, too.
+
+For example, if you have a stripe set consisting of two partitions /dev/hda5
+and /dev/hdb1 your /etc/raidtab would look like this:
+
+raiddev /dev/md0
+ raid-level 0
+ nr-raid-disks 2
+ nr-spare-disks 0
+ persistent-superblock 0
+ chunk-size 64k
+ device /dev/hda5
+ raid-disk 0
+ device /dev/hdb1
+ raid-disk 1
+
+For linear raid, just change the raid-level above to "raid-level linear", for
+mirrors, change it to "raid-level 1", and for stripe sets with parity, change
+it to "raid-level 5".
+
+Note for stripe sets with parity you will also need to tell the MD driver
+which parity algorithm to use by specifying the option "parity-algorithm
+which", where you need to replace "which" with the name of the algorithm to
+use (see man 5 raidtab for available algorithms) and you will have to try the
+different available algorithms until you find one that works. Make sure you
+are working read-only when playing with this as you may damage your data
+otherwise. If you find which algorithm works please let us know (email the
+linux-ntfs developers list linux-ntfs-dev@lists.sourceforge.net or drop in on
+IRC in channel #ntfs on the irc.freenode.net network) so we can update this
+documentation.
+
+Once the raidtab is set up, run for example raid0run -a to start all devices or
+raid0run /dev/md0 to start a particular md device, in this case /dev/md0.
+
+Then just use the mount command as usual to mount the ntfs volume using for
+example: mount -t ntfs -o ro /dev/md0 /mnt/myntfsvolume
+
+It is advisable to do the mount read-only to see if the md volume has been
+set up correctly to avoid the possibility of causing damage to the data on the
+ntfs volume.
+
+
+Limitations when using the Software RAID / MD driver
+-----------------------------------------------------
+
+Using the md driver will not work properly if any of your NTFS partitions have
+an odd number of sectors. This is especially important for linear raid as all
+data after the first partition with an odd number of sectors will be offset by
+one or more sectors so if you mount such a partition with write support you
+will cause massive damage to the data on the volume which will only become
+apparent when you try to use the volume again under Windows.
+
+So when using linear raid, make sure that all your partitions have an even
+number of sectors BEFORE attempting to use it. You have been warned!
+
+Even better is to simply use the Device-Mapper for linear raid and then you do
+not have this problem with odd numbers of sectors.
+
+
+ChangeLog
+=========
+
+Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
+
+2.1.30:
+ - Fix writev() (it kept writing the first segment over and over again
+ instead of moving onto subsequent segments).
+ - Fix crash in ntfs_mft_record_alloc() when mapping the new extent mft
+ record failed.
+2.1.29:
+ - Fix a deadlock when mounting read-write.
+2.1.28:
+ - Fix a deadlock.
+2.1.27:
+ - Implement page migration support so the kernel can move memory used
+ by NTFS files and directories around for management purposes.
+ - Add support for writing to sparse files created with Windows XP SP2.
+ - Many minor improvements and bug fixes.
+2.1.26:
+ - Implement support for sector sizes above 512 bytes (up to the maximum
+ supported by NTFS which is 4096 bytes).
+ - Enhance support for NTFS volumes which were supported by Windows but
+ not by Linux due to invalid attribute list attribute flags.
+ - A few minor updates and bug fixes.
+2.1.25:
+ - Write support is now extended with write(2) being able to both
+ overwrite existing file data and to extend files. Also, if a write
+ to a sparse region occurs, write(2) will fill in the hole. Note,
+ mmap(2) based writes still do not support writing into holes or
+ writing beyond the initialized size.
+ - Write support has a new feature and that is that truncate(2) and
+ open(2) with O_TRUNC are now implemented thus files can be both made
+ smaller and larger.
+ - Note: Both write(2) and truncate(2)/open(2) with O_TRUNC still have
+ limitations in that they
+ - only provide limited support for highly fragmented files.
+ - only work on regular, i.e. uncompressed and unencrypted files.
+ - never create sparse files although this will change once directory
+ operations are implemented.
+ - Lots of bug fixes and enhancements across the board.
+2.1.24:
+ - Support journals ($LogFile) which have been modified by chkdsk. This
+ means users can boot into Windows after we marked the volume dirty.
+ The Windows boot will run chkdsk and then reboot. The user can then
+ immediately boot into Linux rather than having to do a full Windows
+ boot first before rebooting into Linux and we will recognize such a
+ journal and empty it as it is clean by definition.
+ - Support journals ($LogFile) with only one restart page as well as
+ journals with two different restart pages. We sanity check both and
+ either use the only sane one or the more recent one of the two in the
+ case that both are valid.
+ - Lots of bug fixes and enhancements across the board.
+2.1.23:
+ - Stamp the user space journal, aka transaction log, aka $UsnJrnl, if
+ it is present and active thus telling Windows and applications using
+ the transaction log that changes can have happened on the volume
+ which are not recorded in $UsnJrnl.
+ - Detect the case when Windows has been hibernated (suspended to disk)
+ and if this is the case do not allow (re)mounting read-write to
+ prevent data corruption when you boot back into the suspended
+ Windows session.
+ - Implement extension of resident files using the normal file write
+ code paths, i.e. most very small files can be extended to be a little
+ bit bigger but not by much.
+ - Add new mount option "disable_sparse". (See list of mount options
+ above for details.)
+ - Improve handling of ntfs volumes with errors and strange boot sectors
+ in particular.
+ - Fix various bugs including a nasty deadlock that appeared in recent
+ kernels (around 2.6.11-2.6.12 timeframe).
+2.1.22:
+ - Improve handling of ntfs volumes with errors.
+ - Fix various bugs and race conditions.
+2.1.21:
+ - Fix several race conditions and various other bugs.
+ - Many internal cleanups, code reorganization, optimizations, and mft
+ and index record writing code rewritten to fit in with the changes.
+ - Update Documentation/filesystems/ntfs.txt with instructions on how to
+ use the Device-Mapper driver with NTFS ftdisk/LDM raid.
+2.1.20:
+ - Fix two stupid bugs introduced in 2.1.18 release.
+2.1.19:
+ - Minor bugfix in handling of the default upcase table.
+ - Many internal cleanups and improvements. Many thanks to Linus
+ Torvalds and Al Viro for the help and advice with the sparse
+ annotations and cleanups.
+2.1.18:
+ - Fix scheduling latencies at mount time. (Ingo Molnar)
+ - Fix endianness bug in a little traversed portion of the attribute
+ lookup code.
+2.1.17:
+ - Fix bugs in mount time error code paths.
+2.1.16:
+ - Implement access time updates (including mtime and ctime).
+ - Implement fsync(2), fdatasync(2), and msync(2) system calls.
+ - Enable the readv(2) and writev(2) system calls.
+ - Enable access via the asynchronous io (aio) API by adding support for
+ the aio_read(3) and aio_write(3) functions.
+2.1.15:
+ - Invalidate quotas when (re)mounting read-write.
+ NOTE: This now only leaves user space journalling on the side. (See
+ note for version 2.1.13, below.)
+2.1.14:
+ - Fix an NFSd caused deadlock reported by several users.
+2.1.13:
+ - Implement writing of inodes (access time updates are not implemented
+ yet so mounting with -o noatime,nodiratime is enforced).
+ - Enable writing out of resident files so you can now overwrite any
+ uncompressed, unencrypted, nonsparse file as long as you do not
+ change the file size.
+ - Add housekeeping of ntfs system files so that ntfsfix no longer needs
+ to be run after writing to an NTFS volume.
+ NOTE: This still leaves quota tracking and user space journalling on
+ the side but they should not cause data corruption. In the worst
+ case the charged quotas will be out of date ($Quota) and some
+ userspace applications might get confused due to the out of date
+ userspace journal ($UsnJrnl).
+2.1.12:
+ - Fix the second fix to the decompression engine from the 2.1.9 release
+ and some further internals cleanups.
+2.1.11:
+ - Driver internal cleanups.
+2.1.10:
+ - Force read-only (re)mounting of volumes with unsupported volume
+ flags and various cleanups.
+2.1.9:
+ - Fix two bugs in handling of corner cases in the decompression engine.
+2.1.8:
+ - Read the $MFT mirror and compare it to the $MFT and if the two do not
+ match, force a read-only mount and do not allow read-write remounts.
+ - Read and parse the $LogFile journal and if it indicates that the
+ volume was not shutdown cleanly, force a read-only mount and do not
+ allow read-write remounts. If the $LogFile indicates a clean
+ shutdown and a read-write (re)mount is requested, empty $LogFile to
+ ensure that Windows cannot cause data corruption by replaying a stale
+ journal after Linux has written to the volume.
+ - Improve time handling so that the NTFS time is fully preserved when
+ converted to kernel time and only up to 99 nano-seconds are lost when
+ kernel time is converted to NTFS time.
+2.1.7:
+ - Enable NFS exporting of mounted NTFS volumes.
+2.1.6:
+ - Fix minor bug in handling of compressed directories that fixes the
+ erroneous "du" and "stat" output people reported.
+2.1.5:
+ - Minor bug fix in attribute list attribute handling that fixes the
+ I/O errors on "ls" of certain fragmented files found by at least two
+ people running Windows XP.
+2.1.4:
+ - Minor update allowing compilation with all gcc versions (well, the
+ ones the kernel can be compiled with anyway).
+2.1.3:
+ - Major bug fixes for reading files and volumes in corner cases which
+ were being hit by Windows 2k/XP users.
+2.1.2:
+ - Major bug fixes alleviating the hangs in statfs experienced by some
+ users.
+2.1.1:
+ - Update handling of compressed files so people no longer get the
+ frequently reported warning messages about initialized_size !=
+ data_size.
+2.1.0:
+ - Add configuration option for developmental write support.
+ - Initial implementation of file overwriting. (Writes to resident files
+ are not written out to disk yet, so avoid writing to files smaller
+ than about 1kiB.)
+ - Intercept/abort changes in file size as they are not implemented yet.
+2.0.25:
+ - Minor bugfixes in error code paths and small cleanups.
+2.0.24:
+ - Small internal cleanups.
+ - Support for sendfile system call. (Christoph Hellwig)
+2.0.23:
+ - Massive internal locking changes to mft record locking. Fixes
+ various race conditions and deadlocks.
+ - Fix ntfs over loopback for compressed files by adding an
+ optimization barrier. (gcc was screwing up otherwise ?)
+ Thanks go to Christoph Hellwig for pointing these two out:
+ - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
+ - Fix ntfs_free() for ia64 and parisc.
+2.0.22:
+ - Small internal cleanups.
+2.0.21:
+ These only affect 32-bit architectures:
+ - Check for, and refuse to mount too large volumes (maximum is 2TiB).
+ - Check for, and refuse to open too large files and directories
+ (maximum is 16TiB).
+2.0.20:
+ - Support non-resident directory index bitmaps. This means we now cope
+ with huge directories without problems.
+ - Fix a page leak that manifested itself in some cases when reading
+ directory contents.
+ - Internal cleanups.
+2.0.19:
+ - Fix race condition and improvements in block i/o interface.
+ - Optimization when reading compressed files.
+2.0.18:
+ - Fix race condition in reading of compressed files.
+2.0.17:
+ - Cleanups and optimizations.
+2.0.16:
+ - Fix stupid bug introduced in 2.0.15 in new attribute inode API.
+ - Big internal cleanup replacing the mftbmp access hacks by using the
+ new attribute inode API instead.
+2.0.15:
+ - Bug fix in parsing of remount options.
+ - Internal changes implementing attribute (fake) inodes allowing all
+ attribute i/o to go via the page cache and to use all the normal
+ vfs/mm functionality.
+2.0.14:
+ - Internal changes improving run list merging code and minor locking
+ change to not rely on BKL in ntfs_statfs().
+2.0.13:
+ - Internal changes towards using iget5_locked() in preparation for
+ fake inodes and small cleanups to ntfs_volume structure.
+2.0.12:
+ - Internal cleanups in address space operations made possible by the
+ changes introduced in the previous release.
+2.0.11:
+ - Internal updates and cleanups introducing the first step towards
+ fake inode based attribute i/o.
+2.0.10:
+ - Microsoft says that the maximum number of inodes is 2^32 - 1. Update
+ the driver accordingly to only use 32-bits to store inode numbers on
+ 32-bit architectures. This improves the speed of the driver a little.
+2.0.9:
+ - Change decompression engine to use a single buffer. This should not
+ affect performance except perhaps on the most heavy i/o on SMP
+ systems when accessing multiple compressed files from multiple
+ devices simultaneously.
+ - Minor updates and cleanups.
+2.0.8:
+ - Remove now obsolete show_inodes and posix mount option(s).
+ - Restore show_sys_files mount option.
+ - Add new mount option case_sensitive, to determine if the driver
+ treats file names as case sensitive or not.
+ - Mostly drop support for short file names (for backwards compatibility
+ we only support accessing files via their short file name if one
+ exists).
+ - Fix dcache aliasing issues wrt short/long file names.
+ - Cleanups and minor fixes.
+2.0.7:
+ - Just cleanups.
+2.0.6:
+ - Major bugfix to make compatible with other kernel changes. This fixes
+ the hangs/oopses on umount.
+ - Locking cleanup in directory operations (remove BKL usage).
+2.0.5:
+ - Major buffer overflow bug fix.
+ - Minor cleanups and updates for kernel 2.5.12.
+2.0.4:
+ - Cleanups and updates for kernel 2.5.11.
+2.0.3:
+ - Small bug fixes, cleanups, and performance improvements.
+2.0.2:
+ - Use default fmask of 0177 so that files are not executable by default.
+ If you want owner executable files, just use fmask=0077.
+ - Update for kernel 2.5.9 but preserve backwards compatibility with
+ kernel 2.5.7.
+ - Minor bug fixes, cleanups, and updates.
+2.0.1:
+ - Minor updates, primarily set the executable bit by default on files
+ so they can be executed.
+2.0.0:
+ - Started ChangeLog.
+
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 00000000000..7618a287aa4
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,102 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page: http://oss.oracle.com/projects/ocfs2
+Tools web page: http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker <joel.becker@oracle.com>
+Zach Brown <zach.brown@oracle.com>
+Mark Fasheh <mfasheh@suse.com>
+Kurt Hackel <kurt.hackel@oracle.com>
+Tao Ma <tao.ma@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh <manish.singh@oracle.com>
+Tiger Yang <tiger.yang@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+ - Directory change notification (F_NOTIFY)
+ - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1 This enables/disables barriers. barrier=0 disables it,
+ barrier=1 enables it.
+errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+intr (*) Allow signals to interrupt cluster operations.
+nointr Do not allow signals to interrupt cluster
+ operations.
+noatime Do not update access time.
+relatime(*) Update atime if the previous atime is older than
+ mtime or ctime
+strictatime Always update atime, but the minimum update interval
+ is specified by atime_quantum.
+atime_quantum=60(*) OCFS2 will not update atime unless this number
+ of seconds has passed since the last update.
+ Set to zero to always update atime. This option needs
+ to be used together with strictatime.
+data=ordered (*) All data are forced directly out to the main file
+ system prior to its metadata being committed to the
+ journal.
+data=writeback Data ordering is not preserved, data may be written
+ into the main file system after its metadata has been
+ committed to the journal.
+preferred_slot=0(*) During mount, try to use this filesystem slot first. If
+ it is in use by another node, the first empty one found
+ will be chosen. Invalid values will be ignored.
+commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata
+ every 'nrsec' seconds. The default value is 5 seconds.
+ This means that if you lose your power, you will lose
+ as much as the latest 5 seconds of work (your
+ filesystem will not be damaged though, thanks to the
+ journaling). This default value (or any low value)
+ will hurt performance, but it's good for data-safety.
+ Setting it to 0 will have the same effect as leaving
+ it at the default (5 seconds).
+ Setting it to very large values will improve
+ performance.
+localalloc=8(*) Allows custom localalloc size in MB. If the value is too
+ large, the fs will silently revert it to the default.
+localflocks This disables cluster aware flock.
+inode64 Indicates that Ocfs2 is allowed to create inodes at
+ any location in the filesystem, including those which
+ will result in inode numbers occupying more than 32
+ bits of significance.
+user_xattr (*) Enables Extended User Attributes.
+nouser_xattr Disables Extended User Attributes.
+acl Enables POSIX Access Control Lists support.
+noacl (*) Disables POSIX Access Control Lists support.
+resv_level=2 (*) Set how aggressive allocation reservations will be.
+ Valid values range from 0 (reservations off) to 8
+ (maximum space for reservations).
+dir_resv_level= (*) By default, directory reservations will scale with file
+ reservations - users should rarely need to change this
+ value. If allocation reservations are turned off, this
+ option will have no effect.
+coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode
+ lock will be taken to force other nodes to drop cache,
+ therefore full cluster coherency is guaranteed even
+ for O_DIRECT writes.
+coherency=buffered Allow concurrent O_DIRECT writes without EX lock among
+ nodes, which gains high performance at risk of getting
+ stale data on other nodes.
diff --git a/Documentation/filesystems/omfs.txt b/Documentation/filesystems/omfs.txt
new file mode 100644
index 00000000000..1d0d41ff5c6
--- /dev/null
+++ b/Documentation/filesystems/omfs.txt
@@ -0,0 +1,106 @@
+Optimized MPEG Filesystem (OMFS)
+
+Overview
+========
+
+OMFS is a filesystem created by SonicBlue for use in the ReplayTV DVR
+and Rio Karma MP3 player. The filesystem is extent-based, utilizing
+block sizes from 2k to 8k, with hash-based directories. This
+filesystem driver may be used to read and write disks from these
+devices.
+
+Note, it is not recommended that this FS be used in place of a general
+filesystem for your own streaming media device. Native Linux filesystems
+will likely perform better.
+
+More information is available at:
+
+ http://linux-karma.sf.net/
+
+Various utilities, including mkomfs and omfsck, are included with
+omfsprogs, available at:
+
+ http://bobcopeland.com/karma/
+
+Instructions are included in its README.
+
+Options
+=======
+
+OMFS supports the following mount-time options:
+
+ uid=n - make all files owned by specified user
+ gid=n - make all files owned by specified group
+ umask=xxx - set permission umask to xxx
+ fmask=xxx - set umask to xxx for files
+ dmask=xxx - set umask to xxx for directories
+
+Disk format
+===========
+
+OMFS discriminates between "sysblocks" and normal data blocks. The sysblock
+group consists of super block information, file metadata, directory structures,
+and extents. Each sysblock has a header containing CRCs of the entire
+sysblock, and may be mirrored in successive blocks on the disk. A sysblock may
+have a smaller size than a data block, but since they are both addressed by the
+same 64-bit block number, any remaining space in the smaller sysblock is
+unused.
+
+Sysblock header information:
+
+struct omfs_header {
+ __be64 h_self; /* FS block where this is located */
+ __be32 h_body_size; /* size of useful data after header */
+ __be16 h_crc; /* crc-ccitt of body_size bytes */
+ char h_fill1[2];
+ u8 h_version; /* version, always 1 */
+ char h_type; /* OMFS_INODE_X */
+ u8 h_magic; /* OMFS_IMAGIC */
+ u8 h_check_xor; /* XOR of header bytes before this */
+ __be32 h_fill2;
+};
+
+Files and directories are both represented by omfs_inode:
+
+struct omfs_inode {
+ struct omfs_header i_head; /* header */
+ __be64 i_parent; /* parent containing this inode */
+ __be64 i_sibling; /* next inode in hash bucket */
+ __be64 i_ct