summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/9p/vfs_inode_dotl.c24
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/affs/affs.h8
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/afs/inode.c2
-rw-r--r--fs/aio.c10
-rw-r--r--fs/attr.c8
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/bad_inode.c1
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c20
-rw-r--r--fs/binfmt_elf_fdpic.c12
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/bio.c61
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/backref.c581
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h51
-rw-r--r--fs/btrfs/check-integrity.c600
-rw-r--r--fs/btrfs/ctree.c909
-rw-r--r--fs/btrfs/ctree.h89
-rw-r--r--fs/btrfs/delayed-inode.c26
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/delayed-ref.c10
-rw-r--r--fs/btrfs/delayed-ref.h24
-rw-r--r--fs/btrfs/disk-io.c168
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/export.c15
-rw-r--r--fs/btrfs/extent-tree.c34
-rw-r--r--fs/btrfs/extent_io.c189
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/file.c91
-rw-r--r--fs/btrfs/free-space-cache.c197
-rw-r--r--fs/btrfs/inode.c442
-rw-r--r--fs/btrfs/ioctl.c168
-rw-r--r--fs/btrfs/ioctl.h35
-rw-r--r--fs/btrfs/ordered-data.c175
-rw-r--r--fs/btrfs/ordered-data.h13
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/rcu-string.h56
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/scrub.c95
-rw-r--r--fs/btrfs/super.c154
-rw-r--r--fs/btrfs/transaction.c73
-rw-r--r--fs/btrfs/tree-log.c41
-rw-r--r--fs/btrfs/ulist.c38
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/btrfs/volumes.c449
-rw-r--r--fs/btrfs/volumes.h57
-rw-r--r--fs/btrfs/xattr.c1
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/ceph/addr.c21
-rw-r--r--fs/ceph/export.c32
-rw-r--r--fs/ceph/file.c1
-rw-r--r--fs/ceph/ioctl.c102
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/mds_client.c54
-rw-r--r--fs/ceph/mds_client.h5
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/cifs/Kconfig20
-rw-r--r--fs/cifs/Makefile4
-rw-r--r--fs/cifs/README5
-rw-r--r--fs/cifs/cifs_debug.c56
-rw-r--r--fs/cifs/cifs_debug.h4
-rw-r--r--fs/cifs/cifsfs.c25
-rw-r--r--fs/cifs/cifsglob.h113
-rw-r--r--fs/cifs/cifsproto.h14
-rw-r--r--fs/cifs/cifssmb.c207
-rw-r--r--fs/cifs/connect.c209
-rw-r--r--fs/cifs/file.c726
-rw-r--r--fs/cifs/ioctl.c8
-rw-r--r--fs/cifs/misc.c155
-rw-r--r--fs/cifs/readdir.c7
-rw-r--r--fs/cifs/smb1ops.c243
-rw-r--r--fs/cifs/smb2ops.c27
-rw-r--r--fs/cifs/transport.c104
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/compat.c47
-rw-r--r--fs/dcache.c192
-rw-r--r--fs/debugfs/file.c128
-rw-r--r--fs/devpts/inode.c24
-rw-r--r--fs/direct-io.c44
-rw-r--r--fs/dlm/ast.c3
-rw-r--r--fs/dlm/dlm_internal.h16
-rw-r--r--fs/dlm/lock.c541
-rw-r--r--fs/dlm/lock.h7
-rw-r--r--fs/dlm/lockspace.c20
-rw-r--r--fs/dlm/lowcomms.c28
-rw-r--r--fs/dlm/memory.c8
-rw-r--r--fs/dlm/rcom.c61
-rw-r--r--fs/dlm/recover.c73
-rw-r--r--fs/dlm/recoverd.c15
-rw-r--r--fs/dlm/requestqueue.c43
-rw-r--r--fs/ecryptfs/inode.c48
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/messaging.c2
-rw-r--r--fs/ecryptfs/miscdev.c48
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c94
-rw-r--r--fs/exec.c44
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/exofs.h14
-rw-r--r--fs/exofs/inode.c4
-rw-r--r--fs/exofs/ore.c8
-rw-r--r--fs/exofs/ore_raid.c91
-rw-r--r--fs/exofs/super.c14
-rw-r--r--fs/exofs/sys.c200
-rw-r--r--fs/exportfs/expfs.c33
-rw-r--r--fs/ext2/balloc.c9
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c22
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c49
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext3/dir.c167
-rw-r--r--fs/ext3/ext3.h14
-rw-r--r--fs/ext3/hash.c4
-rw-r--r--fs/ext3/ialloc.c20
-rw-r--r--fs/ext3/inode.c38
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/super.c41
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/balloc.c45
-rw-r--r--fs/ext4/bitmap.c83
-rw-r--r--fs/ext4/dir.c12
-rw-r--r--fs/ext4/ext4.h134
-rw-r--r--fs/ext4/ext4_extents.h24
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c91
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/ialloc.c75
-rw-r--r--fs/ext4/inode.c153
-rw-r--r--fs/ext4/ioctl.c8
-rw-r--r--fs/ext4/mballoc.c24
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/mmp.c44
-rw-r--r--fs/ext4/namei.c444
-rw-r--r--fs/ext4/resize.c69
-rw-r--r--fs/ext4/super.c295
-rw-r--r--fs/ext4/xattr.c92
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/fatent.c21
-rw-r--r--fs/fat/inode.c76
-rw-r--r--fs/fcntl.c48
-rw-r--r--fs/fifo.c9
-rw-r--r--fs/file_table.c17
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c337
-rw-r--r--fs/fuse/control.c10
-rw-r--r--fs/fuse/dir.c10
-rw-r--r--fs/fuse/file.c44
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c19
-rw-r--r--fs/gfs2/acl.c12
-rw-r--r--fs/gfs2/aops.c18
-rw-r--r--fs/gfs2/bmap.c10
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/export.c17
-rw-r--r--fs/gfs2/file.c12
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/incore.h27
-rw-r--r--fs/gfs2/inode.h3
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/log.c103
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c520
-rw-r--r--fs/gfs2/lops.h14
-rw-r--r--fs/gfs2/main.c26
-rw-r--r--fs/gfs2/meta_io.c28
-rw-r--r--fs/gfs2/meta_io.h4
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c6
-rw-r--r--fs/gfs2/rgrp.c102
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/sys.c10
-rw-r--r--fs/gfs2/trace_gfs2.h16
-rw-r--r--fs/gfs2/trans.c44
-rw-r--r--fs/gfs2/util.c3
-rw-r--r--fs/gfs2/util.h3
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/alloc.c14
-rw-r--r--fs/hpfs/anode.c43
-rw-r--r--fs/hpfs/buffer.c1
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/dnode.c10
-rw-r--r--fs/hpfs/ea.c60
-rw-r--r--fs/hpfs/hpfs.h289
-rw-r--r--fs/hpfs/hpfs_fn.h23
-rw-r--r--fs/hpfs/inode.c4
-rw-r--r--fs/hpfs/map.c20
-rw-r--r--fs/hpfs/namei.c2
-rw-r--r--fs/hpfs/super.c4
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c198
-rw-r--r--fs/internal.h3
-rw-r--r--fs/ioprio.c20
-rw-r--r--fs/isofs/export.c13
-rw-r--r--fs/jbd/checkpoint.c23
-rw-r--r--fs/jbd/commit.c21
-rw-r--r--fs/jbd/journal.c206
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/jbd2/Kconfig2
-rw-r--r--fs/jbd2/commit.c70
-rw-r--r--fs/jbd2/journal.c132
-rw-r--r--fs/jbd2/recovery.c126
-rw-r--r--fs/jbd2/revoke.c27
-rw-r--r--fs/jbd2/transaction.c4
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/jffs2_fs_sb.h11
-rw-r--r--fs/jffs2/nodemgmt.c42
-rw-r--r--fs/jffs2/os-linux.h7
-rw-r--r--fs/jffs2/readinode.c19
-rw-r--r--fs/jffs2/super.c38
-rw-r--r--fs/jffs2/wbuf.c55
-rw-r--r--fs/jffs2/xattr.c23
-rw-r--r--fs/jffs2/xattr.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/lockd/clntlock.c13
-rw-r--r--fs/lockd/svc.c143
-rw-r--r--fs/locks.c15
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/namei.c306
-rw-r--r--fs/namespace.c139
-rw-r--r--fs/ncpfs/file.c6
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/ncpfs/ncp_fs_sb.h10
-rw-r--r--fs/nfs/Kconfig11
-rw-r--r--fs/nfs/Makefile5
-rw-r--r--fs/nfs/blocklayout/blocklayout.c90
-rw-r--r--fs/nfs/blocklayout/blocklayoutdev.c2
-rw-r--r--fs/nfs/callback.c2
-rw-r--r--fs/nfs/callback_xdr.c8
-rw-r--r--fs/nfs/client.c267
-rw-r--r--fs/nfs/delegation.c16
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c100
-rw-r--r--fs/nfs/direct.c756
-rw-r--r--fs/nfs/file.c85
-rw-r--r--fs/nfs/fscache.c15
-rw-r--r--fs/nfs/fscache.h10
-rw-r--r--fs/nfs/getroot.c85
-rw-r--r--fs/nfs/idmap.c33
-rw-r--r--fs/nfs/inode.c127
-rw-r--r--fs/nfs/internal.h139
-rw-r--r--fs/nfs/namespace.c103
-rw-r--r--fs/nfs/netns.h5
-rw-r--r--fs/nfs/nfs2xdr.c5
-rw-r--r--fs/nfs/nfs3proc.c30
-rw-r--r--fs/nfs/nfs3xdr.c112
-rw-r--r--fs/nfs/nfs4_fs.h25
-rw-r--r--fs/nfs/nfs4filelayout.c688
-rw-r--r--fs/nfs/nfs4filelayout.h63
-rw-r--r--fs/nfs/nfs4filelayoutdev.c102
-rw-r--r--fs/nfs/nfs4namespace.c55
-rw-r--r--fs/nfs/nfs4proc.c577
-rw-r--r--fs/nfs/nfs4renewd.c2
-rw-r--r--fs/nfs/nfs4state.c245
-rw-r--r--fs/nfs/nfs4xdr.c404
-rw-r--r--fs/nfs/objlayout/objio_osd.c41
-rw-r--r--fs/nfs/objlayout/objlayout.c19
-rw-r--r--fs/nfs/pagelist.c61
-rw-r--r--fs/nfs/pnfs.c365
-rw-r--r--fs/nfs/pnfs.h127
-rw-r--r--fs/nfs/proc.c26
-rw-r--r--fs/nfs/read.c437
-rw-r--r--fs/nfs/super.c765
-rw-r--r--fs/nfs/write.c812
-rw-r--r--fs/nfsd/auth.c7
-rw-r--r--fs/nfsd/export.c181
-rw-r--r--fs/nfsd/fault_inject.c1
-rw-r--r--fs/nfsd/idmap.h8
-rw-r--r--fs/nfsd/netns.h6
-rw-r--r--fs/nfsd/nfs4callback.c5
-rw-r--r--fs/nfsd/nfs4idmap.c113
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c538
-rw-r--r--fs/nfsd/nfs4xdr.c62
-rw-r--r--fs/nfsd/nfsctl.c55
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfssvc.c8
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/nfsd/xdr4.h6
-rw-r--r--fs/nilfs2/file.c24
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/ioctl.c8
-rw-r--r--fs/nilfs2/namei.c24
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/nls/Kconfig157
-rw-r--r--fs/nls/Makefile11
-rw-r--r--fs/nls/mac-celtic.c602
-rw-r--r--fs/nls/mac-centeuro.c532
-rw-r--r--fs/nls/mac-croatian.c602
-rw-r--r--fs/nls/mac-cyrillic.c497
-rw-r--r--fs/nls/mac-gaelic.c567
-rw-r--r--fs/nls/mac-greek.c497
-rw-r--r--fs/nls/mac-iceland.c602
-rw-r--r--fs/nls/mac-inuit.c532
-rw-r--r--fs/nls/mac-roman.c637
-rw-r--r--fs/nls/mac-romanian.c602
-rw-r--r--fs/nls/mac-turkish.c602
-rw-r--r--fs/notify/fsnotify.c12
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ocfs2/blockcheck.c42
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dlm/dlmast.c2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/dlmglue.c33
-rw-r--r--fs/ocfs2/export.c19
-rw-r--r--fs/ocfs2/extent_map.c2
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/inode.c15
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/move_extents.c6
-rw-r--r--fs/ocfs2/namei.c5
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/symlink.c115
-rw-r--r--fs/ocfs2/symlink.h2
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/open.c100
-rw-r--r--fs/pipe.c9
-rw-r--r--fs/pnode.c4
-rw-r--r--fs/proc/array.c162
-rw-r--r--fs/proc/base.c179
-rw-r--r--fs/proc/inode.c6
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c82
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc_namespace.c4
-rw-r--r--fs/pstore/Kconfig17
-rw-r--r--fs/pstore/Makefile3
-rw-r--r--fs/pstore/inode.c4
-rw-r--r--fs/pstore/platform.c34
-rw-r--r--fs/pstore/ram.c386
-rw-r--r--fs/pstore/ram_core.c539
-rw-r--r--fs/quota/dquot.c32
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/read_write.c7
-rw-r--r--fs/readdir.c33
-rw-r--r--fs/reiserfs/inode.c34
-rw-r--r--fs/reiserfs/journal.c15
-rw-r--r--fs/reiserfs/reiserfs.h12
-rw-r--r--fs/reiserfs/resize.c1
-rw-r--r--fs/reiserfs/super.c80
-rw-r--r--fs/select.c4
-rw-r--r--fs/signalfd.c7
-rw-r--r--fs/splice.c45
-rw-r--r--fs/stat.c61
-rw-r--r--fs/statfs.c5
-rw-r--r--fs/sync.c5
-rw-r--r--fs/sysfs/dir.c37
-rw-r--r--fs/sysfs/inode.c6
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/ubifs/Kconfig23
-rw-r--r--fs/ubifs/Makefile5
-rw-r--r--fs/ubifs/commit.c14
-rw-r--r--fs/ubifs/debug.c170
-rw-r--r--fs/ubifs/debug.h217
-rw-r--r--fs/ubifs/dir.c21
-rw-r--r--fs/ubifs/file.c4
-rw-r--r--fs/ubifs/find.c4
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/io.c74
-rw-r--r--fs/ubifs/journal.c10
-rw-r--r--fs/ubifs/log.c18
-rw-r--r--fs/ubifs/lprops.c18
-rw-r--r--fs/ubifs/lpt.c78
-rw-r--r--fs/ubifs/lpt_commit.c86
-rw-r--r--fs/ubifs/master.c8
-rw-r--r--fs/ubifs/orphan.c25
-rw-r--r--fs/ubifs/recovery.c43
-rw-r--r--fs/ubifs/replay.c27
-rw-r--r--fs/ubifs/sb.c34
-rw-r--r--fs/ubifs/scan.c14
-rw-r--r--fs/ubifs/super.c33
-rw-r--r--fs/ubifs/tnc.c28
-rw-r--r--fs/ubifs/tnc_commit.c28
-rw-r--r--fs/ubifs/tnc_misc.c36
-rw-r--r--fs/ubifs/ubifs.h26
-rw-r--r--fs/ubifs/xattr.c10
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/udf/namei.c16
-rw-r--r--fs/udf/super.c102
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/ufs/super.c5
-rw-r--r--fs/utimes.c5
-rw-r--r--fs/xattr.c20
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/kmem.h21
-rw-r--r--fs/xfs/xfs_ag.h18
-rw-r--r--fs/xfs/xfs_alloc.c605
-rw-r--r--fs/xfs/xfs_alloc.h28
-rw-r--r--fs/xfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/xfs_aops.c229
-rw-r--r--fs/xfs/xfs_attr.c25
-rw-r--r--fs/xfs/xfs_attr_leaf.c3
-rw-r--r--fs/xfs/xfs_bmap.c32
-rw-r--r--fs/xfs/xfs_bmap.h3
-rw-r--r--fs/xfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/xfs_btree.c1
-rw-r--r--fs/xfs/xfs_buf.c638
-rw-r--r--fs/xfs/xfs_buf.h97
-rw-r--r--fs/xfs/xfs_buf_item.c123
-rw-r--r--fs/xfs/xfs_da_btree.c17
-rw-r--r--fs/xfs/xfs_dfrag.c2
-rw-r--r--fs/xfs/xfs_dir2.c1
-rw-r--r--fs/xfs/xfs_dir2_block.c1
-rw-r--r--fs/xfs/xfs_dir2_data.c1
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1
-rw-r--r--fs/xfs/xfs_dir2_node.c1
-rw-r--r--fs/xfs/xfs_dir2_sf.c1
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_dquot.c91
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_dquot_item.c162
-rw-r--r--fs/xfs/xfs_error.c1
-rw-r--r--fs/xfs/xfs_export.c24
-rw-r--r--fs/xfs/xfs_extent_busy.c603
-rw-r--r--fs/xfs/xfs_extent_busy.h69
-rw-r--r--fs/xfs/xfs_extfree_item.c59
-rw-r--r--fs/xfs/xfs_file.c334
-rw-r--r--fs/xfs/xfs_fsops.c82
-rw-r--r--fs/xfs/xfs_ialloc.c10
-rw-r--r--fs/xfs/xfs_ialloc.h9
-rw-r--r--fs/xfs/xfs_ialloc_btree.c1
-rw-r--r--fs/xfs/xfs_iget.c24
-rw-r--r--fs/xfs/xfs_inode.c132
-rw-r--r--fs/xfs/xfs_inode.h5
-rw-r--r--fs/xfs/xfs_inode_item.c183
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h5
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c59
-rw-r--r--fs/xfs/xfs_iops.c15
-rw-r--r--fs/xfs/xfs_itable.c1
-rw-r--r--fs/xfs/xfs_log.c126
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_cil.c265
-rw-r--r--fs/xfs/xfs_log_priv.h48
-rw-r--r--fs/xfs/xfs_log_recover.c139
-rw-r--r--fs/xfs/xfs_message.c1
-rw-r--r--fs/xfs/xfs_mount.c77
-rw-r--r--fs/xfs/xfs_mount.h6
-rw-r--r--fs/xfs/xfs_qm.c196
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c1
-rw-r--r--fs/xfs/xfs_quotaops.c1
-rw-r--r--fs/xfs/xfs_rename.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c10
-rw-r--r--fs/xfs/xfs_rw.c156
-rw-r--r--fs/xfs/xfs_rw.h47
-rw-r--r--fs/xfs/xfs_super.c51
-rw-r--r--fs/xfs/xfs_sync.c265
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h71
-rw-r--r--fs/xfs/xfs_trans.c9
-rw-r--r--fs/xfs/xfs_trans.h20
-rw-r--r--fs/xfs/xfs_trans_ail.c207
-rw-r--r--fs/xfs/xfs_trans_buf.c126
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_trans_priv.h12
-rw-r--r--fs/xfs/xfs_types.h5
-rw-r--r--fs/xfs/xfs_utils.c2
-rw-r--r--fs/xfs/xfs_vnodeops.c31
489 files changed, 26018 insertions, 12191 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 014c8dd6296..57ccb7537da 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -448,7 +448,7 @@ void v9fs_evict_inode(struct inode *inode)
struct v9fs_inode *v9inode = V9FS_I(inode);
truncate_inode_pages(inode->i_mapping, 0);
- end_writeback(inode);
+ clear_inode(inode);
filemap_fdatawrite(inode->i_mapping);
#ifdef CONFIG_9P_FSCACHE
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a1e6c990cd4..e3dd2a1e2bf 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
- struct dentry *dentry;
-
- spin_lock(&inode->i_lock);
- /* Directory should have only one entry. */
- BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
- dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
- spin_unlock(&inode->i_lock);
- return dentry;
-}
-
static int v9fs_test_inode_dotl(struct inode *inode, void *data)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (dir->i_mode & S_ISGID)
omode |= S_ISGID;
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
@@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid))
return PTR_ERR(dfid);
@@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
return -EINVAL;
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = v9fs_dentry_from_dir_inode(dir);
+ dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index e95d1b64082..02257420274 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -33,7 +33,7 @@ config ARCH_BINFMT_ELF_RANDOMIZE_PIE
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
- depends on (FRV || BLACKFIN || (SUPERH32 && !MMU))
+ depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
help
ELF FDPIC binaries are based on ELF, but allow the individual load
segments of a binary to be located in memory independently of each
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 45a0ce45d7b..1fceb320d2f 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -18,14 +18,6 @@
#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
-#ifdef __LITTLE_ENDIAN
-#define BO_EXBITS 0x18UL
-#elif defined(__BIG_ENDIAN)
-#define BO_EXBITS 0x00UL
-#else
-#error Endianness must be known for affs to work.
-#endif
-
#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data)
#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data)
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 88a4b0b5005..8bc4a59f4e7 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -264,7 +264,7 @@ affs_evict_inode(struct inode *inode)
}
invalidate_inode_buffers(inode);
- end_writeback(inode);
+ clear_inode(inode);
affs_free_prealloc(inode);
cache_page = (unsigned long)AFFS_I(inode)->i_lc;
if (cache_page) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d890ae3b2ce..95cffd38239 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -423,7 +423,7 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ clear_inode(inode);
afs_give_up_callback(vnode);
diff --git a/fs/aio.c b/fs/aio.c
index e7f2fad7b4c..55c4c765605 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx)
info->mmap_size = nr_pages * PAGE_SIZE;
dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
- 0);
+ info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0);
if (IS_ERR((void *)info->mmap_base)) {
up_write(&ctx->mm->mmap_sem);
info->mmap_size = 0;
@@ -1446,13 +1446,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
ret = compat_rw_copy_check_uvector(type,
(struct compat_iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec, 1);
+ &kiocb->ki_iovec);
else
#endif
ret = rw_copy_check_uvector(type,
(struct iovec __user *)kiocb->ki_buf,
kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
- &kiocb->ki_iovec, 1);
+ &kiocb->ki_iovec);
if (ret < 0)
goto out;
diff --git a/fs/attr.c b/fs/attr.c
index d94d1b63c63..0da90951d27 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -47,14 +47,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* Make sure a caller can chown. */
if ((ia_valid & ATTR_UID) &&
- (current_fsuid() != inode->i_uid ||
- attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
+ (!uid_eq(current_fsuid(), inode->i_uid) ||
+ !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) &&
- (current_fsuid() != inode->i_uid ||
- (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
+ (!uid_eq(current_fsuid(), inode->i_uid) ||
+ (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
!capable(CAP_CHOWN))
return -EPERM;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 6e488ebe778..8a4fed8ead3 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -100,7 +100,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
static void autofs4_evict_inode(struct inode *inode)
{
- end_writeback(inode);
+ clear_inode(inode);
kfree(inode->i_private);
}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 37268c5bb98..1b35d6bd06b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -292,7 +292,6 @@ static const struct inode_operations bad_inode_ops =
.getxattr = bad_inode_getxattr,
.listxattr = bad_inode_listxattr,
.removexattr = bad_inode_removexattr,
- /* truncate_range returns void */
};
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e23dc7c8b88..9870417c26e 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -174,7 +174,7 @@ static void bfs_evict_inode(struct inode *inode)
truncate_inode_pages(&inode->i_data, 0);
invalidate_inode_buffers(inode);
- end_writeback(inode);
+ clear_inode(inode);
if (inode->i_nlink)
return;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 16f73541707..1b52956afe3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -226,10 +226,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_BASE, interp_load_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
- NEW_AUX_ENT(AT_UID, cred->uid);
- NEW_AUX_ENT(AT_EUID, cred->euid);
- NEW_AUX_ENT(AT_GID, cred->gid);
- NEW_AUX_ENT(AT_EGID, cred->egid);
+ NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
+ NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
+ NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
+ NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
@@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
if (!size)
return addr;
- down_write(&current->mm->mmap_sem);
/*
* total_size is the size of the ELF (interpreter) image.
* The _first_ mmap needs to know the full size, otherwise
@@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
*/
if (total_size) {
total_size = ELF_PAGEALIGN(total_size);
- map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+ map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
if (!BAD_ADDR(map_addr))
- do_munmap(current->mm, map_addr+size, total_size-size);
+ vm_munmap(map_addr+size, total_size-size);
} else
- map_addr = do_mmap(filep, addr, size, prot, type, off);
+ map_addr = vm_mmap(filep, addr, size, prot, type, off);
- up_write(&current->mm->mmap_sem);
return(map_addr);
}
@@ -1356,8 +1354,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_flag = p->flags;
rcu_read_lock();
cred = __task_cred(p);
- SET_UID(psinfo->pr_uid, cred->uid);
- SET_GID(psinfo->pr_gid, cred->gid);
+ SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d390a0fffc6..3d77cf81ba3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -627,10 +627,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr);
- NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid);
- NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid);
- NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid);
- NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid);
+ NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
+ NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
+ NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid));
+ NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid));
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
@@ -1421,8 +1421,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_flag = p->flags;
rcu_read_lock();
cred = __task_cred(p);
- SET_UID(psinfo->pr_uid, cred->uid);
- SET_GID(psinfo->pr_gid, cred->gid);
+ SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 6b2daf99fab..178cb70acc2 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm,
realdatastart = (unsigned long) -ENOMEM;
printk("Unable to allocate RAM for process data, errno %d\n",
(int)-realdatastart);
- do_munmap(current->mm, textpos, text_len);
+ vm_munmap(textpos, text_len);
ret = realdatastart;
goto err;
}
@@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm,
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read data+bss, errno %d\n", (int)-result);
- do_munmap(current->mm, textpos, text_len);
- do_munmap(current->mm, realdatastart, len);
+ vm_munmap(textpos, text_len);
+ vm_munmap(realdatastart, len);
ret = result;
goto err;
}
@@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm,
}
if (IS_ERR_VALUE(result)) {
printk("Unable to read code+data+bss, errno %d\n",(int)-result);
- do_munmap(current->mm, textpos, text_len + data_len + extra +
+ vm_munmap(textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
ret = result;
goto err;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 613aa061823..790b3cddca6 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -505,7 +505,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
static void bm_evict_inode(struct inode *inode)
{
- end_writeback(inode);
+ clear_inode(inode);
kfree(inode->i_private);
}
diff --git a/fs/bio.c b/fs/bio.c
index 84da8853904..73922abba83 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,12 +19,14 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
+#include <linux/cgroup.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#include <trace/events/block.h>
@@ -418,6 +420,7 @@ void bio_put(struct bio *bio)
* last put frees it
*/
if (atomic_dec_and_test(&bio->bi_cnt)) {
+ bio_disassociate_task(bio);
bio->bi_next = NULL;
bio->bi_destructor(bio);
}
@@ -1646,6 +1649,64 @@ bad:
}
EXPORT_SYMBOL(bioset_create);
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * bio_associate_current - associate a bio with %current
+ * @bio: target bio
+ *
+ * Associate @bio with %current if it hasn't been associated yet. Block
+ * layer will treat @bio as if it were issued by %current no matter which
+ * task actually issues it.
+ *
+ * This function takes an extra reference of @task's io_context and blkcg
+ * which will be put when @bio is released. The caller must own @bio,
+ * ensure %current->io_context exists, and is responsible for synchronizing
+ * calls to this function.
+ */
+int bio_associate_current(struct bio *bio)
+{
+ struct io_context *ioc;
+ struct cgroup_subsys_state *css;
+
+ if (bio->bi_ioc)
+ return -EBUSY;
+
+ ioc = current->io_context;
+ if (!ioc)
+ return -ENOENT;
+
+ /* acquire active ref on @ioc and associate */
+ get_io_context_active(ioc);
+ bio->bi_ioc = ioc;
+
+ /* associate blkcg if exists */
+ rcu_read_lock();
+ css = task_subsys_state(current, blkio_subsys_id);
+ if (css && css_tryget(css))
+ bio->bi_css = css;
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/**
+ * bio_disassociate_task - undo bio_associate_current()
+ * @bio: target bio
+ */
+void bio_disassociate_task(struct bio *bio)
+{
+ if (bio->bi_ioc) {
+ put_io_context(bio->bi_ioc);
+ bio->bi_ioc = NULL;
+ }
+ if (bio->bi_css) {
+ css_put(bio->bi_css);
+ bio->bi_css = NULL;
+ }
+}
+
+#endif /* CONFIG_BLK_CGROUP */
+
static void __init biovec_init_slabs(void)
{
int i;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ba11c30f302..c2bbe1fb132 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -487,7 +487,7 @@ static void bdev_evict_inode(struct inode *inode)
struct list_head *p;
truncate_inode_pages(&inode->i_data, 0);
invalidate_inode_buffers(inode); /* is it needed here? */
- end_writeback(inode);
+ clear_inode(inode);
spin_lock(&bdev_lock);
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
__bd_forget(list_entry(p, struct inode, i_devices));
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 89b156d85d6..761e2cd8fed 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
if (ret > 0) {
/* we need an acl */
ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+ } else {
+ cache_no_acl(inode);
}
+ } else {
+ cache_no_acl(inode);
}
failed:
posix_acl_release(acl);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bcec0675023..a383c18e74e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
#include "delayed-ref.h"
#include "locking.h"
+struct extent_inode_elem {
+ u64 inum;
+ u64 offset;
+ struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+ struct btrfs_file_extent_item *fi,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 data_offset;
+ u64 data_len;
+ struct extent_inode_elem *e;
+
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ return 1;
+
+ e = kmalloc(sizeof(*e), GFP_NOFS);
+ if (!e)
+ return -ENOMEM;
+
+ e->next = *eie;
+ e->inum = key->objectid;
+ e->offset = key->offset + (extent_item_pos - data_offset);
+ *eie = e;
+
+ return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+ u64 extent_item_pos,
+ struct extent_inode_elem **eie)
+{
+ u64 disk_byte;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ int slot;
+ int nritems;
+ int extent_type;
+ int ret;
+
+ /*
+ * from the shared data ref, we only have the leaf but we need
+ * the key. thus, we must look into all items and see that we
+ * find one (some) with a reference to our extent item.
+ */
+ nritems = btrfs_header_nritems(eb);
+ for (slot = 0; slot < nritems; ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte != wanted_disk_byte)
+ continue;
+
+ ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* this structure records all encountered refs on the way up to the root
*/
struct __prelim_ref {
struct list_head list;
u64 root_id;
- struct btrfs_key key;
+ struct btrfs_key key_for_search;
int level;
int count;
+ struct extent_inode_elem *inode_list;
u64 parent;
u64 wanted_disk_byte;
};
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ * block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | - | -
+ * key to resolve | - | y | y | y
+ * tree block logical | - | - | - | -
+ * root for resolving | y | y | y | y
+ *
+ * - column 1: we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ * backref type | shared | indirect | shared | indirect
+ * information | tree | tree | data | data
+ * --------------------+--------+----------+--------+----------
+ * parent logical | y | - | y | -
+ * key to resolve | - | - | - | y
+ * tree block logical | y | y | y | y
+ * root for resolving | - | y | y | y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2: we take the first key from the block to find the parent
+ * (see __add_missing_keys)
+ * - column 4: we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
static int __add_prelim_ref(struct list_head *head, u64 root_id,
- struct btrfs_key *key, int level, u64 parent,
- u64 wanted_disk_byte, int count)
+ struct btrfs_key *key, int level,
+ u64 parent, u64 wanted_disk_byte, int count)
{
struct __prelim_ref *ref;
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
ref->root_id = root_id;
if (key)
- ref->key = *key;
+ ref->key_for_search = *key;
else
- memset(&ref->key, 0, sizeof(ref->key));
+ memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+ ref->inode_list = NULL;
ref->level = level;
ref->count = count;
ref->parent = parent;
@@ -64,52 +178,75 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
}
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
- struct ulist *parents,
- struct extent_buffer *eb, int level,
- u64 wanted_objectid, u64 wanted_disk_byte)
+ struct ulist *parents, int level,
+ struct btrfs_key *key_for_search, u64 time_seq,
+ u64 wanted_disk_byte,
+ const u64 *extent_item_pos)
{
- int ret;
+ int ret = 0;
int slot;
- struct btrfs_file_extent_item *fi;
+ struct extent_buffer *eb;
struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ struct extent_inode_elem *eie = NULL;
u64 disk_byte;
-add_parent:
- ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
- if (ret < 0)
- return ret;
-
- if (level != 0)
+ if (level != 0) {
+ eb = path->nodes[level];
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
return 0;
+ }
/*
- * if the current leaf is full with EXTENT_DATA items, we must
- * check the next one if that holds a reference as well.
- * ref->count cannot be used to skip this check.
- * repeat this until we don't find any additional EXTENT_DATA items.
+ * We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot==nritems. In that case, go to the next leaf before we continue.
*/
- while (1) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ while (!ret) {
eb = path->nodes[0];
- for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.objectid != wanted_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
- fi = btrfs_item_ptr(eb, slot,
- struct btrfs_file_extent_item);
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte == wanted_disk_byte)
- goto add_parent;
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+
+ if (key.objectid != key_for_search->objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+ if (disk_byte == wanted_disk_byte) {
+ eie = NULL;
+ if (extent_item_pos) {
+ ret = check_extent_in_eb(&key, eb, fi,
+ *extent_item_pos,
+ &eie);
+ if (ret < 0)
+ break;
+ }
+ if (!ret) {
+ ret = ulist_add(parents, eb->start,
+ (unsigned long)eie, GFP_NOFS);
+ if (ret < 0)
+ break;
+ if (!extent_item_pos) {
+ ret = btrfs_next_old_leaf(root, path,
+ time_seq);
+ continue;
+ }
+ }
}
+ ret = btrfs_next_old_item(root, path, time_seq);
}
- return 0;
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -118,13 +255,14 @@ add_parent:
*/
static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int search_commit_root,
+ u64 time_seq,
struct __prelim_ref *ref,
- struct ulist *parents)
+ struct ulist *parents,
+ const u64 *extent_item_pos)
{
struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_key root_key;
- struct btrfs_key key = {0};
struct extent_buffer *eb;
int ret = 0;
int root_level;
@@ -152,36 +290,30 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
path->lowest_level = level;
- ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
pr_debug("search slot in root %llu (level %d, ref count %d) returned "
"%d for key (%llu %u %llu)\n",
(unsigned long long)ref->root_id, level, ref->count, ret,
- (unsigned long long)ref->key.objectid, ref->key.type,
- (unsigned long long)ref->key.offset);
+ (unsigned long long)ref->key_for_search.objectid,
+ ref->key_for_search.type,
+ (unsigned long long)ref->key_for_search.offset);
if (ret < 0)
goto out;
eb = path->nodes[level];
- if (!eb) {
- WARN_ON(1);
- ret = 1;
- goto out;
- }
-
- if (level == 0) {
- if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- goto out;
- eb = path->nodes[0];
+ while (!eb) {
+ if (!level) {
+ WARN_ON(1);
+ ret = 1;
+ goto out;
}
-
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+ level--;
+ eb = path->nodes[level];
}
- /* the last two parameters will only be used for level == 0 */
- ret = add_all_parents(root, path, parents, eb, level, key.objectid,
- ref->wanted_disk_byte);
+ ret = add_all_parents(root, path, parents, level, &ref->key_for_search,
+ time_seq, ref->wanted_disk_byte,
+ extent_item_pos);
out:
btrfs_free_path(path);
return ret;
@@ -191,8 +323,9 @@ out:
* resolve all indirect backrefs from the list
*/
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
- int search_commit_root,
- struct list_head *head)
+ int search_commit_root, u64 time_seq,
+ struct list_head *head,
+ const u64 *extent_item_pos)
{
int err;
int ret = 0;
@@ -201,6 +334,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct __prelim_ref *new_ref;
struct ulist *parents;
struct ulist_node *node;
+ struct ulist_iterator uiter;
parents = ulist_alloc(GFP_NOFS);
if (!parents)
@@ -217,7 +351,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
if (ref->count == 0)
continue;
err = __resolve_indirect_ref(fs_info, search_commit_root,
- ref, parents);
+ time_seq, ref, parents,
+ extent_item_pos);
if (err) {
if (ret == 0)
ret = err;
@@ -225,11 +360,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
/* we put the first parent into the ref at hand */
- node = ulist_next(parents, NULL);
+ ULIST_ITER_INIT(&uiter);
+ node = ulist_next(parents, &uiter);
ref->parent = node ? node->val : 0;
+ ref->inode_list =
+ node ? (struct extent_inode_elem *)node->aux : 0;
/* additional parents require new refs being added here */
- while ((node = ulist_next(parents, node))) {
+ while ((node = ulist_next(parents, &uiter))) {
new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
if (!new_ref) {
ret = -ENOMEM;
@@ -237,6 +375,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
}
memcpy(new_ref, ref, sizeof(*ref));
new_ref->parent = node->val;
+ new_ref->inode_list =
+ (struct extent_inode_elem *)node->aux;
list_add(&new_ref->list, &ref->list);
}
ulist_reinit(parents);
@@ -246,10 +386,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
return ret;
}
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+ struct __prelim_ref *ref2)
+{
+ if (ref1->level != ref2->level)
+ return 0;
+ if (ref1->root_id != ref2->root_id)
+ return 0;
+ if (ref1->key_for_search.type != ref2->key_for_search.type)
+ return 0;
+ if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+ return 0;
+ if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+ return 0;
+ if (ref1->parent != ref2->parent)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+ struct list_head *head)
+{
+ struct list_head *pos;
+ struct extent_buffer *eb;
+
+ list_for_each(pos, head) {
+ struct __prelim_ref *ref;
+ ref = list_entry(pos, struct __prelim_ref, list);
+
+ if (ref->parent)
+ continue;
+ if (ref->key_for_search.type)
+ continue;
+ BUG_ON(!ref->wanted_disk_byte);
+ eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+ fs_info->tree_root->leafsize, 0);
+ BUG_ON(!eb);
+ btrfs_tree_read_lock(eb);
+ if (btrfs_header_level(eb) == 0)
+ btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+ else
+ btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return 0;
+}
+
/*
* merge two lists of backrefs and adjust counts accordingly
*
* mode = 1: merge identical keys, if key is set
+ * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ * additionally, we could even add a key range for the blocks we
+ * looked into to merge even more (-> replace unresolved refs by those
+ * having a parent).
* mode = 2: merge identical parents
*/
static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +458,21 @@ static int __merge_refs(struct list_head *head, int mode)
ref1 = list_entry(pos1, struct __prelim_ref, list);
- if (mode == 1 && ref1->key.type == 0)
- continue;
for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
pos2 = n2, n2 = pos2->next) {
struct __prelim_ref *ref2;
+ struct __prelim_ref *xchg;
ref2 = list_entry(pos2, struct __prelim_ref, list);
if (mode == 1) {
- if (memcmp(&ref1->key, &ref2->key,
- sizeof(ref1->key)) ||
- ref1->level != ref2->level ||
- ref1->root_id != ref2->root_id)
+ if (!ref_for_same_block(ref1, ref2))
continue;
+ if (!ref1->parent && ref2->parent) {
+ xchg = ref1;
+ ref1 = ref2;
+ ref2 = xchg;
+ }
ref1->count += ref2->count;
} else {
if (ref1->parent != ref2->parent)
@@ -296,16 +492,17 @@ static int __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct btrfs_key *info_key,
struct list_head *prefs)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
+ struct btrfs_key key;
+ struct btrfs_key op_key = {0};
int sgn;
int ret = 0;
if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+ btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
while ((n = rb_prev(n))) {
struct btrfs_delayed_ref_node *node;
@@ -337,7 +534,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ret = __add_prelim_ref(prefs, ref->root, &op_key,
ref->level + 1, 0, node->bytenr,
node->ref_mod * sgn);
break;
@@ -346,7 +543,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ret = __add_prelim_ref(prefs, ref->root, NULL,
ref->level + 1, ref->parent,
node->bytenr,
node->ref_mod * sgn);
@@ -354,8 +551,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
}
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key key;
-
ref = btrfs_delayed_node_to_data_ref(node);
key.objectid = ref->objectid;
@@ -368,7 +563,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
}
case BTRFS_SHARED_DATA_REF_KEY: {
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key key;
ref = btrfs_delayed_node_to_data_ref(node);
@@ -394,8 +588,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
*/
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- struct btrfs_key *info_key, int *info_level,
- struct list_head *prefs)
+ int *info_level, struct list_head *prefs)
{
int ret = 0;
int slot;
@@ -411,7 +604,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
* enumerate all inline refs
*/
leaf = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
item_size = btrfs_item_size_nr(leaf, slot);
BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +617,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
- struct btrfs_disk_key disk_key;
info = (struct btrfs_tree_block_info *)ptr;
*info_level = btrfs_tree_block_level(leaf, info);
- btrfs_tree_block_key(leaf, info, &disk_key);
- btrfs_disk_key_to_cpu(info_key, &disk_key);
ptr += sizeof(struct btrfs_tree_block_info);
BUG_ON(ptr > end);
} else {
@@ -447,7 +637,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
switch (type) {
case BTRFS_SHARED_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, 0, info_key,
+ ret = __add_prelim_ref(prefs, 0, NULL,
*info_level + 1, offset,
bytenr, 1);
break;
@@ -462,8 +652,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, offset, info_key,
- *info_level + 1, 0, bytenr, 1);
+ ret = __add_prelim_ref(prefs, offset, NULL,
+ *info_level + 1, 0,
+ bytenr, 1);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref;
@@ -477,8 +668,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
root = btrfs_extent_data_ref_root(leaf, dref);
- ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
- count);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count);
break;
}
default:
@@ -496,8 +687,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
*/
static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- struct btrfs_key *info_key, int info_level,
- struct list_head *prefs)
+ int info_level, struct list_head *prefs)
{
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
@@ -527,7 +717,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
switch (key.type) {
case BTRFS_SHARED_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, 0, info_key,
+ ret = __add_prelim_ref(prefs, 0, NULL,
info_level + 1, key.offset,
bytenr, 1);
break;
@@ -543,8 +733,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
break;
}
case BTRFS_TREE_BLOCK_REF_KEY:
- ret = __add_prelim_ref(prefs, key.offset, info_key,
- info_level + 1, 0, bytenr, 1);
+ ret = __add_prelim_ref(prefs, key.offset, NULL,
+ info_level + 1, 0,
+ bytenr, 1);
break;
case BTRFS_EXTENT_DATA_REF_KEY: {
struct btrfs_extent_data_ref *dref;
@@ -560,7 +751,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
root = btrfs_extent_data_ref_root(leaf, dref);
ret = __add_prelim_ref(prefs, root, &key, 0, 0,
- bytenr, count);
+ bytenr, count);
break;
}
default:
@@ -582,11 +773,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 seq, struct ulist *refs, struct ulist *roots)
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist *refs, struct ulist *roots,
+ const u64 *extent_item_pos)
{
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_key info_key = { 0 };
struct btrfs_delayed_ref_root *delayed_refs = NULL;
struct btrfs_delayed_ref_head *head;
int info_level = 0;
@@ -645,8 +837,9 @@ again:
btrfs_put_delayed_ref(&head->node);
goto again;
}
- ret = __add_delayed_refs(head, seq, &info_key,
+ ret = __add_delayed_refs(head, delayed_ref_seq,
&prefs_delayed);
+ mutex_unlock(&head->mutex);
if (ret) {
spin_unlock(&delayed_refs->lock);
goto out;
@@ -659,16 +852,17 @@ again:
struct extent_buffer *leaf;
int slot;
+ path->slots[0]--;
leaf = path->nodes[0];
- slot = path->slots[0] - 1;
+ slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
ret = __add_inline_refs(fs_info, path, bytenr,
- &info_key, &info_level, &prefs);
+ &info_level, &prefs);
if (ret)
goto out;
- ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+ ret = __add_keyed_refs(fs_info, path, bytenr,
info_level, &prefs);
if (ret)
goto out;
@@ -676,21 +870,18 @@ again:
}
btrfs_release_path(path);
- /*
- * when adding the delayed refs above, the info_key might not have
- * been known yet. Go over the list and replace the missing keys
- */
- list_for_each_entry(ref, &prefs_delayed, list) {
- if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
- memcpy(&ref->key, &info_key, sizeof(ref->key));
- }
list_splice_init(&prefs_delayed, &prefs);
+ ret = __add_missing_keys(fs_info, &prefs);
+ if (ret)
+ goto out;
+
ret = __merge_refs(&prefs, 1);
if (ret)
goto out;
- ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+ ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+ &prefs, extent_item_pos);
if (ret)
goto out;
@@ -709,15 +900,39 @@ again:
BUG_ON(ret < 0);
}
if (ref->count && ref->parent) {
- ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+ struct extent_inode_elem *eie = NULL;
+ if (extent_item_pos && !ref->inode_list) {
+ u32 bsz;
+ struct extent_buffer *eb;
+ bsz = btrfs_level_size(fs_info->extent_root,
+ info_level);
+ eb = read_tree_block(fs_info->extent_root,
+ ref->parent, bsz, 0);
+ BUG_ON(!eb);
+ ret = find_extent_in_eb(eb, bytenr,
+ *extent_item_pos, &eie);
+ ref->inode_list = eie;
+ free_extent_buffer(eb);
+ }
+ ret = ulist_add_merge(refs, ref->parent,
+ (unsigned long)ref->inode_list,
+ (unsigned long *)&eie, GFP_NOFS);
+ if (!ret && extent_item_pos) {
+ /*
+ * we've recorded that parent, so we must extend
+ * its inode list here
+ */
+ BUG_ON(!eie);
+ while (eie->next)
+ eie = eie->next;
+ eie->next = ref->inode_list;
+ }
BUG_ON(ret < 0);
}
kfree(ref);
}
out:
- if (head)
- mutex_unlock(&head->mutex);
btrfs_free_path(path);
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
@@ -734,6 +949,28 @@ out:
return ret;
}
+static void free_leaf_list(struct ulist *blocks)
+{
+ struct ulist_node *node = NULL;
+ struct extent_inode_elem *eie;
+ struct extent_inode_elem *eie_next;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(blocks, &uiter))) {
+ if (!node->aux)
+ continue;
+ eie = (struct extent_inode_elem *)node->aux;
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+ node->aux = 0;
+ }
+
+ ulist_free(blocks);
+}
+
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +981,9 @@ out:
*/
static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 seq, struct ulist **leafs)
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **leafs,
+ const u64 *extent_item_pos)
{
struct ulist *tmp;
int ret;
@@ -758,11 +997,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+ ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ time_seq, *leafs, tmp, extent_item_pos);
ulist_free(tmp);
if (ret < 0 && ret != -ENOENT) {
- ulist_free(*leafs);
+ free_leaf_list(*leafs);
return ret;
}
@@ -784,10 +1024,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*/
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 seq, struct ulist **roots)
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
+ struct ulist_iterator uiter;
int ret;
tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1041,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
+ ULIST_ITER_INIT(&uiter);
while (1) {
- ret = find_parent_nodes(trans, fs_info, bytenr, seq,
- tmp, *roots);
+ ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+ time_seq, tmp, *roots, NULL);
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
return ret;
}
- node = ulist_next(tmp, node);
+ node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
@@ -1093,67 +1336,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 0;
}
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
- u64 orig_extent_item_objectid,
- u64 extent_item_pos, u64 root,
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+ u64 root, u64 extent_item_objectid,
iterate_extent_inodes_t *iterate, void *ctx)
{
- u64 disk_byte;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *eb;
- int slot;
- int nritems;
+ struct extent_inode_elem *eie;
int ret = 0;
- int extent_type;
- u64 data_offset;
- u64 data_len;
-
- eb = read_tree_block(fs_info->tree_root, logical,
- fs_info->tree_root->leafsize, 0);
- if (!eb)
- return -EIO;
-
- /*
- * from the shared data ref, we only have the leaf but we need
- * the key. thus, we must look into all items and see that we
- * find one (some) with a reference to our extent item.
- */
- nritems = btrfs_header_nritems(eb);
- for (slot = 0; slot < nritems; ++slot) {
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(eb, fi);
- if (extent_type == BTRFS_FILE_EXTENT_INLINE)
- continue;
- /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
- disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != orig_extent_item_objectid)
- continue;
-
- data_offset = btrfs_file_extent_offset(eb, fi);
- data_len = btrfs_file_extent_num_bytes(eb, fi);
-
- if (extent_item_pos < data_offset ||
- extent_item_pos >= data_offset + data_len)
- continue;
+ for (eie = inode_list; eie; eie = eie->next) {
pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
- "root %llu\n", orig_extent_item_objectid,
- key.objectid, key.offset, root);
- ret = iterate(key.objectid,
- key.offset + (extent_item_pos - data_offset),
- root, ctx);
+ "root %llu\n", extent_item_objectid,
+ eie->inum, eie->offset, root);
+ ret = iterate(eie->inum, eie->offset, root, ctx);
if (ret) {
- pr_debug("stopping iteration because ret=%d\n", ret);
+ pr_debug("stopping iteration for %llu due to ret=%d\n",
+ extent_item_objectid, ret);
break;
}
}
- free_extent_buffer(eb);
-
return ret;
}
@@ -1175,7 +1376,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list seq_elem;
+ struct seq_list seq_elem = {};
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist_iterator ref_uiter;
+ struct ulist_iterator root_uiter;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1396,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs->lock);
btrfs_get_delayed_seq(delayed_refs, &seq_elem);
spin_unlock(&delayed_refs->lock);
+ btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- extent_item_pos, seq_elem.seq,
- &refs);
-
+ seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+ &extent_item_pos);
if (ret)
goto out;
- while (!ret && (ref_node = ulist_next(refs, ref_node))) {
- ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
- seq_elem.seq, &roots);
+ ULIST_ITER_INIT(&ref_uiter);
+ while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+ ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ seq_elem.seq,
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
- while (!ret && (root_node = ulist_next(roots, root_node))) {
- pr_debug("root %llu references leaf %llu\n",
- root_node->val, ref_node->val);
- ret = iterate_leaf_refs(fs_info, ref_node->val,
- extent_item_objectid,
- extent_item_pos, root_node->val,
- iterate, ctx);
+ ULIST_ITER_INIT(&root_uiter);
+ while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+ pr_debug("root %llu references leaf %llu, data list "
+ "%#lx\n", root_node->val, ref_node->val,
+ ref_node->aux);
+ ret = iterate_leaf_refs(
+ (struct extent_inode_elem *)ref_node->aux,
+ root_node->val, extent_item_objectid,
+ iterate, ctx);
}
+ ulist_free(roots);
+ roots = NULL;
}
- ulist_free(refs);
+ free_leaf_list(refs);
ulist_free(roots);
out:
if (!search_commit_root) {
+ btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_put_delayed_seq(delayed_refs, &seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 57ea2e959e4..c18d8ac7b79 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 seq, struct ulist **roots);
+ u64 delayed_ref_seq, u64 time_seq,
+ struct ulist **roots);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15fd520..12394a90d60 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,21 @@
#include "ordered-data.h"
#include "delayed-inode.h"
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
+#define BTRFS_INODE_ORPHAN_META_RESERVED 1
+#define BTRFS_INODE_DUMMY 2
+#define BTRFS_INODE_IN_DEFRAG 3
+#define BTRFS_INODE_DELALLOC_META_RESERVED 4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT 6
+
/* in memory btrfs inode */
struct btrfs_inode {
/* which subvolume this inode belongs to */
@@ -57,9 +72,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
- /* for keeping track of orphaned inodes */
- struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
* to walk them all.
@@ -78,14 +90,13 @@ struct btrfs_inode {
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
+ unsigned long runtime_flags;
+
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
u64 generation;
- /* sequence number for NFS changes */
- u64 sequence;
-
/*
* transid of the trans_handle that last modified this inode
*/
@@ -145,22 +156,9 @@ struct btrfs_inode {
unsigned reserved_extents;
/*
- * ordered_data_close is set by truncate when a file that used
- * to have good data has been truncated to zero. When it is set
- * the btrfs file release call will add this inode to the
- * ordered operations list so that we make sure to flush out any
- * new data the application may have written before commit.
- */
- unsigned ordered_data_close:1;
- unsigned orphan_meta_reserved:1;
- unsigned dummy_inode:1;
- unsigned in_defrag:1;
- unsigned delalloc_meta_reserved:1;
-
- /*
* always compress this one file
*/
- unsigned force_compress:4;
+ unsigned force_compress;
struct btrfs_delayed_node *delayed_node;
@@ -202,4 +200,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
return false;
}
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == generation &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index c053e90f200..da6e9364a5e 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -93,6 +93,7 @@
#include "print-tree.h"
#include "locking.h"
#include "check-integrity.h"
+#include "rcu-string.h"
#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -103,8 +104,6 @@
#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
* excluding " [...]" */
-#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
-
#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
/*
@@ -210,8 +209,9 @@ struct btrfsic_block_data_ctx {
u64 dev_bytenr; /* physical bytenr on device */
u32 len;
struct btrfsic_dev_state *dev;
- char *data;
- struct buffer_head *bh; /* do not use if set to NULL */
+ char **datav;
+ struct page **pagev;
+ void *mem_to_free;
};
/* This structure is used to implement recursion without occupying
@@ -243,6 +243,8 @@ struct btrfsic_state {
struct btrfs_root *root;
u64 max_superblock_generation;
struct btrfsic_block *latest_superblock;
+ u32 metablock_size;
+ u32 datablock_size;
};
static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +292,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
static int btrfsic_process_metablock(struct btrfsic_state *state,
struct btrfsic_block *block,
struct btrfsic_block_data_ctx *block_ctx,
- struct btrfs_header *hdr,
int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dst, u32 offset, size_t len);
static int btrfsic_create_link_to_next_block(
struct btrfsic_state *state,
struct btrfsic_block *block,
@@ -318,12 +322,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- const u8 *data, unsigned int size);
+ char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, u8 *mapped_data,
- unsigned int len, struct bio *bio,
- int *bio_is_patched,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
struct buffer_head *bh,
int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
@@ -375,7 +380,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
u64 bytenr,
struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char *data);
+ u64 dev_bytenr);
static struct mutex btrfsic_mutex;
static int btrfsic_is_initialized;
@@ -651,7 +656,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int pass;
BUG_ON(NULL == state);
- selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+ selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
return -1;
@@ -718,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -727,9 +732,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfsic_block *next_block;
struct btrfsic_block_data_ctx tmp_next_block_ctx;
struct btrfsic_block_link *l;
- struct btrfs_header *hdr;
- ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
&tmp_next_block_ctx,
mirror_num);
if (ret) {
@@ -758,7 +763,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
BUG_ON(NULL == l);
ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ if (ret < (int)PAGE_CACHE_SIZE) {
printk(KERN_INFO
"btrfsic: read @logical %llu failed!\n",
(unsigned long long)
@@ -768,11 +773,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
return -1;
}
- hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
ret = btrfsic_process_metablock(state,
next_block,
&tmp_next_block_ctx,
- hdr,
BTRFS_MAX_LEVEL + 3, 1);
btrfsic_release_block_ctx(&tmp_next_block_ctx);
}
@@ -799,7 +802,10 @@ static int btrfsic_process_superblock_dev_mirror(
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+ if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+ return -1;
+ bh = __bread(superblock_bdev, dev_bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
if (NULL == bh)
return -1;
super_tmp = (struct btrfs_super_block *)
@@ -808,7 +814,10 @@ static int btrfsic_process_superblock_dev_mirror(
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
sizeof(super_tmp->magic)) ||
- memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+ btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+ btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+ btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
brelse(bh);
return 0;
}
@@ -835,13 +844,14 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- printk(KERN_INFO "New initial S-block (bdev %p, %s)"
- " @%llu (%s/%llu/%d)\n",
- superblock_bdev, device->name,
- (unsigned long long)dev_bytenr,
- dev_state->name,
- (unsigned long long)dev_bytenr,
- superblock_mirror_num);
+ printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev,
+ rcu_str_deref(device->name),
+ (unsigned long long)dev_bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
btrfsic_block_hashtable_add(superblock_tmp,
@@ -893,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -902,7 +912,8 @@ static int btrfsic_process_superblock_dev_mirror(
struct btrfsic_block_data_ctx tmp_next_block_ctx;
struct btrfsic_block_link *l;
- if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ if (btrfsic_map_block(state, next_bytenr,
+ state->metablock_size,
&tmp_next_block_ctx,
mirror_num)) {
printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +977,15 @@ static int btrfsic_process_metablock(
struct btrfsic_state *state,
struct btrfsic_block *const first_block,
struct btrfsic_block_data_ctx *const first_block_ctx,
- struct btrfs_header *const first_hdr,
int first_limit_nesting, int force_iodone_flag)
{
struct btrfsic_stack_frame initial_stack_frame = { 0 };
struct btrfsic_stack_frame *sf;
struct btrfsic_stack_frame *next_stack;
+ struct btrfs_header *const first_hdr =
+ (struct btrfs_header *)first_block_ctx->datav[0];
+ BUG_ON(!first_hdr);
sf = &initial_stack_frame;
sf->error = 0;
sf->i = -1;
@@ -1012,21 +1025,47 @@ continue_with_current_leaf_stack_frame:
}
if (sf->i < sf->nr) {
- struct btrfs_item *disk_item = leafhdr->items + sf->i;
- struct btrfs_disk_key *disk_key = &disk_item->key;
+ struct btrfs_item disk_item;
+ u32 disk_item_offset =
+ (uintptr_t)(leafhdr->items + sf->i) -
+ (uintptr_t)leafhdr;
+ struct btrfs_disk_key *disk_key;
u8 type;
- const u32 item_offset = le32_to_cpu(disk_item->offset);
+ u32 item_offset;
+ if (disk_item_offset + sizeof(struct btrfs_item) >
+ sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+ printk(KERN_INFO
+ "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(sf->block_ctx,
+ &disk_item,
+ disk_item_offset,
+ sizeof(struct btrfs_item));
+ item_offset = le32_to_cpu(disk_item.offset);
+ disk_key = &disk_item.key;
type = disk_key->type;
if (BTRFS_ROOT_ITEM_KEY == type) {
- const struct btrfs_root_item *const root_item =
- (struct btrfs_root_item *)
- (sf->block_ctx->data +
- offsetof(struct btrfs_leaf, items) +
- item_offset);
- const u64 next_bytenr =
- le64_to_cpu(root_item->bytenr);
+ struct btrfs_root_item root_item;
+ u32 root_item_offset;
+ u64 next_bytenr;
+
+ root_item_offset = item_offset +
+ offsetof(struct btrfs_leaf, items);
+ if (root_item_offset +
+ sizeof(struct btrfs_root_item) >
+ sf->block_ctx->len)
+ goto leaf_item_out_of_bounce_error;
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &root_item,
+ root_item_offset,
+ sizeof(struct btrfs_root_item));
+ next_bytenr = le64_to_cpu(root_item.bytenr);
sf->error =
btrfsic_create_link_to_next_block(
@@ -1041,7 +1080,7 @@ continue_with_current_leaf_stack_frame:
&sf->num_copies,
&sf->mirror_num,
disk_key,
- le64_to_cpu(root_item->
+ le64_to_cpu(root_item.
generation));
if (sf->error)
goto one_stack_frame_backwards;
@@ -1049,7 +1088,7 @@ continue_with_current_leaf_stack_frame:
if (NULL != sf->next_block) {
struct btrfs_header *const next_hdr =
(struct btrfs_header *)
- sf->next_block_ctx.data;
+ sf->next_block_ctx.datav[0];
next_stack =
btrfsic_stack_frame_alloc();
@@ -1111,10 +1150,24 @@ continue_with_current_node_stack_frame:
}
if (sf->i < sf->nr) {
- struct btrfs_key_ptr *disk_key_ptr =
- nodehdr->ptrs + sf->i;
- const u64 next_bytenr =
- le64_to_cpu(disk_key_ptr->blockptr);
+ struct btrfs_key_ptr key_ptr;
+ u32 key_ptr_offset;
+ u64 next_bytenr;
+
+ key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+ (uintptr_t)nodehdr;
+ if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+ sf->block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ sf->block_ctx->start,
+ sf->block_ctx->dev->name);
+ goto one_stack_frame_backwards;
+ }
+ btrfsic_read_from_block_data(
+ sf->block_ctx, &key_ptr, key_ptr_offset,
+ sizeof(struct btrfs_key_ptr));
+ next_bytenr = le64_to_cpu(key_ptr.blockptr);
sf->error = btrfsic_create_link_to_next_block(
state,
@@ -1127,15 +1180,15 @@ continue_with_current_node_stack_frame:
force_iodone_flag,
&sf->num_copies,
&sf->mirror_num,
- &disk_key_ptr->key,
- le64_to_cpu(disk_key_ptr->generation));
+ &key_ptr.key,
+ le64_to_cpu(key_ptr.generation));
if (sf->error)
goto one_stack_frame_backwards;
if (NULL != sf->next_block) {
struct btrfs_header *const next_hdr =
(struct btrfs_header *)
- sf->next_block_ctx.data;
+ sf->next_block_ctx.datav[0];
next_stack = btrfsic_stack_frame_alloc();
if (NULL == next_stack)
@@ -1181,6 +1234,35 @@ one_stack_frame_backwards:
return sf->error;
}
+static void btrfsic_read_from_block_data(
+ struct btrfsic_block_data_ctx *block_ctx,
+ void *dstv, u32 offset, size_t len)
+{
+ size_t cur;
+ size_t offset_in_page;
+ char *kaddr;
+ char *dst = (char *)dstv;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+
+ WARN_ON(offset + len > block_ctx->len);
+ offset_in_page = (start_offset + offset) &
+ ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
+ BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT);
+ kaddr = block_ctx->datav[i];
+ memcpy(dst, kaddr + offset_in_page, cur);
+
+ dst += cur;
+ len -= cur;
+ offset_in_page = 0;
+ i++;
+ }
+}
+
static int btrfsic_create_link_to_next_block(
struct btrfsic_state *state,
struct btrfsic_block *block,
@@ -1204,7 +1286,7 @@ static int btrfsic_create_link_to_next_block(
if (0 == *num_copiesp) {
*num_copiesp =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1301,7 @@ static int btrfsic_create_link_to_next_block(
"btrfsic_create_link_to_next_block(mirror_num=%d)\n",
*mirror_nump);
ret = btrfsic_map_block(state, next_bytenr,
- BTRFSIC_BLOCK_SIZE,
+ state->metablock_size,
next_block_ctx, *mirror_nump);
if (ret) {
printk(KERN_INFO
@@ -1314,7 +1396,7 @@ static int btrfsic_create_link_to_next_block(
if (limit_nesting > 0 && did_alloc_block_link) {
ret = btrfsic_read_block(state, next_block_ctx);
- if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ if (ret < (int)next_block_ctx->len) {
printk(KERN_INFO
"btrfsic: read block @logical %llu failed!\n",
(unsigned long long)next_bytenr);
@@ -1339,43 +1421,74 @@ static int btrfsic_handle_extent_data(
u32 item_offset, int force_iodone_flag)
{
int ret;
- struct btrfs_file_extent_item *file_extent_item =
- (struct btrfs_file_extent_item *)(block_ctx->data +
- offsetof(struct btrfs_leaf,
- items) + item_offset);
- u64 next_bytenr =
- le64_to_cpu(file_extent_item->disk_bytenr) +
- le64_to_cpu(file_extent_item->offset);
- u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
- u64 generation = le64_to_cpu(file_extent_item->generation);
+ struct btrfs_file_extent_item file_extent_item;
+ u64 file_extent_item_offset;
+ u64 next_bytenr;
+ u64 num_bytes;
+ u64 generation;
struct btrfsic_block_link *l;
+ file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+ item_offset;
+ if (file_extent_item_offset +
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+ ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+ file_extent_item.type,
+ (unsigned long long)
+ le64_to_cpu(file_extent_item.disk_bytenr));
+ return 0;
+ }
+
+ if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+ block_ctx->len) {
+ printk(KERN_INFO
+ "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+ block_ctx->start, block_ctx->dev->name);
+ return -1;
+ }
+ btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+ file_extent_item_offset,
+ sizeof(struct btrfs_file_extent_item));
+ next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
+ le64_to_cpu(file_extent_item.offset);
+ generation = le64_to_cpu(file_extent_item.generation);
+ num_bytes = le64_to_cpu(file_extent_item.num_bytes);
+ generation = le64_to_cpu(file_extent_item.generation);
+
if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
" offset = %llu, num_bytes = %llu\n",
- file_extent_item->type,
- (unsigned long long)
- le64_to_cpu(file_extent_item->disk_bytenr),
+ file_extent_item.type,
(unsigned long long)
- le64_to_cpu(file_extent_item->offset),
- (unsigned long long)
- le64_to_cpu(file_extent_item->num_bytes));
- if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
- ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
- return 0;
+ le64_to_cpu(file_extent_item.disk_bytenr),
+ (unsigned long long)le64_to_cpu(file_extent_item.offset),
+ (unsigned long long)num_bytes);
while (num_bytes > 0) {
u32 chunk_len;
int num_copies;
int mirror_num;
- if (num_bytes > BTRFSIC_BLOCK_SIZE)
- chunk_len = BTRFSIC_BLOCK_SIZE;
+ if (num_bytes > state->datablock_size)
+ chunk_len = state->datablock_size;
else
chunk_len = num_bytes;
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, state->datablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1588,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
block_ctx_out->dev_bytenr = multi->stripes[0].physical;
block_ctx_out->start = bytenr;
block_ctx_out->len = len;
- block_ctx_out->data = NULL;
- block_ctx_out->bh = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
if (0 == ret)
kfree(multi);
@@ -1496,8 +1610,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
block_ctx_out->dev_bytenr = bytenr;
block_ctx_out->start = bytenr;
block_ctx_out->len = len;
- block_ctx_out->data = NULL;
- block_ctx_out->bh = NULL;
+ block_ctx_out->datav = NULL;
+ block_ctx_out->pagev = NULL;
+ block_ctx_out->mem_to_free = NULL;
if (NULL != block_ctx_out->dev) {
return 0;
} else {
@@ -1508,38 +1623,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
{
- if (NULL != block_ctx->bh) {
- brelse(block_ctx->bh);
- block_ctx->bh = NULL;
+ if (block_ctx->mem_to_free) {
+ unsigned int num_pages;
+
+ BUG_ON(!block_ctx->datav);
+ BUG_ON(!block_ctx->pagev);
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ while (num_pages > 0) {
+ num_pages--;
+ if (block_ctx->datav[num_pages]) {
+ kunmap(block_ctx->pagev[num_pages]);
+ block_ctx->datav[num_pages] = NULL;
+ }
+ if (block_ctx->pagev[num_pages]) {
+ __free_page(block_ctx->pagev[num_pages]);
+ block_ctx->pagev[num_pages] = NULL;
+ }
+ }
+
+ kfree(block_ctx->mem_to_free);
+ block_ctx->mem_to_free = NULL;
+ block_ctx->pagev = NULL;
+ block_ctx->datav = NULL;
}
}
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx)
{
- block_ctx->bh = NULL;
- if (block_ctx->dev_bytenr & 4095) {
+ unsigned int num_pages;
+ unsigned int i;
+ u64 dev_bytenr;
+ int ret;
+
+ BUG_ON(block_ctx->datav);
+ BUG_ON(block_ctx->pagev);
+ BUG_ON(block_ctx->mem_to_free);
+ if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: read_block() with unaligned bytenr %llu\n",
(unsigned long long)block_ctx->dev_bytenr);
return -1;
}
- if (block_ctx->len > 4096) {
- printk(KERN_INFO
- "btrfsic: read_block() with too huge size %d\n",
- block_ctx->len);
+
+ num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+ sizeof(*block_ctx->pagev)) *
+ num_pages, GFP_NOFS);
+ if (!block_ctx->mem_to_free)
return -1;
+ block_ctx->datav = block_ctx->mem_to_free;
+ block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+ if (!block_ctx->pagev[i])
+ return -1;
}
- block_ctx->bh = __bread(block_ctx->dev->bdev,
- block_ctx->dev_bytenr >> 12, 4096);
- if (NULL == block_ctx->bh)
- return -1;
- block_ctx->data = block_ctx->bh->b_data;
+ dev_bytenr = block_ctx->dev_bytenr;
+ for (i = 0; i < num_pages;) {
+ struct bio *bio;
+ unsigned int j;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ bio = bio_alloc(GFP_NOFS, num_pages - i);
+ if (!bio) {
+ printk(KERN_INFO
+ "btrfsic: bio_alloc() for %u pages failed!\n",
+ num_pages - i);
+ return -1;
+ }
+ bio->bi_bdev = block_ctx->dev->bdev;
+ bio->bi_sector = dev_bytenr >> 9;
+ bio->bi_end_io = btrfsic_complete_bio_end_io;
+ bio->bi_private = &complete;
+
+ for (j = i; j < num_pages; j++) {
+ ret = bio_add_page(bio, block_ctx->pagev[j],
+ PAGE_CACHE_SIZE, 0);
+ if (PAGE_CACHE_SIZE != ret)
+ break;
+ }
+ if (j == i) {
+ printk(KERN_INFO
+ "btrfsic: error, failed to add a single page!\n");
+ return -1;
+ }
+ submit_bio(READ, bio);
+
+ /* this will also unplug the queue */
+ wait_for_completion(&complete);
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk(KERN_INFO
+ "btrfsic: read error at logical %llu dev %s!\n",
+ block_ctx->start, block_ctx->dev->name);
+ bio_put(bio);
+ return -1;
+ }
+ bio_put(bio);
+ dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ i = j;
+ }
+ for (i = 0; i < num_pages; i++) {
+ block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ if (!block_ctx->datav[i]) {
+ printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+ block_ctx->dev->name);
+ return -1;
+ }
+ }
return block_ctx->len;
}
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
static void btrfsic_dump_database(struct btrfsic_state *state)
{
struct list_head *elem_all;
@@ -1617,32 +1821,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
* (note that this test fails for the super block)
*/
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- const u8 *data, unsigned int size)
+ char **datav, unsigned int num_pages)
{
struct btrfs_header *h;
u8 csum[BTRFS_CSUM_SIZE];
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
+ unsigned int i;
- h = (struct btrfs_header *)data;
+ if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ return 1; /* not metadata */
+ num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ h = (struct btrfs_header *)datav[0];
if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
- fail++;
+ return 1;
+
+ for (i = 0; i < num_pages; i++) {
+ u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_CACHE_SIZE :
+ (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
- crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ crc = crc32c(crc, data, sublen);
+ }
btrfs_csum_final(crc, csum);
if (memcmp(csum, h->csum, state->csum_size))
- crc_fail++;
+ return 1;
- return fail || crc_fail;
+ return 0; /* is metadata */
}
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr,
- u8 *mapped_data, unsigned int len,
- struct bio *bio,
- int *bio_is_patched,
+ u64 dev_bytenr, char **mapped_datav,
+ unsigned int num_pages,
+ struct bio *bio, int *bio_is_patched,
struct buffer_head *bh,
int submit_bio_bh_rw)
{
@@ -1652,12 +1863,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
int ret;
struct btrfsic_state *state = dev_state->state;
struct block_device *bdev = dev_state->bdev;
+ unsigned int processed_len;
- WARN_ON(len > PAGE_SIZE);
- is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
if (NULL != bio_is_patched)
*bio_is_patched = 0;
+again:
+ if (num_pages == 0)
+ return;
+
+ processed_len = 0;
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+ num_pages));
+
block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
&state->block_hashtable);
if (NULL != block) {
@@ -1667,8 +1885,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
if (block->is_superblock) {
bytenr = le64_to_cpu(((struct btrfs_super_block *)
- mapped_data)->bytenr);
+ mapped_datav[0])->bytenr);
+ if (num_pages * PAGE_CACHE_SIZE <
+ BTRFS_SUPER_INFO_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
is_metadata = 1;
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ processed_len = BTRFS_SUPER_INFO_SIZE;
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
printk(KERN_INFO
@@ -1678,12 +1904,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
}
if (is_metadata) {
if (!block->is_superblock) {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->metablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->metablock_size;
bytenr = le64_to_cpu(((struct btrfs_header *)
- mapped_data)->bytenr);
+ mapped_datav[0])->bytenr);
btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
dev_state,
- dev_bytenr,
- mapped_data);
+ dev_bytenr);
}
if (block->logical_bytenr != bytenr) {
printk(KERN_INFO
@@ -1710,6 +1942,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block->mirror_num,
btrfsic_get_block_type(state, block));
} else {
+ if (num_pages * PAGE_CACHE_SIZE <
+ state->datablock_size) {
+ printk(KERN_INFO
+ "btrfsic: cannot work with too short bios!\n");
+ return;
+ }
+ processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
@@ -1747,7 +1986,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
le64_to_cpu(block->disk_key.offset),
(unsigned long long)
le64_to_cpu(((struct btrfs_header *)
- mapped_data)->generation),
+ mapped_datav[0])->generation),
(unsigned long long)
state->max_superblock_generation);
btrfsic_dump_tree(state);
@@ -1765,10 +2004,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
(unsigned long long)block->generation,
(unsigned long long)
le64_to_cpu(((struct btrfs_header *)
- mapped_data)->generation));
+ mapped_datav[0])->generation));
/* it would not be safe to go on */
btrfsic_dump_tree(state);
- return;
+ goto continue_loop;
}
/*
@@ -1796,18 +2035,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
}
if (block->is_superblock)
- ret = btrfsic_map_superblock(state, bytenr, len,
+ ret = btrfsic_map_superblock(state, bytenr,
+ processed_len,
bdev, &block_ctx);
else
- ret = btrfsic_map_block(state, bytenr, len,
+ ret = btrfsic_map_block(state, bytenr, processed_len,
&block_ctx, 0);
if (ret) {
printk(KERN_INFO
"btrfsic: btrfsic_map_block(root @%llu)"
" failed!\n", (unsigned long long)bytenr);
- return;
+ goto continue_loop;
}
- block_ctx.data = mapped_data;
+ block_ctx.datav = mapped_datav;
/* the following is required in case of writes to mirrors,
* use the same that was used for the lookup */
block_ctx.dev = dev_state;
@@ -1863,11 +2103,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
block->logical_bytenr = bytenr;
block->is_metadata = 1;
if (block->is_superblock) {
+ BUG_ON(PAGE_CACHE_SIZE !=
+ BTRFS_SUPER_INFO_SIZE);
ret = btrfsic_process_written_superblock(
state,
block,
(struct btrfs_super_block *)
- mapped_data);
+ mapped_datav[0]);
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
printk(KERN_INFO
@@ -1880,8 +2122,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
state,
block,
&block_ctx,
- (struct btrfs_header *)
- block_ctx.data,
0, 0);
}
if (ret)
@@ -1912,26 +2152,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 bytenr;
if (!is_metadata) {
+ processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO "Written block (%s/%llu/?)"
" !found in hash table, D.\n",
dev_state->name,
(unsigned long long)dev_bytenr);
- if (!state->include_extent_data)
- return; /* ignore that written D block */
+ if (!state->include_extent_data) {
+ /* ignore that written D block */
+ goto continue_loop;
+ }
/* this is getting ugly for the
* include_extent_data case... */
bytenr = 0; /* unknown */
block_ctx.start = bytenr;
- block_ctx.len = len;
- block_ctx.bh = NULL;
+ block_ctx.len = processed_len;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.pagev = NULL;
} else {
+ processed_len = state->metablock_size;
bytenr = le64_to_cpu(((struct btrfs_header *)
- mapped_data)->bytenr);
+ mapped_datav[0])->bytenr);
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
- dev_bytenr,
- mapped_data);
+ dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2184,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
dev_state->name,
(unsigned long long)dev_bytenr);
- ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
- 0);
+ ret = btrfsic_map_block(state, bytenr, processed_len,
+ &block_ctx, 0);
if (ret) {
printk(KERN_INFO
"btrfsic: btrfsic_map_block(root @%llu)"
" failed!\n",
(unsigned long long)dev_bytenr);
- return;
+ goto continue_loop;
}
}
- block_ctx.data = mapped_data;
+ block_ctx.datav = mapped_datav;
/* the following is required in case of writes to mirrors,
* use the same that was used for the lookup */
block_ctx.dev = dev_state;
@@ -1960,7 +2204,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
if (NULL == block) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
- return;
+ goto continue_loop;
}
block->dev_state = dev_state;
block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2264,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
if (is_metadata) {
ret = btrfsic_process_metablock(state, block,
- &block_ctx,
- (struct btrfs_header *)
- block_ctx.data, 0, 0);
+ &block_ctx, 0, 0);
if (ret)
printk(KERN_INFO
"btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2273,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
}
btrfsic_release_block_ctx(&block_ctx);
}
+
+continue_loop:
+ BUG_ON(!processed_len);
+ dev_bytenr += processed_len;
+ mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+ num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ goto again;
}
static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2462,7 @@ static int btrfsic_process_written_superblock(
num_copies =
btrfs_num_copies(&state->root->fs_info->mapping_tree,
- next_bytenr, PAGE_SIZE);
+ next_bytenr, BTRFS_SUPER_INFO_SIZE);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
(unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2473,8 @@ static int btrfsic_process_written_superblock(
printk(KERN_INFO
"btrfsic_process_written_superblock("
"mirror_num=%d)\n", mirror_num);
- ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE,
&tmp_next_block_ctx,
mirror_num);
if (ret) {
@@ -2689,7 +2939,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
u64 bytenr,
struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char *data)
+ u64 dev_bytenr)
{
int num_copies;
int mirror_num;
@@ -2698,10 +2948,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
int match = 0;
num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
- bytenr, PAGE_SIZE);
+ bytenr, state->metablock_size);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, bytenr, state->metablock_size,
&block_ctx, mirror_num);
if (ret) {
printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2977,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
(unsigned long long)bytenr, dev_state->name,
(unsigned long long)dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ ret = btrfsic_map_block(state, bytenr,
+ state->metablock_size,
&block_ctx, mirror_num);
if (ret)
continue;
@@ -2781,13 +3032,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
(unsigned long)bh->b_size, bh->b_data,
bh->b_bdev);
btrfsic_process_written_block(dev_state, dev_bytenr,
- bh->b_data, bh->b_size, NULL,
+ &bh->b_data, 1, NULL,
NULL, bh, rw);
} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+ "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
rw, bh->b_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
@@ -2836,6 +3087,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
unsigned int i;
u64 dev_bytenr;
int bio_is_patched;
+ char **mapped_datav;
dev_bytenr = 512 * bio->bi_sector;
bio_is_patched = 0;
@@ -2848,35 +3100,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
(unsigned long long)dev_bytenr,
bio->bi_bdev);
+ mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+ GFP_NOFS);
+ if (!mapped_datav)
+ goto leave;
for (i = 0; i < bio->bi_vcnt; i++) {
- u8 *mapped_data;
-
- mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+ if (!mapped_datav[i]) {
+ while (i > 0) {
+ i--;
+ kunmap(bio->bi_io_vec[i].bv_page);
+ }
+ kfree(mapped_datav);
+ goto leave;
+ }
if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE) ==
(dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
printk(KERN_INFO
- "#%u: page=%p, mapped=%p, len=%u,"
- " offset=%u\n",
+ "#%u: page=%p, len=%u, offset=%u\n",
i, bio->bi_io_vec[i].bv_page,
- mapped_data,
bio->bi_io_vec[i].bv_len,
bio->bi_io_vec[i].bv_offset);
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_data,
- bio->bi_io_vec[i].bv_len,
- bio, &bio_is_patched,
- NULL, rw);
+ }
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_datav, bio->bi_vcnt,
+ bio, &bio_is_patched,
+ NULL, rw);
+ while (i > 0) {
+ i--;
kunmap(bio->bi_io_vec[i].bv_page);
- dev_bytenr += bio->bi_io_vec[i].bv_len;
}
+ kfree(mapped_datav);
} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
printk(KERN_INFO
- "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+ "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
rw, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
@@ -2903,6 +3166,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
bio->bi_end_io = btrfsic_bio_end_io;
}
}
+leave:
mutex_unlock(&btrfsic_mutex);
submit_bio(rw, bio);
@@ -2917,6 +3181,30 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
+ if (root->nodesize != root->leafsize) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+ root->nodesize, root->leafsize);
+ return -1;
+ }
+ if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
+ if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+ root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+ return -1;
+ }
state = kzalloc(sizeof(*state), GFP_NOFS);
if (NULL == state) {
printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3221,8 @@ int btrfsic_mount(struct btrfs_root *root,
state->print_mask = print_mask;
state->include_extent_data = including_extent_data;
state->csum_size = 0;
+ state->metablock_size = root->nodesize;
+ state->datablock_size = root->sectorsize;
INIT_LIST_HEAD(&state->all_blocks_list);
btrfsic_block_hashtable_init(&state->block_hashtable);
btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3339,7 @@ void btrfsic_unmount(struct btrfs_root *root,
btrfsic_block_link_free(l);
}
- if (b_all->is_iodone)
+ if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
printk(KERN_INFO "btrfs: attempt to free %c-block"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4106264fbc6..8206b390058 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot);
+ struct btrfs_path *path, int level, int slot,
+ int tree_mod_log);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb);
+struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
+ u32 blocksize, u64 parent_transid,
+ u64 time_seq);
+struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
+ u64 bytenr, u32 blocksize,
+ u64 time_seq);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
new_root_objectid, &disk_key, level,
- buf->start, 0, 1);
+ buf->start, 0);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -288,6 +298,449 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return 0;
}
+enum mod_log_op {
+ MOD_LOG_KEY_REPLACE,
+ MOD_LOG_KEY_ADD,
+ MOD_LOG_KEY_REMOVE,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ MOD_LOG_MOVE_KEYS,
+ MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+ int dst_slot;
+ int nr_items;
+};
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 index; /* shifted logical */
+ struct seq_list elem;
+ enum mod_log_op op;
+
+ /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+ int slot;
+
+ /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+ u64 generation;
+
+ /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* this is used for op == MOD_LOG_MOVE_KEYS */
+ struct tree_mod_move move;
+
+ /* this is used for op == MOD_LOG_ROOT_REPLACE */
+ struct tree_mod_root old_root;
+};
+
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+ elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+}
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ elem->flags = 1;
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ __get_tree_mod_seq(fs_info, elem);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct seq_list *cur_elem;
+ struct tree_mod_elem *tm;
+ u64 min_seq = (u64)-1;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ BUG_ON(!(elem->flags & 1));
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ list_del(&elem->list);
+
+ list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+ if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+ if (seq_putting > cur_elem->seq) {
+ /*
+ * blocker with lower sequence number exists, we
+ * cannot remove anything from the log
+ */
+ goto out;
+ }
+ min_seq = cur_elem->seq;
+ }
+ }
+
+ /*
+ * anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = container_of(node, struct tree_mod_elem, node);
+ if (tm->elem.seq > min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ list_del(&tm->elem.list);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+out:
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ * index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+ int ret = 0;
+
+ BUG_ON(!tm || !tm->elem.seq);
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = container_of(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->index < tm->index)
+ new = &((*new)->rb_left);
+ else if (cur->index > tm->index)
+ new = &((*new)->rb_right);
+ else if (cur->elem.seq < tm->elem.seq)
+ new = &((*new)->rb_left);
+ else if (cur->elem.seq > tm->elem.seq)
+ new = &((*new)->rb_right);
+ else {
+ kfree(tm);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+unlock:
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return ret;
+}
+
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb) {
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 1;
+ if (!eb)
+ return 0;
+ if (btrfs_header_level(eb) == 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * This allocates memory and gets a tree modification sequence number when
+ * needed.
+ *
+ * Returns 0 when no sequence number is needed, < 0 on error.
+ * Returns 1 when a sequence number was added. In this case,
+ * fs_info->tree_mod_seq_lock was acquired and must be released by the caller
+ * after inserting into the rb tree.
+ */
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+ struct tree_mod_elem **tm_ret)
+{
+ struct tree_mod_elem *tm;
+ int seq;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ return 0;
+
+ tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return -ENOMEM;
+
+ tm->elem.flags = 0;
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ /*
+ * someone emptied the list while we were waiting for the lock.
+ * we must not add to the list, because no blocker exists. items
+ * are removed from the list only when the existing blocker is
+ * removed from the list.
+ */
+ kfree(tm);
+ seq = 0;
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ } else {
+ __get_tree_mod_seq(fs_info, &tm->elem);
+ seq = tm->elem.seq;
+ }
+
+ return seq;
+}
+
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ if (op != MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ int slot, enum mod_log_op op)
+{
+ return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int dst_slot, int src_slot,
+ int nr_items, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+ int i;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return 0;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING);
+ BUG_ON(ret < 0);
+ }
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = MOD_LOG_MOVE_KEYS;
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *old_root,
+ struct extent_buffer *new_root, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ ret = tree_mod_alloc(fs_info, flags, &tm);
+ if (ret <= 0)
+ return ret;
+
+ tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = MOD_LOG_ROOT_REPLACE;
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+ return ret;
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+ int smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+ u64 index = start >> PAGE_CACHE_SHIFT;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = container_of(node, struct tree_mod_elem, node);
+ if (cur->index < index) {
+ node = node->rb_left;
+ } else if (cur->index > index) {
+ node = node->rb_right;
+ } else if (cur->elem.seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* we want the node with the highest seq */
+ if (found)
+ BUG_ON(found->elem.seq > cur->elem.seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->elem.seq > min_seq) {
+ /* we want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->elem.seq < cur->elem.seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+ u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static inline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ struct extent_buffer *src, unsigned long dst_offset,
+ unsigned long src_offset, int nr_items)
+{
+ int ret;
+ int i;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ return;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return;
+
+ /* speed this up by single seq for all operations? */
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
+ ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
+ MOD_LOG_KEY_ADD);
+ BUG_ON(ret < 0);
+ }
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+ int dst_offset, int src_offset, int nr_items)
+{
+ int ret;
+ ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+ nr_items, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static inline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_disk_key *disk_key, int slot, int atomic)
+{
+ int ret;
+
+ ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
+ MOD_LOG_KEY_REPLACE,
+ atomic ? GFP_ATOMIC : GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ int i;
+ int ret;
+ u32 nritems;
+
+ if (tree_mod_dont_log(fs_info, eb))
+ return;
+
+ nritems = btrfs_header_nritems(eb);
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert_key(fs_info, eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+ BUG_ON(ret < 0);
+ }
+}
+
+static inline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+ struct extent_buffer *new_root_node)
+{
+ int ret;
+ tree_mod_log_free_eb(root->fs_info, root->node);
+ ret = tree_mod_log_insert_root(root->fs_info, root->node,
+ new_root_node, GFP_NOFS);
+ BUG_ON(ret < 0);
+}
+
/*
* check if the tree block can be shared by multiple trees
*/
@@ -409,6 +862,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1, 1);
BUG_ON(ret); /* -ENOMEM */
}
+ /*
+ * don't log freeing in case we're freeing the root node, this
+ * is done by tree_mod_log_set_root_pointer later
+ */
+ if (buf != root->node && btrfs_header_level(buf) != 0)
+ tree_mod_log_free_eb(root->fs_info, buf);
clean_tree_block(trans, root, buf);
*last_ref = 1;
}
@@ -467,7 +926,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
root->root_key.objectid, &disk_key,
- level, search_start, empty_size, 1);
+ level, search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -506,10 +965,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
extent_buffer_get(cow);
+ tree_mod_log_set_root_pointer(root, cow);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref, 1);
+ last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -519,13 +979,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = 0;
WARN_ON(trans->transid != btrfs_header_generation(parent));
+ tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+ MOD_LOG_KEY_REPLACE);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref, 1);
+ last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -535,6 +997,229 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = root->node->start;
+ int looped = 0;
+
+ if (!time_seq)
+ return 0;
+
+ /*
+ * the very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). this has the index of the *new*
+ * root, making it the very first operation that's logged for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return 0;
+ /*
+ * if there are no tree operation for the oldest root, we simply
+ * return it. this should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * if there's an operation that's not a root replacement, we
+ * found the oldest version of our root. normally, we'll find a
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ BUG_ON(root_logical == root->node->start);
+ looped = 1;
+ }
+
+ /* if there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewinded (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ while (tm && tm->elem.seq >= time_seq) {
+ /*
+ * all the operations are recorded with the operator used for
+ * the modification. as we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case MOD_LOG_ROOT_REPLACE:
+ /*
+ * this operation is special. for roots, this must be
+ * handled explicitly before rewinding.
+ * for non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. in the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. we simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = container_of(next, struct tree_mod_elem, node);
+ if (tm->index != first_tm->index)
+ break;
+ }
+ btrfs_set_header_nritems(eb, n);
+}
+
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(eb->start,
+ fs_info->tree_root->nodesize);
+ BUG_ON(!eb_rewin);
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ BUG_ON(!eb_rewin);
+ }
+
+ extent_buffer_get(eb_rewin);
+ free_extent_buffer(eb);
+
+ __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+
+ return eb_rewin;
+}
+
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->root_node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+
+ eb = btrfs_read_lock_root_node(root);
+ tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+ if (!tm)
+ return root->node;
+
+ if (tm->op == MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ } else {
+ logical = root->node->start;
+ }
+
+ tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+ if (old_root)
+ eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ else
+ eb = btrfs_clone_extent_buffer(root->node);
+ btrfs_tree_read_unlock(root->node);
+ free_extent_buffer(root->node);
+ if (!eb)
+ return NULL;
+ btrfs_tree_read_lock(eb);
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, root->root_key.objectid);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ if (tm)
+ __tree_mod_log_rewind(eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ extent_buffer_get(eb);
+
+ return eb;
+}
+
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -739,7 +1424,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur)
return -EIO;
} else if (!uptodate) {
- btrfs_read_buffer(cur, gen);
+ err = btrfs_read_buffer(cur, gen);
+ if (err) {
+ free_extent_buffer(cur);
+ return err;
+ }
}
}
if (search_start == 0)
@@ -854,20 +1543,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot)
{
- if (level == 0) {
+ if (level == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item),
key, btrfs_header_nritems(eb),
slot);
- } else {
+ else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
sizeof(struct btrfs_key_ptr),
key, btrfs_header_nritems(eb),
slot);
- }
- return -1;
}
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1661,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
+ tree_mod_log_set_root_pointer(root, child);
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -987,7 +1675,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -996,8 +1684,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- btrfs_header_nritems(mid);
-
left = read_node_slot(root, parent, pslot - 1);
if (left) {
btrfs_tree_lock(left);
@@ -1027,7 +1713,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
wret = push_node_left(trans, root, left, mid, 1);
if (wret < 0)
ret = wret;
- btrfs_header_nritems(mid);
}
/*
@@ -1040,14 +1725,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(right) == 0) {
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- del_ptr(trans, root, path, level + 1, pslot + 1);
+ del_ptr(trans, root, path, level + 1, pslot + 1, 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &right_key, pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -1082,15 +1769,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (btrfs_header_nritems(mid) == 0) {
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- del_ptr(trans, root, path, level + 1, pslot);
+ del_ptr(trans, root, path, level + 1, pslot, 1);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+ pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -1188,6 +1877,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &disk_key, pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1930,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
+ tree_mod_log_set_node_key(root->fs_info, parent,
+ &disk_key, pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1496,7 +2189,7 @@ static int
read_block_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer **eb_ret, int level, int slot,
- struct btrfs_key *key)
+ struct btrfs_key *key, u64 time_seq)
{
u64 blocknr;
u64 gen;
@@ -1850,7 +2543,7 @@ cow_done:
}
err = read_block_for_search(trans, root, p,
- &b, level, slot, key);
+ &b, level, slot, key, 0);
if (err == -EAGAIN)
goto again;
if (err) {
@@ -1922,6 +2615,113 @@ done:
}
/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq)
+{
+ struct extent_buffer *b;
+ int slot;
+ int ret;
+ int err;
+ int level;
+ int lowest_unlock = 1;
+ u8 lowest_level = 0;
+
+ lowest_level = p->lowest_level;
+ WARN_ON(p->nodes[0] != NULL);
+
+ if (p->search_commit_root) {
+ BUG_ON(time_seq);
+ return btrfs_search_slot(NULL, root, key, p, 0, 0);
+ }
+
+again:
+ b = get_old_root(root, time_seq);
+ level = btrfs_header_level(b);
+ p->locks[level] = BTRFS_READ_LOCK;
+
+ while (b) {
+ level = btrfs_header_level(b);
+ p->nodes[level] = b;
+ btrfs_clear_path_blocking(p, NULL, 0);
+
+ /*
+ * we have a lock on b and as long as we aren't changing
+ * the tree, there is no way to for the items in b to change.
+ * It is safe to drop the lock on our parent before we
+ * go through the expensive btree search on b.
+ */
+ btrfs_unlock_up_safe(p, level + 1);
+
+ ret = bin_search(b, key, level, &slot);
+
+ if (level != 0) {
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
+ slot -= 1;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(NULL, root, p, &b, level,
+ slot, key, time_seq);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ err = btrfs_try_tree_read_lock(b);
+ if (!err) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ btrfs_clear_path_blocking(p, b,
+ BTRFS_READ_LOCK);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
+ b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+ if (b != p->nodes[level]) {
+ btrfs_tree_unlock_rw(p->nodes[level],
+ p->locks[level]);
+ p->locks[level] = 0;
+ p->nodes[level] = b;
+ }
+ } else {
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
+ }
+ ret = 1;
+done:
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
+ if (ret < 0)
+ btrfs_release_path(p);
+
+ return ret;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -1941,6 +2741,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
if (!path->nodes[i])
break;
t = path->nodes[i];
+ tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -2023,12 +2824,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
+ tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
+ src_nritems - push_items);
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
@@ -2082,11 +2887,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
+ tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ src_nritems - push_items, push_items);
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2937,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid, &lower_key,
- level, root->node->start, 0, 0);
+ level, root->node->start, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2161,6 +2969,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
+ tree_mod_log_set_root_pointer(root, c);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -2188,6 +2997,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
{
struct extent_buffer *lower;
int nritems;
+ int ret;
BUG_ON(!path->nodes[level]);
btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +3006,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
if (slot != nritems) {
+ if (level)
+ tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+ slot, nritems - slot);
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
+ if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+ MOD_LOG_KEY_ADD);
+ BUG_ON(ret < 0);
+ }
btrfs_set_node_key(lower, key, slot);
btrfs_set_node_blockptr(lower, slot, bytenr);
WARN_ON(trans->transid == 0);
@@ -2252,7 +3070,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid,
- &disk_key, level, c->start, 0, 0);
+ &disk_key, level, c->start, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -2271,7 +3089,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
-
+ tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
@@ -3004,7 +3822,7 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
- &disk_key, 0, l->start, 0, 0);
+ &disk_key, 0, l->start, 0);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -3749,19 +4567,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
* empty a node.
*/
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct btrfs_path *path, int level, int slot)
+ struct btrfs_path *path, int level, int slot,
+ int tree_mod_log)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
+ int ret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
+ if (tree_mod_log && level)
+ tree_mod_log_eb_move(root->fs_info, parent, slot,
+ slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
+ } else if (tree_mod_log && level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
}
+
nritems--;
btrfs_set_header_nritems(parent, nritems);
if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4621,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid);
- del_ptr(trans, root, path, 1, path->slots[1]);
+ del_ptr(trans, root, path, 1, path->slots[1], 1);
/*
* btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4632,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
extent_buffer_get(leaf);
- btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4202,6 +5030,12 @@ next:
*/
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq)
+{
int slot;
int level;
struct extent_buffer *c;
@@ -4226,7 +5060,10 @@ again:
path->keep_locks = 1;
path->leave_spinning = 1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (time_seq)
+ ret = btrfs_search_old_slot(root, &key, path, time_seq);
+ else
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->keep_locks = 0;
if (ret < 0)
@@ -4271,7 +5108,7 @@ again:
next = c;
next_rw_lock = path->locks[level];
ret = read_block_for_search(NULL, root, path, &next, level,
- slot, &key);
+ slot, &key, 0);
if (ret == -EAGAIN)
goto again;
@@ -4282,6 +5119,18 @@ again:
if (!path->skip_locking) {
ret = btrfs_try_tree_read_lock(next);
+ if (!ret && time_seq) {
+ /*
+ * If we don't get the lock, we may be racing
+ * with push_leaf_left, holding that lock while
+ * itself waiting for the leaf we've currently
+ * locked. To solve this situation, we give up
+ * on our lock and cycle.
+ */
+ btrfs_release_path(path);
+ cond_resched();
+ goto again;
+ }
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
@@ -4308,7 +5157,7 @@ again:
break;
ret = read_block_for_search(NULL, root, path, &next, level,
- 0, &key);
+ 0, &key, 0);
if (ret == -EAGAIN)
goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd72331d60..fa5c45b3907 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_FT_XATTR 8
#define BTRFS_FT_MAX 9
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
u8 csum;
} __attribute__ ((__packed__));
+struct btrfs_dev_stats_item {
+ /*
+ * grow this item struct at the end for future enhancements and keep
+ * the existing values unchanged
+ */
+ __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
+ /* this protects tree_mod_seq_list */
+ spinlock_t tree_mod_seq_lock;
+ atomic_t tree_mod_seq;
+ struct list_head tree_mod_seq_list;
+
+ /* this protects tree_mod_log */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
+
atomic_t nr_async_submits;
atomic_t async_submit_draining;
atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
struct list_head root_list;
spinlock_t orphan_lock;
- struct list_head orphan_list;
+ atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
@@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_BALANCE_ITEM_KEY 248
/*
+ * Persistantly stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY 249
+
+/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index)
+{
+ u64 val;
+
+ read_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+ return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+ struct btrfs_dev_stats_item *ptr,
+ int index, u64 val)
+{
+ write_extent_buffer(eb, &val,
+ offsetof(struct btrfs_dev_stats_item, values) +
+ ((unsigned long)ptr) + (index * sizeof(u64)),
+ sizeof(val));
+}
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size, int for_cow);
+ u64 hint, u64 empty_size);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref, int for_cow);
+ u64 parent, int last_ref);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *p, u64 time_seq);
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, int cache_only, u64 *last_ret,
@@ -2701,13 +2753,20 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+ u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+ struct btrfs_path *p, u64 time_seq)
{
++p->slots[0];
if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
- return btrfs_next_leaf(root, p);
+ return btrfs_next_old_leaf(root, p, time_seq);
return 0;
}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ return btrfs_next_old_item(root, p, 0);
+}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
@@ -2922,7 +2981,6 @@ int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -3098,4 +3156,23 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
+/* delayed seq elem */
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+ u32 flags;
+};
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct seq_list *elem);
+
+static inline int is_fstree(u64 rootid)
+{
+ if (rootid == BTRFS_FS_TREE_OBJECTID ||
+ (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+ return 0;
+}
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 03e3748d84d..2399f408691 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
- if (BTRFS_I(inode)->delalloc_meta_reserved) {
- BTRFS_I(inode)->delalloc_meta_reserved = 0;
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
spin_unlock(&BTRFS_I(inode)->lock);
release = true;
goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
btrfs_set_stack_inode_generation(inode_item,
BTRFS_I(inode)->generation);
- btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+ btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+ inode->i_version = btrfs_stack_inode_sequence(inode_item);
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
@@ -1879,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
}
}
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node) {
+ __btrfs_kill_delayed_node(curr_node);
+
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+}
+
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 7083d08b2a2..f5aa4023d3e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev);
/* Used for drop dead root */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
/* Used for readdir() */
void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
struct list_head *del_list);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 69f22e3ab3b..13ae7b04790 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (need_ref_seq(for_cow, ref_root))
+ if (is_fstree(ref_root))
seq = inc_delayed_seq(delayed_refs);
ref->seq = seq;
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
- if (need_ref_seq(for_cow, ref_root))
+ if (is_fstree(ref_root))
seq = inc_delayed_seq(delayed_refs);
ref->seq = seq;
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
for_cow);
- if (!need_ref_seq(for_cow, ref_root) &&
+ if (!is_fstree(ref_root) &&
waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
action, for_cow);
- if (!need_ref_seq(for_cow, ref_root) &&
+ if (!is_fstree(ref_root) &&
waitqueue_active(&delayed_refs->seq_wait))
wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
+
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d8f244d9492..413927fb995 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
-struct seq_list {
- struct list_head list;
- u64 seq;
-};
-
static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
{
assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
u64 seq);
/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
- if (for_cow)
- return 0;
-
- if (rootid == BTRFS_FS_TREE_OBJECTID)
- return 1;
-
- if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
- return 1;
-
- return 0;
-}
-
-/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
*/
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88a7db..2936ca49b3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,7 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "check-integrity.h"
+#include "rcu-string.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -1153,7 +1154,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
- INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1166,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+ atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
@@ -1252,7 +1253,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
BTRFS_TREE_LOG_OBJECTID, NULL,
- 0, 0, 0, 0);
+ 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1914,11 +1915,14 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->free_chunk_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv);
btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1935,14 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
fs_info->free_chunk_space = 0;
+ fs_info->tree_mod_log = RB_ROOT;
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2007,8 @@ int open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
- BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2112,7 +2119,7 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
/*
@@ -2347,12 +2354,24 @@ retry_root_backup:
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
goto recovery_tree_root;
-
csum_root->track_dirty = 1;
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
+ ret = btrfs_recover_balance(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to recover balance\n");
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_init_dev_stats(fs_info);
+ if (ret) {
+ printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+ ret);
+ goto fail_block_groups;
+ }
+
ret = btrfs_init_space_info(fs_info);
if (ret) {
printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2471,20 +2490,23 @@ retry_root_backup:
goto fail_trans_kthread;
}
- if (!(sb->s_flags & MS_RDONLY)) {
- down_read(&fs_info->cleanup_work_sem);
- err = btrfs_orphan_cleanup(fs_info->fs_root);
- if (!err)
- err = btrfs_orphan_cleanup(fs_info->tree_root);
- up_read(&fs_info->cleanup_work_sem);
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
- if (!err)
- err = btrfs_recover_balance(fs_info->tree_root);
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ close_ctree(tree_root);
+ return ret;
+ }
+ up_read(&fs_info->cleanup_work_sem);
- if (err) {
- close_ctree(tree_root);
- return err;
- }
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret) {
+ printk(KERN_WARNING "btrfs: failed to resume balance\n");
+ close_ctree(tree_root);
+ return ret;
}
return 0;
@@ -2556,18 +2578,20 @@ recovery_tree_root:
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- printk_ratelimited(KERN_WARNING "lost page write due to "
- "I/O error on %s\n",
- bdevname(bh->b_bdev, b));
+ struct btrfs_device *device = (struct btrfs_device *)
+ bh->b_private;
+
+ printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+ "I/O error on %s\n",
+ rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
clear_buffer_uptodate(bh);
+ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
}
unlock_buffer(bh);
put_bh(bh);
@@ -2682,6 +2706,7 @@ static int write_dev_supers(struct btrfs_device *device,
set_buffer_uptodate(bh);
lock_buffer(bh);
bh->b_end_io = btrfs_end_buffer_write_sync;
+ bh->b_private = device;
}
/*
@@ -2734,12 +2759,15 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk("btrfs: disabling barriers on dev %s\n",
- device->name);
+ printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+ rcu_str_deref(device->name));
device->nobarriers = 1;
}
if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
+ if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
}
/* drop the reference from the wait == 0 run */
@@ -2753,7 +2781,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
* one reference for us, and we leave it for the
* caller
*/
- device->flush_bio = NULL;;
+ device->flush_bio = NULL;
bio = bio_alloc(GFP_NOFS, 0);
if (!bio)
return -ENOMEM;
@@ -2902,19 +2930,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
return ret;
}
-/* Kill all outstanding I/O */
-void btrfs_abort_devices(struct btrfs_root *root)
-{
- struct list_head *head;
- struct btrfs_device *dev;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- head = &root->fs_info->fs_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- blk_abort_queue(dev->bdev->bd_disk->queue);
- }
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-}
-
void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3395,7 +3410,6 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs = &trans->delayed_refs;
-again:
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
spin_unlock(&delayed_refs->lock);
@@ -3403,31 +3417,37 @@ again:
return ret;
}
- node = rb_first(&delayed_refs->root);
- while (node) {
+ while ((node = rb_first(&delayed_refs->root)) != NULL) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
atomic_set(&ref->refs, 1);
if (btrfs_delayed_ref_is_head(ref)) {
struct btrfs_delayed_ref_head *head;
head = btrfs_delayed_node_to_head(ref);
- spin_unlock(&delayed_refs->lock);
- mutex_lock(&head->mutex);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ /* Need to wait for the delayed ref to run */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
kfree(head->extent_op);
delayed_refs->num_heads--;
if (list_empty(&head->cluster))
delayed_refs->num_heads_ready--;
list_del_init(&head->cluster);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- goto again;
}
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
spin_unlock(&delayed_refs->lock);
btrfs_put_delayed_ref(ref);
@@ -3515,11 +3535,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
&(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
offset >> PAGE_CACHE_SHIFT);
spin_unlock(&dirty_pages->buffer_lock);
- if (eb) {
+ if (eb)
ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
&eb->bflags);
- atomic_set(&eb->refs, 1);
- }
if (PageWriteback(page))
end_page_writeback(page);
@@ -3533,8 +3551,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
spin_unlock_irq(&page->mapping->tree_lock);
}
- page->mapping->a_ops->invalidatepage(page, 0);
unlock_page(page);
+ page_cache_release(page);
}
}
@@ -3548,8 +3566,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
u64 start;
u64 end;
int ret;
+ bool loop = true;
unpin = pinned_extents;
+again:
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -3567,6 +3587,15 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
cond_resched();
}
+ if (loop) {
+ if (unpin == &root->fs_info->freed_extents[0])
+ unpin = &root->fs_info->freed_extents[1];
+ else
+ unpin = &root->fs_info->freed_extents[0];
+ loop = false;
+ goto again;
+ }
+
return 0;
}
@@ -3580,21 +3609,23 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
/* FIXME: cleanup wait for commit */
cur_trans->in_commit = 1;
cur_trans->blocked = 1;
- if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
- wake_up(&root->fs_info->transaction_blocked_wait);
+ wake_up(&root->fs_info->transaction_blocked_wait);
cur_trans->blocked = 0;
- if (waitqueue_active(&root->fs_info->transaction_wait))
- wake_up(&root->fs_info->transaction_wait);
+ wake_up(&root->fs_info->transaction_wait);
cur_trans->commit_done = 1;
- if (waitqueue_active(&cur_trans->commit_wait))
- wake_up(&cur_trans->commit_wait);
+ wake_up(&cur_trans->commit_wait);
+
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
btrfs_destroy_pending_snapshots(cur_trans);
btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
EXTENT_DIRTY);
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
/*
memset(cur_trans, 0, sizeof(*cur_trans));
@@ -3643,6 +3674,9 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
if (waitqueue_active(&t->commit_wait))
wake_up(&t->commit_wait);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+
btrfs_destroy_pending_snapshots(t);
btrfs_destroy_delalloc_inodes(root);
@@ -3671,17 +3705,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
return 0;
}
-static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state)
-{
- struct super_block *sb = page->mapping->host->i_sb;
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- btrfs_error(fs_info, -EIO,
- "Error occured while writing out btree at %llu", start);
- return -EIO;
-}
-
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3712,4 @@ static struct extent_io_ops btree_extent_io_ops = {
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,
- .writepage_io_failed_hook = btree_writepage_io_failed_hook,
};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ab1830aaf0e..05b3fab39f7 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
int btrfs_cleanup_transaction(struct btrfs_root *root);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e887ee62b6d..614f34a899c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -13,15 +13,14 @@
parent_root_objectid) / 4)
#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
- int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
struct btrfs_fid *fid = (struct btrfs_fid *)fh;
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type;
- if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
*max_len = BTRFS_FID_SIZE_CONNECTABLE;
return 255;
} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
+ if (parent) {
u64 parent_root_id;
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
parent_root_id = BTRFS_I(parent)->root->objectid;
- spin_unlock(&dentry->d_lock);
-
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49fd7b66d57..6e1d36702ff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2347,12 +2347,10 @@ next:
return count;
}
-
static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
- unsigned long num_refs)
+ unsigned long num_refs,
+ struct list_head *first_seq)
{
- struct list_head *first_seq = delayed_refs->seq_head.next;
-
spin_unlock(&delayed_refs->lock);
pr_debug("waiting for more refs (num %ld, first %p)\n",
num_refs, first_seq);
@@ -2381,6 +2379,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
+ struct list_head *first_seq = NULL;
int ret;
u64 delayed_start;
int run_all = count == (unsigned long)-1;
@@ -2436,8 +2435,10 @@ again:
*/
consider_waiting = 1;
num_refs = delayed_refs->num_entries;
+ first_seq = root->fs_info->tree_mod_seq_list.next;
} else {
- wait_for_more_refs(delayed_refs, num_refs);
+ wait_for_more_refs(delayed_refs,
+ num_refs, first_seq);
/*
* after waiting, things have changed. we
* dropped the lock and someone else might have
@@ -3578,7 +3579,7 @@ again:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
out:
- mutex_unlock(&extent_root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -4355,10 +4356,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
BTRFS_I(inode)->outstanding_extents--;
if (BTRFS_I(inode)->outstanding_extents == 0 &&
- BTRFS_I(inode)->delalloc_meta_reserved) {
+ test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
drop_inode_space = 1;
- BTRFS_I(inode)->delalloc_meta_reserved = 0;
- }
/*
* If we have more or the same amount of outsanding extents than we have
@@ -4465,7 +4465,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
* Add an item to reserve for updating the inode when we complete the
* delalloc io.
*/
- if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+ if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
nr_extents++;
extra_reserve = 1;
}
@@ -4511,7 +4512,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_lock(&BTRFS_I(inode)->lock);
if (extra_reserve) {
- BTRFS_I(inode)->delalloc_meta_reserved = 1;
+ set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags);
nr_extents--;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5219,7 @@ out:
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref, int for_cow)
+ u64 parent, int last_ref)
{
struct btrfs_block_group_cache *cache = NULL;
int ret;
@@ -5227,7 +5229,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
buf->start, buf->len,
parent, root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+ BTRFS_DROP_DELAYED_REF, NULL, 0);
BUG_ON(ret); /* -ENOMEM */
}
@@ -6249,7 +6251,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size, int for_cow)
+ u64 hint, u64 empty_size)
{
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6299,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
ins.objectid,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op, for_cow);
+ extent_op, 0);
BUG_ON(ret); /* -ENOMEM */
}
return buf;
@@ -6715,7 +6717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9018a05036..01c21b6c6d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
+#include "rcu-string.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -186,7 +187,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
return parent;
}
- entry = rb_entry(node, struct tree_entry, rb_node);
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
@@ -413,7 +413,7 @@ static struct extent_state *next_state(struct extent_state *state)
/*
* utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1)
+ * it will optionally wake up any one waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
@@ -570,10 +570,8 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- clear_state_bit(tree, state, &bits, wake);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
+ state = clear_state_bit(tree, state, &bits, wake);
+ goto next;
}
goto search_again;
}
@@ -781,7 +779,6 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
@@ -789,20 +786,15 @@ hit_next:
}
set_state_bits(tree, state, &bits);
-
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- next_node = rb_next(&state->rb_node);
- if (next_node && start < end && prealloc && !need_resched()) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -845,6 +837,10 @@ hit_next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -994,21 +990,14 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- struct rb_node *next_node;
-
set_state_bits(tree, state, &bits);
- clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
-
start = last_end + 1;
- next_node = rb_next(&state->rb_node);
- if (next_node && start < end && prealloc && !need_resched()) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
goto search_again;
}
@@ -1042,10 +1031,13 @@ hit_next:
goto out;
if (state->end <= end) {
set_state_bits(tree, state, &bits);
- clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
}
goto search_again;
}
@@ -1173,9 +1165,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
cached_state, mask);
}
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state,
- gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
cached_state, mask);
@@ -1293,7 +1284,7 @@ out:
* returned if we find something, and *start_ret and *end_ret are
* set to reflect the state struct that was found.
*
- * If nothing was found, 1 is returned, < 0 on error
+ * If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,12 +1914,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
/* try to remap that extent elsewhere? */
bio_put(bio);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
- "sector %llu)\n", page->mapping->host->i_ino, start,
- dev->name, sector);
+ printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
+ "(dev %s sector %llu)\n", page->mapping->host->i_ino,
+ start, rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
@@ -2222,17 +2214,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
uptodate = 0;
}
- if (!uptodate && tree->ops &&
- tree->ops->writepage_io_failed_hook) {
- ret = tree->ops->writepage_io_failed_hook(NULL, page,
- start, end, NULL);
- /* Writeback already completed */
- if (ret == 0)
- return 1;
- }
-
if (!uptodate) {
- clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
@@ -2347,10 +2329,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
state, mirror);
- if (ret)
+ if (ret) {
+ /* no IO indicated but software detected errors
+ * in the block, either checksum errors or
+ * issues with the contents */
+ struct btrfs_root *root =
+ BTRFS_I(page->mapping->host)->root;
+ struct btrfs_device *device;
+
uptodate = 0;
- else
+ device = btrfs_find_device_for_logical(
+ root, start, mirror);
+ if (device)
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ } else {
clean_io_failure(start, page);
+ }
}
if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -3164,7 +3159,7 @@ static int write_one_eb(struct extent_buffer *eb,
u64 offset = eb->start;
unsigned long i, num_pages;
int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
- int ret;
+ int ret = 0;
clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
@@ -3329,6 +3324,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
+ struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -3339,6 +3335,18 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
int scanned = 0;
int tag;
+ /*
+ * We have to hold onto the inode so that ordered extents can do their
+ * work when the IO finishes. The alternative to this is failing to add
+ * an ordered extent if the igrab() fails there and that is a huge pain
+ * to deal with, so instead just hold onto the inode throughout the
+ * writepages operation. If it fails here we are freeing up the inode
+ * anyway and we'd rather not waste our time writing out stuff that is
+ * going to be truncated anyway.
+ */
+ if (!igrab(inode))
+ return 0;
+
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
@@ -3433,6 +3441,7 @@ retry:
index = 0;
goto retry;
}
+ btrfs_add_delayed_iput(inode);
return ret;
}
@@ -3930,6 +3939,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
eb->start = start;
eb->len = len;
eb->tree = tree;
+ eb->bflags = 0;
rwlock_init(&eb->lock);
atomic_set(&eb->write_locks, 0);
atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3977,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return eb;
}
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+ unsigned long i;
+ struct page *p;
+ struct extent_buffer *new;
+ unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+ new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+ if (new == NULL)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ p = alloc_page(GFP_ATOMIC);
+ BUG_ON(!p);
+ attach_extent_buffer_page(new, p);
+ WARN_ON(PageDirty(p));
+ SetPageUptodate(p);
+ new->pages[i] = p;
+ }
+
+ copy_extent_buffer(new, src, 0, 0, src->len);
+ set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+ set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+ return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+ struct extent_buffer *eb;
+ unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long i;
+
+ eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+ if (!eb)
+ return NULL;
+
+ for (i = 0; i < num_pages; i++) {
+ eb->pages[i] = alloc_page(GFP_ATOMIC);
+ if (!eb->pages[i])
+ goto err;
+ }
+ set_extent_buffer_uptodate(eb);
+ btrfs_set_header_nritems(eb, 0);
+ set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+ return eb;
+err:
+ for (i--; i > 0; i--)
+ __free_page(eb->pages[i]);
+ __free_extent_buffer(eb);
+ return NULL;
+}
+
static int extent_buffer_under_io(struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
@@ -3981,18 +4045,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long start_idx)
{
unsigned long index;
+ unsigned long num_pages;
struct page *page;
+ int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
BUG_ON(extent_buffer_under_io(eb));
- index = num_extent_pages(eb->start, eb->len);
+ num_pages = num_extent_pages(eb->start, eb->len);
+ index = start_idx + num_pages;
if (start_idx >= index)
return;
do {
index--;
page = extent_buffer_page(eb, index);
- if (page) {
+ if (page && mapped) {
spin_lock(&page->mapping->private_lock);
/*
* We do this since we'll remove the pages after we've
@@ -4017,6 +4084,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
}
spin_unlock(&page->mapping->private_lock);
+ }
+ if (page) {
/* One for when we alloced the page */
page_cache_release(page);
}
@@ -4235,14 +4304,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
{
WARN_ON(atomic_read(&eb->refs) == 0);
if (atomic_dec_and_test(&eb->refs)) {
- struct extent_io_tree *tree = eb->tree;
+ if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ } else {
+ struct extent_io_tree *tree = eb->tree;
- spin_unlock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
- spin_lock(&tree->buffer_lock);
- radix_tree_delete(&tree->buffer,
- eb->start >> PAGE_CACHE_SHIFT);
- spin_unlock(&tree->buffer_lock);
+ spin_lock(&tree->buffer_lock);
+ radix_tree_delete(&tree->buffer,
+ eb->start >> PAGE_CACHE_SHIFT);
+ spin_unlock(&tree->buffer_lock);
+ }
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_page(eb, 0);
@@ -4260,6 +4333,10 @@ void free_extent_buffer(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
+ test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+ atomic_dec(&eb->refs);
+
+ if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b516c3b8dec..25900af5b15 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
#define EXTENT_BUFFER_STALE 6
#define EXTENT_BUFFER_WRITEBACK 7
#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
unsigned long bio_flags);
int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
- int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
- u64 start, u64 end,
- struct extent_state *state);
int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int mirror);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53bf2d764bb..9aa01ec2138 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
int cycled;
};
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+ struct inode_defrag *defrag2)
+{
+ if (defrag1->root > defrag2->root)
+ return 1;
+ else if (defrag1->root < defrag2->root)
+ return -1;
+ else if (defrag1->ino > defrag2->ino)
+ return 1;
+ else if (defrag1->ino < defrag2->ino)
+ return -1;
+ else
+ return 0;
+}
+
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
+ int ret;
p = &root->fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- if (defrag->ino < entry->ino)
+ ret = __compare_inode_defrag(defrag, entry);
+ if (ret < 0)
p = &parent->rb_left;
- else if (defrag->ino > entry->ino)
+ else if (ret > 0)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
goto exists;
}
}
- BTRFS_I(inode)->in_defrag = 1;
+ set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (btrfs_fs_closing(root->fs_info))
return 0;
- if (BTRFS_I(inode)->in_defrag)
+ if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
return 0;
if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->root = root->root_key.objectid;
spin_lock(&root->fs_info->defrag_inodes_lock);
- if (!BTRFS_I(inode)->in_defrag)
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
__btrfs_add_inode_defrag(inode, defrag);
else
kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
/*
* must be called with the defrag_inodes lock held
*/
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
+ u64 root, u64 ino,
struct rb_node **next)
{
struct inode_defrag *entry = NULL;
+ struct inode_defrag tmp;
struct rb_node *p;
struct rb_node *parent = NULL;
+ int ret;
+
+ tmp.ino = ino;
+ tmp.root = root;
p = info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- if (ino < entry->ino)
+ ret = __compare_inode_defrag(&tmp, entry);
+ if (ret < 0)
p = parent->rb_left;
- else if (ino > entry->ino)
+ else if (ret > 0)
p = parent->rb_right;
else
return entry;
}
if (next) {
- while (parent && ino > entry->ino) {
+ while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
entry = rb_entry(parent, struct inode_defrag, rb_node);
}
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
u64 first_ino = 0;
+ u64 root_objectid = 0;
int num_defrag;
int defrag_batch = 1024;
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
n = NULL;
/* find an inode to defrag */
- defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+ defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
+ first_ino, &n);
if (!defrag) {
- if (n)
- defrag = rb_entry(n, struct inode_defrag, rb_node);
- else if (first_ino) {
+ if (n) {
+ defrag = rb_entry(n, struct inode_defrag,
+ rb_node);
+ } else if (root_objectid || first_ino) {
+ root_objectid = 0;
first_ino = 0;
continue;
} else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
/* remove it from the rbtree */
first_ino = defrag->ino + 1;
+ root_objectid = defrag->root;
rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
goto next;
/* do a chunk of defrag */
- BTRFS_I(inode)->in_defrag = 0;
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
range.start = defrag->last_offset;
num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
defrag_batch);
@@ -1305,7 +1334,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t *ppos, size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = fdentry(file)->d_inode;
struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
@@ -1315,18 +1343,6 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
count, ocount);
- /*
- * the generic O_DIRECT will update in-memory i_size after the
- * DIOs are done. But our endio handlers that update the on
- * disk i_size never update past the in memory i_size. So we
- * need one more update here to catch any additions to the
- * file
- */
- if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- mark_inode_dirty(inode);
- }
-
if (written < 0 || written == count)
return written;
@@ -1404,12 +1420,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
- err = btrfs_update_time(file);
+ err = file_update_time(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
}
- BTRFS_I(inode)->sequence++;
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1481,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
- if (BTRFS_I(inode)->ordered_data_close) {
- BTRFS_I(inode)->ordered_data_close = 0;
+ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags)) {
btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
@@ -1498,14 +1513,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_btrfs_sync_file(file, datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
mutex_lock(&inode->i_mutex);
- /* we wait first, since the writeback may change the inode */
+ /*
+ * we wait first, since the writeback may change the inode, also wait
+ * ordered range does a filemape_write_and_wait_range which is why we
+ * don't do it above like other file systems.
+ */
root->log_batch++;
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ btrfs_wait_ordered_range(inode, start, end);
root->log_batch++;
/*
@@ -1523,7 +1539,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* syncing
*/
smp_mb();
- if (BTRFS_I(inode)->last_trans <=
+ if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+ BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 202008ec367..6c4e2baa929 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
return ERR_PTR(-ENOENT);
}
- inode->i_mapping->flags &= ~__GFP_FS;
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
return inode;
}
@@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
{
- u64 *val;
+ __le64 *val;
io_ctl_map_page(io_ctl, 1);
@@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
{
- u64 *gen;
+ __le64 *gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
return 0;
}
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries. This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache. So run through the space cache that we just
+ * loaded and merge contiguous entries. This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *e, *prev = NULL;
+ struct rb_node *n;
+
+again:
+ spin_lock(&ctl->tree_lock);
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+ e = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!prev)
+ goto next;
+ if (e->bitmap || prev->bitmap)
+ goto next;
+ if (prev->offset + prev->bytes == e->offset) {
+ unlink_free_space(ctl, prev);
+ unlink_free_space(ctl, e);
+ prev->bytes += e->bytes;
+ kmem_cache_free(btrfs_free_space_cachep, e);
+ link_free_space(ctl, prev);
+ prev = NULL;
+ spin_unlock(&ctl->tree_lock);
+ goto again;
+ }
+next:
+ prev = e;
+ }
+ spin_unlock(&ctl->tree_lock);
+}
+
int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
@@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
io_ctl_drop_pages(&io_ctl);
+ merge_space_tree(ctl);
ret = 1;
out:
io_ctl_free(&io_ctl);
@@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
- ret = filemap_write_and_wait(inode->i_mapping);
- if (ret)
- goto out;
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
@@ -1503,29 +1543,26 @@ again:
end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
/*
- * XXX - this can go away after a few releases.
- *
- * since the only user of btrfs_remove_free_space is the tree logging
- * stuff, and the only way to test that is under crash conditions, we
- * want to have this debug stuff here just in case somethings not
- * working. Search the bitmap for the space we are trying to use to
- * make sure its actually there. If its not there then we need to stop
- * because something has gone wrong.
+ * We need to search for bits in this bitmap. We could only cover some
+ * of the extent in this bitmap thanks to how we add space, so we need
+ * to search for as much as it as we can and clear that amount, and then
+ * go searching for the next bit.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
BUG_ON(ret < 0 || search_start != *offset);
- if (*offset > bitmap_info->offset && *offset + *bytes > end) {
- bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
- *bytes -= end - *offset + 1;
- *offset = end + 1;
- } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
- bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
- *bytes = 0;
- }
+ /* We may have found more bits than what we need */
+ search_bytes = min(search_bytes, *bytes);
+
+ /* Cannot clear past the end of the bitmap */
+ search_bytes = min(search_bytes, end - search_start + 1);
+
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ *offset += search_bytes;
+ *bytes -= search_bytes;
if (*bytes) {
struct rb_node *next = rb_next(&bitmap_info->offset_index);
@@ -1556,7 +1593,7 @@ again:
* everything over again.
*/
search_start = *offset;
- search_bytes = *bytes;
+ search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes);
if (ret < 0 || search_start != *offset)
@@ -1839,12 +1876,14 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- struct btrfs_free_space *next_info = NULL;
int ret = 0;
spin_lock(&ctl->tree_lock);
again:
+ if (!bytes)
+ goto out_lock;
+
info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
/*
@@ -1865,88 +1904,48 @@ again:
}
}
- if (info->bytes < bytes && rb_next(&info->offset_index)) {
- u64 end;
- next_info = rb_entry(rb_next(&info->offset_index),
- struct btrfs_free_space,
- offset_index);
-
- if (next_info->bitmap)
- end = next_info->offset +
- BITS_PER_BITMAP * ctl->unit - 1;
- else
- end = next_info->offset + next_info->bytes;
-
- if (next_info->bytes < bytes ||
- next_info->offset > offset || offset > end) {
- printk(KERN_CRIT "Found free space at %llu, size %llu,"
- " trying to use %llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)bytes);
- WARN_ON(1);
- ret = -EINVAL;
- goto out_lock;
- }
-
- info = next_info;
- }
-
- if (info->bytes == bytes) {
- unlink_free_space(ctl, info);
- if (info->bitmap) {
- kfree(info->bitmap);
- ctl->total_bitmaps--;
- }
- kmem_cache_free(btrfs_free_space_cachep, info);
- ret = 0;
- goto out_lock;
- }
-
- if (!info->bitmap && info->offset == offset) {
+ if (!info->bitmap) {
unlink_free_space(ctl, info);
- info->offset += bytes;
- info->bytes -= bytes;
- ret = link_free_space(ctl, info);
- WARN_ON(ret);
- goto out_lock;
- }
+ if (offset == info->offset) {
+ u64 to_free = min(bytes, info->bytes);
+
+ info->bytes -= to_free;
+ info->offset += to_free;
+ if (info->bytes) {
+ ret = link_free_space(ctl, info);
+ WARN_ON(ret);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ }
- if (!info->bitmap && info->offset <= offset &&
- info->offset + info->bytes >= offset + bytes) {
- u64 old_start = info->offset;
- /*
- * we're freeing space in the middle of the info,
- * this can happen during tree log replay
- *
- * first unlink the old info and then
- * insert it again after the hole we're creating
- */
- unlink_free_space(ctl, info);
- if (offset + bytes < info->offset + info->bytes) {
- u64 old_end = info->offset + info->bytes;
+ offset += to_free;
+ bytes -= to_free;
+ goto again;
+ } else {
+ u64 old_end = info->bytes + info->offset;
- info->offset = offset + bytes;
- info->bytes = old_end - info->offset;
+ info->bytes = offset - info->offset;
ret = link_free_space(ctl, info);
WARN_ON(ret);
if (ret)
goto out_lock;
- } else {
- /* the hole we're creating ends at the end
- * of the info struct, just free the info
- */
- kmem_cache_free(btrfs_free_space_cachep, info);
- }
- spin_unlock(&ctl->tree_lock);
- /* step two, insert a new info struct to cover
- * anything before the hole
- */
- ret = btrfs_add_free_space(block_group, old_start,
- offset - old_start);
- WARN_ON(ret); /* -ENOMEM */
- goto out;
+ /* Not enough bytes in this entry to satisfy us */
+ if (old_end < offset + bytes) {
+ bytes -= old_end - offset;
+ offset = old_end;
+ goto again;
+ } else if (old_end == offset + bytes) {
+ /* all done */
+ goto out_lock;
+ }
+ spin_unlock(&ctl->tree_lock);
+
+ ret = btrfs_add_free_space(block_group, offset + bytes,
+ old_end - (offset + bytes));
+ WARN_ON(ret);
+ goto out;
+ }
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0df0d1fd4fe..a7d1921ac76 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -830,7 +830,7 @@ static noinline int cow_file_range(struct inode *inode,
if (IS_ERR(trans)) {
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
- start, end, NULL,
+ start, end, locked_page,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
@@ -963,7 +963,7 @@ out:
out_unlock:
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
- start, end, NULL,
+ start, end, locked_page,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
@@ -986,8 +986,10 @@ static noinline void async_cow_start(struct btrfs_work *work)
compress_file_range(async_cow->inode, async_cow->locked_page,
async_cow->start, async_cow->end, async_cow,
&num_added);
- if (num_added == 0)
+ if (num_added == 0) {
+ btrfs_add_delayed_iput(async_cow->inode);
async_cow->inode = NULL;
+ }
}
/*
@@ -1020,6 +1022,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_cow *async_cow;
async_cow = container_of(work, struct async_cow, work);
+ if (async_cow->inode)
+ btrfs_add_delayed_iput(async_cow->inode);
kfree(async_cow);
}
@@ -1038,7 +1042,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = inode;
+ async_cow->inode = igrab(inode);
async_cow->root = root;
async_cow->locked_page = locked_page;
async_cow->start = start;
@@ -1136,8 +1140,18 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
return -ENOMEM;
+ }
nolock = btrfs_is_free_space_inode(root, inode);
@@ -1147,6 +1161,15 @@ static noinline int run_delalloc_nocow(struct inode *inode,
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ start, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
btrfs_free_path(path);
return PTR_ERR(trans);
}
@@ -1327,8 +1350,11 @@ out_check:
}
btrfs_release_path(path);
- if (cur_offset <= end && cow_start == (u64)-1)
+ if (cur_offset <= end && cow_start == (u64)-1) {
cow_start = cur_offset;
+ cur_offset = end;
+ }
+
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start, end,
page_started, nr_written, 1);
@@ -1347,6 +1373,17 @@ error:
if (!ret)
ret = err;
+ if (ret && cur_offset < end)
+ extent_clear_unlock_delalloc(inode,
+ &BTRFS_I(inode)->io_tree,
+ cur_offset, end, locked_page,
+ EXTENT_CLEAR_UNLOCK_PAGE |
+ EXTENT_CLEAR_UNLOCK |
+ EXTENT_CLEAR_DELALLOC |
+ EXTENT_CLEAR_DIRTY |
+ EXTENT_SET_WRITEBACK |
+ EXTENT_END_WRITEBACK);
+
btrfs_free_path(path);
return ret;
}
@@ -1361,20 +1398,23 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
- else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
+ } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
+ } else if (!btrfs_test_opt(root, COMPRESS) &&
+ !(BTRFS_I(inode)->force_compress) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
- else
+ } else {
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags);
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
+ }
return ret;
}
@@ -1575,11 +1615,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
if (btrfs_is_free_space_inode(root, inode))
metadata = 2;
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
- if (ret)
- return ret;
-
if (!(rw & REQ_WRITE)) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ if (ret)
+ return ret;
+
if (bio_flags & EXTENT_BIO_COMPRESSED) {
return btrfs_submit_compressed_read(inode, bio,
mirror_num, bio_flags);
@@ -1818,25 +1858,24 @@ out:
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
+ struct inode *inode = ordered_extent->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
int compress_type = 0;
int ret;
bool nolock;
- ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1);
- if (!ret)
- return 0;
- BUG_ON(!ordered_extent); /* Logic error */
-
nolock = btrfs_is_free_space_inode(root, inode);
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ ret = -EIO;
+ goto out;
+ }
+
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1892,12 +1931,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->file_offset,
ordered_extent->len);
}
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ goto out_unlock;
}
add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1908,10 +1945,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ goto out_unlock;
}
}
ret = 0;
+out_unlock:
+ unlock_extent_cached(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
if (root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1922,26 +1963,57 @@ out:
btrfs_end_transaction(trans, root);
}
+ if (ret)
+ clear_extent_uptodate(io_tree, ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1, NULL, GFP_NOFS);
+
+ /*
+ * This needs to be dont to make sure anybody waiting knows we are done
+ * upating everything for this ordered extent.
+ */
+ btrfs_remove_ordered_extent(inode, ordered_extent);
+
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
- return 0;
-out_unlock:
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
- goto out;
+ return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
}
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
+ struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
+ struct btrfs_workers *workers;
+
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
ClearPagePrivate2(page);
- return btrfs_finish_ordered_io(page->mapping->host, start, end);
+ if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+ end - start + 1, uptodate))
+ return 0;
+
+ ordered_extent->work.func = finish_ordered_fn;
+ ordered_extent->work.flags = 0;
+
+ if (btrfs_is_free_space_inode(root, inode))
+ workers = &root->fs_info->endio_freespace_worker;
+ else
+ workers = &root->fs_info->endio_write_workers;
+ btrfs_queue_worker(workers, &ordered_extent->work);
+
+ return 0;
}
/*
@@ -2075,12 +2147,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_block_rsv *block_rsv;
int ret;
- if (!list_empty(&root->orphan_list) ||
+ if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
spin_lock(&root->orphan_lock);
- if (!list_empty(&root->orphan_list)) {
+ if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2137,8 +2209,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
block_rsv = NULL;
}
- if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
#if 0
/*
* For proper ENOSPC handling, we should do orphan
@@ -2151,12 +2223,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
insert = 1;
#endif
insert = 1;
+ atomic_dec(&root->orphan_inodes);
}
- if (!BTRFS_I(inode)->orphan_meta_reserved) {
- BTRFS_I(inode)->orphan_meta_reserved = 1;
+ if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
reserve = 1;
- }
spin_unlock(&root->orphan_lock);
/* grab metadata reservation from transaction handle */
@@ -2169,6 +2241,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret && ret != -EEXIST) {
+ clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -2199,15 +2273,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
int ret = 0;
spin_lock(&root->orphan_lock);
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- list_del_init(&BTRFS_I(inode)->i_orphan);
+ if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags))
delete_item = 1;
- }
- if (BTRFS_I(inode)->orphan_meta_reserved) {
- BTRFS_I(inode)->orphan_meta_reserved = 0;
+ if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
release_rsv = 1;
- }
spin_unlock(&root->orphan_lock);
if (trans && delete_item) {
@@ -2215,8 +2287,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
}
- if (release_rsv)
+ if (release_rsv) {
btrfs_orphan_release_metadata(inode);
+ atomic_dec(&root->orphan_inodes);
+ }
return 0;
}
@@ -2344,6 +2418,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = PTR_ERR(trans);
goto out;
}
+ printk(KERN_ERR "auto deleting %Lu\n",
+ found_key.objectid);
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2355,9 +2431,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* add this inode to the orphan list so btrfs_orphan_del does
* the proper thing when we hit it
*/
- spin_lock(&root->orphan_lock);
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->orphan_lock);
+ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags);
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
@@ -2513,7 +2588,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_version = btrfs_inode_sequence(leaf, inode_item);
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2597,7 +2672,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
- btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+ btrfs_set_inode_sequence(leaf, item, inode->i_version);
btrfs_set_inode_transid(leaf, item, trans->transid);
btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2755,6 +2830,8 @@ err:
goto out;
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(inode);
+ inode_inc_iversion(dir);
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
btrfs_update_inode(trans, root, dir);
out:
@@ -3092,6 +3169,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
if (ret)
@@ -3610,7 +3688,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
* any new writes get down to disk quickly.
*/
if (newsize == 0)
- BTRFS_I(inode)->ordered_data_close = 1;
+ set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags);
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
@@ -3641,6 +3720,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid) {
setattr_copy(inode, attr);
+ inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
@@ -3674,7 +3754,8 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (root->fs_info->log_root_recovering) {
- BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+ BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags));
goto no_delete;
}
@@ -3759,7 +3840,7 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
no_delete:
- end_writeback(inode);
+ clear_inode(inode);
return;
}
@@ -4069,7 +4150,7 @@ static struct inode *new_simple_dir(struct super_block *s,
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
- BTRFS_I(inode)->dummy_inode = 1;
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4373,7 +4454,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
int ret = 0;
bool nolock = false;
- if (BTRFS_I(inode)->dummy_inode)
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4406,7 +4487,7 @@ int btrfs_dirty_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
int ret;
- if (BTRFS_I(inode)->dummy_inode)
+ if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
trans = btrfs_join_transaction(root);
@@ -4434,46 +4515,18 @@ int btrfs_dirty_inode(struct inode *inode)
* This is a copy of file_update_time. We need this so we can return error on
* ENOSPC for updating the inode in the case of file write and mmap writes.
*/
-int btrfs_update_time(struct file *file)
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+ int flags)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- struct timespec now;
- int ret;
- enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
-
- /* First try to exhaust all avenues to not sync */
- if (IS_NOCMTIME(inode))
- return 0;
-
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_mtime, &now))
- sync_it = S_MTIME;
-
- if (!timespec_equal(&inode->i_ctime, &now))
- sync_it |= S_CTIME;
-
- if (IS_I_VERSION(inode))
- sync_it |= S_VERSION;
-
- if (!sync_it)
- return 0;
-
- /* Finally allowed to write? Takes lock. */
- if (mnt_want_write_file(file))
- return 0;
-
- /* Only change inode inside the lock region */
- if (sync_it & S_VERSION)
+ if (flags & S_VERSION)
inode_inc_iversion(inode);
- if (sync_it & S_CTIME)
- inode->i_ctime = now;
- if (sync_it & S_MTIME)
- inode->i_mtime = now;
- ret = btrfs_dirty_inode(inode);
- if (!ret)
- mark_inode_dirty_sync(inode);
- mnt_drop_write(file->f_path.mnt);
- return ret;
+ if (flags & S_CTIME)
+ inode->i_ctime = *now;
+ if (flags & S_MTIME)
+ inode->i_mtime = *now;
+ if (flags & S_ATIME)
+ inode->i_atime = *now;
+ return btrfs_dirty_inode(inode);
}
/*
@@ -4733,6 +4786,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
+ inode_inc_iversion(parent_inode);
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
@@ -4940,6 +4994,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
btrfs_inc_nlink(inode);
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
@@ -5821,8 +5876,17 @@ map:
bh_result->b_size = len;
bh_result->b_bdev = em->bdev;
set_buffer_mapped(bh_result);
- if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
+ if (create) {
+ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+ }
free_extent_map(em);
@@ -5906,9 +5970,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
struct btrfs_ordered_extent *ordered = NULL;
- struct extent_state *cached_state = NULL;
u64 ordered_offset = dip->logical_offset;
u64 ordered_bytes = dip->bytes;
int ret;
@@ -5918,73 +5980,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
- ordered_bytes);
+ ordered_bytes, !err);
if (!ret)
goto out_test;
- BUG_ON(!ordered);
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- err = -ENOMEM;
- goto out;
- }
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = btrfs_ordered_update_i_size(inode, 0, ordered);
- if (!ret)
- err = btrfs_update_inode_fallback(trans, root, inode);
- goto out;
- }
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
- ordered->file_offset + ordered->len - 1, 0,
- &cached_state);
-
- if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
- ret = btrfs_mark_extent_written(trans, inode,
- ordered->file_offset,
- ordered->file_offset +
- ordered->len);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
- } else {
- ret = insert_reserved_file_extent(trans, inode,
- ordered->file_offset,
- ordered->start,
- ordered->disk_len,
- ordered->len,
- ordered->len,
- 0, 0, 0,
- BTRFS_FILE_EXTENT_REG);
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered->file_offset, ordered->len);
- if (ret) {
- err = ret;
- WARN_ON(1);
- goto out_unlock;
- }
- }
-
- add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
- ret = btrfs_ordered_update_i_size(inode, 0, ordered);
- if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
- btrfs_update_inode_fallback(trans, root, inode);
- ret = 0;
-out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
- ordered->file_offset + ordered->len - 1,
- &cached_state, GFP_NOFS);
-out:
- btrfs_delalloc_release_metadata(inode, ordered->len);
- btrfs_end_transaction(trans, root);
- ordered_offset = ordered->file_offset + ordered->len;
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
-
+ ordered->work.func = finish_ordered_fn;
+ ordered->work.flags = 0;
+ btrfs_queue_worker(&root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -5993,12 +5996,12 @@ out_test:
if (ordered_offset < dip->logical_offset + dip->bytes) {
ordered_bytes = dip->logical_offset + dip->bytes -
ordered_offset;
+ ordered = NULL;
goto again;
}
out_done:
bio->bi_private = dip->private;
- kfree(dip->csums);
kfree(dip);
/* If we had an error make sure to clear the uptodate flag */
@@ -6066,9 +6069,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int ret;
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
- if (ret)
- goto err;
+
+ if (!write) {
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto err;
+ }
if (skip_sum)
goto map;
@@ -6363,12 +6369,48 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
*/
ordered = btrfs_lookup_ordered_range(inode, lockstart,
lockend - lockstart + 1);
- if (!ordered)
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered && (!writing ||
+ !test_range_bit(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, EXTENT_UPTODATE, 0,
+ cached_state)))
break;
+
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
+
+ if (ordered) {
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /* Screw you mmap */
+ ret = filemap_write_and_wait_range(file->f_mapping,
+ lockstart,
+ lockend);
+ if (ret)
+ goto out;
+
+ /*
+ * If we found a page that couldn't be invalidated just
+ * fall back to buffered.
+ */
+ ret = invalidate_inode_pages2_range(file->f_mapping,
+ lockstart >> PAGE_CACHE_SHIFT,
+ lockend >> PAGE_CACHE_SHIFT);
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = 0;
+ goto out;
+ }
+ }
+
cond_resched();
}
@@ -6488,13 +6530,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
static void btrfs_invalidatepage(struct page *page, unsigned long offset)
{
+ struct inode *inode = page->mapping->host;
struct extent_io_tree *tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-
/*
* we have the page locked, so new writeback can't start,
* and the dirty bit won't be cleared while we are here.
@@ -6504,13 +6546,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(page->mapping->host)->io_tree;
+ tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
- ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+ ordered = btrfs_lookup_ordered_extent(inode,
page_offset(page));
if (ordered) {
/*
@@ -6525,9 +6567,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
* whoever cleared the private bit is responsible
* for the finish_ordered_io
*/
- if (TestClearPagePrivate2(page)) {
- btrfs_finish_ordered_io(page->mapping->host,
- page_start, page_end);
+ if (TestClearPagePrivate2(page) &&
+ btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
+ PAGE_CACHE_SIZE, 1)) {
+ btrfs_finish_ordered_io(ordered);
}
btrfs_put_ordered_extent(ordered);
cached_state = NULL;
@@ -6579,7 +6622,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (!ret) {
- ret = btrfs_update_time(vma->vm_file);
+ ret = file_update_time(vma->vm_file);
reserved = 1;
}
if (ret) {
@@ -6774,7 +6817,8 @@ static int btrfs_truncate(struct inode *inode)
* using truncate to replace the contents of the file will
* end up with a zero length file after a crash.
*/
- if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ &BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
@@ -6897,7 +6941,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->root = NULL;
ei->space_info = NULL;
ei->generation = 0;
- ei->sequence = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
@@ -6912,11 +6955,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->outstanding_extents = 0;
ei->reserved_extents = 0;
- ei->ordered_data_close = 0;
- ei->orphan_meta_reserved = 0;
- ei->dummy_inode = 0;
- ei->in_defrag = 0;
- ei->delalloc_meta_reserved = 0;
+ ei->runtime_flags = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
ei->delayed_node = NULL;
@@ -6930,7 +6969,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->log_mutex);
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
- INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node);
@@ -6975,13 +7013,12 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
- spin_lock(&root->orphan_lock);
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &BTRFS_I(inode)->runtime_flags)) {
printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
(unsigned long long)btrfs_ino(inode));
- list_del_init(&BTRFS_I(inode)->i_orphan);
+ atomic_dec(&root->orphan_inodes);
}
- spin_unlock(&root->orphan_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7102,10 +7139,13 @@ static void fixup_inode_flags(struct inode *dir, struct inode *inode)
else
b_inode->flags &= ~BTRFS_INODE_NODATACOW;
- if (b_dir->flags & BTRFS_INODE_COMPRESS)
+ if (b_dir->flags & BTRFS_INODE_COMPRESS) {
b_inode->flags |= BTRFS_INODE_COMPRESS;
- else
- b_inode->flags &= ~BTRFS_INODE_COMPRESS;
+ b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ } else {
+ b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
+ BTRFS_INODE_NOCOMPRESS);
+ }
}
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -7196,6 +7236,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
btrfs_add_ordered_operation(trans, root, old_inode);
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
+ inode_inc_iversion(old_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
@@ -7222,6 +7265,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (new_inode) {
+ inode_inc_iversion(new_inode);
new_inode->i_ctime = CURRENT_TIME;
if (unlikely(btrfs_ino(new_inode) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7493,6 +7537,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_offset += ins.offset;
*alloc_hint = ins.objectid + ins.offset;
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7650,6 +7695,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
+ .update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
@@ -7660,6 +7706,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
+ .update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
@@ -7673,6 +7720,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
+ .update_time = btrfs_update_time,
};
const struct dentry_operations btrfs_dentry_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14f8e1faa46..0e92e576300 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -52,6 +52,7 @@
#include "locking.h"
#include "inode-map.h"
#include "backref.h"
+#include "rcu-string.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -261,6 +262,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
btrfs_update_iflags(inode);
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
@@ -367,7 +369,7 @@ static noinline int create_subvol(struct btrfs_root *root,
return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0, 0);
+ 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -784,39 +786,57 @@ none:
return -ENOENT;
}
-/*
- * Validaty check of prev em and next em:
- * 1) no prev/next em
- * 2) prev/next em is an hole/inline extent
- */
-static int check_adjacent_extents(struct inode *inode, struct extent_map *em)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_map *prev = NULL, *next = NULL;
- int ret = 0;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map *em;
+ u64 len = PAGE_CACHE_SIZE;
+ /*
+ * hopefully we have this extent in the tree already, try without
+ * the full extent lock
+ */
read_lock(&em_tree->lock);
- prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1);
- next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1);
+ em = lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
- if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) &&
- (!next || next->block_start >= EXTENT_MAP_LAST_BYTE))
- ret = 1;
- free_extent_map(prev);
- free_extent_map(next);
+ if (!em) {
+ /* get the big lock and read metadata off disk */
+ lock_extent(io_tree, start, start + len - 1);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ unlock_extent(io_tree, start, start + len - 1);
+
+ if (IS_ERR(em))
+ return NULL;
+ }
+
+ return em;
+}
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+{
+ struct extent_map *next;
+ bool ret = true;
+
+ /* this is the last extent */
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+ next = defrag_lookup_extent(inode, em->start + em->len);
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ ret = false;
+
+ free_extent_map(next);
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u64 len,
- int thresh, u64 *last_len, u64 *skip,
- u64 *defrag_end)
+static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+ u64 *last_len, u64 *skip, u64 *defrag_end)
{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
int ret = 1;
+ bool next_mergeable = true;
/*
* make sure that once we start defragging an extent, we keep on
@@ -827,23 +847,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
*skip = 0;
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- /* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1);
- em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1);
-
- if (IS_ERR(em))
- return 0;
- }
+ em = defrag_lookup_extent(inode, start);
+ if (!em)
+ return 0;
/* this will cover holes, and inline extents */
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
@@ -851,18 +857,15 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
goto out;
}
- /* If we have nothing to merge with us, just skip. */
- if (check_adjacent_extents(inode, em)) {
- ret = 0;
- goto out;
- }
+ next_mergeable = defrag_check_next_extent(inode, em);
/*
- * we hit a real extent, if it is big don't bother defragging it again
+ * we hit a real extent, if it is big or the next extent is not a
+ * real extent, don't bother defragging it
*/
- if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+ if ((*last_len == 0 || *last_len >= thresh) &&
+ (em->len >= thresh || !next_mergeable))
ret = 0;
-
out:
/*
* last_len ends up being a counter of how many bytes we've defragged.
@@ -1141,8 +1144,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, extent_thresh,
- &last_len, &skip, &defrag_end)) {
+ extent_thresh, &last_len, &skip,
+ &defrag_end)) {
unsigned long next;
/*
* the should_defrag function tells us how much to skip
@@ -1303,6 +1306,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
ret = -EINVAL;
goto out_free;
}
+ if (device->fs_devices && device->fs_devices->seeding) {
+ printk(KERN_INFO "btrfs: resizer unable to apply on "
+ "seeding device %llu\n",
+ (unsigned long long)devid);
+ ret = -EINVAL;
+ goto out_free;
+ }
+
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
else {
@@ -1344,8 +1355,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk(KERN_INFO "btrfs: new size for %s is %llu\n",
- device->name, (unsigned long long)new_size);
+ printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n",
+ rcu_str_deref(device->name),
+ (unsigned long long)new_size);
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
@@ -2262,10 +2274,17 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
di_args->bytes_used = dev->bytes_used;
di_args->total_bytes = dev->total_bytes;
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name)
- strncpy(di_args->path, dev->name, sizeof(di_args->path));
- else
+ if (dev->name) {
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
+ strncpy(di_args->path, name->str, sizeof(di_args->path));
+ rcu_read_unlock();
+ di_args->path[sizeof(di_args->path) - 1] = 0;
+ } else {
di_args->path[0] = '\0';
+ }
out:
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2641,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
/*
@@ -2914,7 +2934,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
- user_dest = (struct btrfs_ioctl_space_info *)
+ user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3062,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
return ret;
}
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+ void __user *arg, int reset_after_read)
+{
+ struct btrfs_ioctl_get_dev_stats *sa;
+ int ret;
+
+ if (reset_after_read && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
+
+ ret = btrfs_get_dev_stats(root, sa, reset_after_read);
+
+ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ kfree(sa);
+ return ret;
+}
+
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
int ret = 0;
@@ -3212,8 +3254,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
}
}
-static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
+ struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
@@ -3225,6 +3268,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
if (fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ ret = mnt_want_write(file->f_path.mnt);
+ if (ret)
+ return ret;
+
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -3291,6 +3338,7 @@ out_bargs:
out:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+ mnt_drop_write(file->f_path.mnt);
return ret;
}
@@ -3386,7 +3434,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_ioctl_balance(root, NULL);
+ return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3467,15 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
case BTRFS_IOC_BALANCE_V2:
- return btrfs_ioctl_balance(root, argp);
+ return btrfs_ioctl_balance(file, argp);
case BTRFS_IOC_BALANCE_CTL:
return btrfs_ioctl_balance_ctl(root, arg);
case BTRFS_IOC_BALANCE_PROGRESS:
return btrfs_ioctl_balance_progress(root, argp);
+ case BTRFS_IOC_GET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(root, argp, 0);
+ case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
+ return btrfs_ioctl_get_dev_stats(root, argp, 1);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 086e6bdae1c..e440aa653c3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
__u64 inodes;
};
+enum btrfs_dev_stat_values {
+ /* disk I/O failure stats */
+ BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
+ BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
+ BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
+
+ /* stats for indirect indications for I/O failures */
+ BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
+ * contents is illegal: this is an
+ * indication that the block was damaged
+ * during read or write, or written to
+ * wrong location or read from wrong
+ * location */
+ BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
+ * been written */
+
+ BTRFS_DEV_STAT_VALUES_MAX
+};
+
+struct btrfs_ioctl_get_dev_stats {
+ __u64 devid; /* in */
+ __u64 nr_items; /* in/out */
+
+ /* out values: */
+ __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
+
+ __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
+};
+
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -310,7 +339,7 @@ struct btrfs_ioctl_logical_ino_args {
#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
struct btrfs_ioctl_scrub_args)
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
+ struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+ struct btrfs_ioctl_get_dev_stats)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbf6d0d9aeb..643335a4fe3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->len = len;
entry->disk_len = disk_len;
entry->bytes_left = len;
- entry->inode = inode;
+ entry->inode = igrab(inode);
entry->compress_type = compress_type;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
trace_btrfs_ordered_extent_add(inode, entry);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
ordered_data_tree_panic(inode, -EEXIST, file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
}
/*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
*/
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size)
+ u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
int ret;
+ unsigned long flags;
u64 dec_end;
u64 dec_start;
u64 to_dec;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
if (!node) {
ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
(unsigned long long)to_dec);
}
entry->bytes_left -= to_dec;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -332,7 +336,7 @@ out:
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -347,15 +351,21 @@ out:
*/
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size)
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
+ if (cached && *cached) {
+ entry = *cached;
+ goto have_entry;
+ }
+
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
}
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
if (!offset_in_entry(entry, file_offset)) {
ret = 1;
goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
(unsigned long long)io_size);
}
entry->bytes_left -= io_size;
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
@@ -383,7 +397,7 @@ out:
*cached = entry;
atomic_inc(&entry->refs);
}
- spin_unlock(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
}
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ if (entry->inode)
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* remove an ordered extent from the tree. No references are dropped
- * and you must wake_up entry->wait. You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
*/
-static void __btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+ struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ spin_unlock_irq(&tree->lock);
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
list_del_init(&BTRFS_I(inode)->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_extent_lock);
-}
-
-/*
- * remove an ordered extent from the tree. No references are dropped
- * but any waiters are woken.
- */
-void btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
-{
- struct btrfs_ordered_inode_tree *tree;
-
- tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
- __btrfs_remove_ordered_extent(inode, entry);
- spin_unlock(&tree->lock);
wake_up(&entry->wait);
}
@@ -621,17 +623,29 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (orig_end > INT_LIMIT(loff_t))
orig_end = INT_LIMIT(loff_t);
}
-again:
+
/* start IO across the range first to instantiate any delalloc
* extents
*/
filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
- /* The compression code will leave pages locked but return from
- * writepage without setting the page writeback. Starting again
- * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
filemap_fdatawait_range(inode->i_mapping, start, orig_end);
@@ -657,11 +671,6 @@ again:
break;
end--;
}
- if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_DELALLOC, 0, NULL)) {
- schedule_timeout(1);
- goto again;
- }
}
/*
@@ -676,7 +685,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -687,7 +696,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (entry)
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -703,7 +712,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
@@ -728,7 +737,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
out:
if (entry)
atomic_inc(&entry->refs);
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -744,7 +753,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
struct btrfs_ordered_extent *entry = NULL;
tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -752,7 +761,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
atomic_inc(&entry->refs);
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
return entry;
}
@@ -764,7 +773,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered)
{
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 disk_i_size;
u64 new_i_size;
u64 i_size_test;
@@ -779,7 +787,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
disk_i_size = BTRFS_I(inode)->disk_i_size;
/* truncate file */
@@ -798,14 +806,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
/*
- * we can't update the disk_isize if there are delalloc bytes
- * between disk_i_size and this ordered extent
- */
- if (test_range_bit(io_tree, disk_i_size, offset - 1,
- EXTENT_DELALLOC, 0, NULL)) {
- goto out;
- }
- /*
* walk backward from this ordered extent to disk_i_size.
* if we find an ordered extent then we can't update disk i_size
* yet
@@ -825,15 +825,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
}
node = prev;
}
- while (node) {
+ for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ /* We treat this entry as if it doesnt exist */
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
if (test->file_offset + test->len <= disk_i_size)
break;
if (test->file_offset >= i_size)
break;
if (test->file_offset >= disk_i_size)
goto out;
- node = rb_prev(node);
}
new_i_size = min_t(u64, offset, i_size);
@@ -851,43 +854,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
else
node = rb_first(&tree->tree);
}
- i_size_test = 0;
- if (node) {
- /*
- * do we have an area where IO might have finished
- * between our ordered extent and the next one.
- */
+
+ /*
+ * We are looking for an area between our current extent and the next
+ * ordered extent to update the i_size to. There are 3 cases here
+ *
+ * 1) We don't actually have anything and we can update to i_size.
+ * 2) We have stuff but they already did their i_size update so again we
+ * can just update to i_size.
+ * 3) We have an outstanding ordered extent so the most we can update
+ * our disk_i_size to is the start of the next offset.
+ */
+ i_size_test = i_size;
+ for (; node; node = rb_next(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > offset)
+
+ if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+ continue;
+ if (test->file_offset > offset) {
i_size_test = test->file_offset;
- } else {
- i_size_test = i_size;
+ break;
+ }
}
/*
* i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents. As long as there
- * are no delalloc bytes in this area, it is safe to update
- * disk_i_size to the end of the region.
+ * extent where there are no ordered extents, we can safely set
+ * disk_i_size to this.
*/
- if (i_size_test > offset &&
- !test_range_bit(io_tree, offset, i_size_test - 1,
- EXTENT_DELALLOC, 0, NULL)) {
+ if (i_size_test > offset)
new_i_size = min_t(u64, i_size_test, i_size);
- }
BTRFS_I(inode)->disk_i_size = new_i_size;
ret = 0;
out:
/*
- * we need to remove the ordered extent with the tree lock held
- * so that other people calling this function don't find our fully
- * processed ordered entry and skip updating the i_size
+ * We need to do this because we can't remove ordered extents until
+ * after the i_disk_size has been updated and then the inode has been
+ * updated to reflect the change, so we need to tell anybody who finds
+ * this ordered extent that we've already done all the real work, we
+ * just haven't completed all the other work.
*/
if (ordered)
- __btrfs_remove_ordered_extent(inode, ordered);
- spin_unlock(&tree->lock);
- if (ordered)
- wake_up(&ordered->wait);
+ set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+ spin_unlock_irq(&tree->lock);
return ret;
}
@@ -912,7 +921,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
if (!ordered)
return 1;
- spin_lock(&tree->lock);
+ spin_lock_irq(&tree->lock);
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr) {
num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +936,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
}
}
out:
- spin_unlock(&tree->lock);
+ spin_unlock_irq(&tree->lock);
btrfs_put_ordered_extent(ordered);
return ret;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c355ad4dc1a..e03c560d299 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
+ * has done its due diligence in updating
+ * the isize. */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
/* a per root list of all the pending ordered extents */
struct list_head root_extent_list;
+
+ struct btrfs_work work;
};
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size);
+ u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size);
+ u64 *file_offset, u64 io_size,
+ int uptodate);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f38e452486b..5e23684887e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_dev_extent_chunk_offset(l, dev_extent),
(unsigned long long)
btrfs_dev_extent_length(l, dev_extent));
+ case BTRFS_DEV_STATS_KEY:
+ printk(KERN_INFO "\t\tdevice stats\n");
+ break;
};
}
}
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
new file mode 100644
index 00000000000..9e111e4576d
--- /dev/null
+++ b/fs/btrfs/rcu-string.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct rcu_string {
+ struct rcu_head rcu;
+ char str[0];
+};
+
+static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
+{
+ size_t len = strlen(src) + 1;
+ struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
+ (len * sizeof(char)), mask);
+ if (!ret)
+ return ret;
+ strncpy(ret->str, src, len);
+ return ret;
+}
+
+static inline void rcu_string_free(struct rcu_string *str)
+{
+ if (str)
+ kfree_rcu(str, rcu);
+}
+
+#define printk_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define printk_ratelimited_in_rcu(fmt, ...) do { \
+ rcu_read_lock(); \
+ printk_ratelimited(fmt, __VA_ARGS__); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define rcu_str_deref(rcu_str) ({ \
+ struct rcu_string *__str = rcu_dereference(rcu_str); \
+ __str->str; \
+})
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d0108588..48a4882d8ad 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
struct btrfs_fs_info *fs_info;
+ int old_ioprio;
rmw = container_of(work, struct reada_machine_work, work);
fs_info = rmw->fs_info;
kfree(rmw);
+ old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+ task_nice_ioprio(current));
+ set_task_ioprio(current, BTRFS_IOPRIO_READA);
__reada_start_machine(fs_info);
+ set_task_ioprio(current, old_ioprio);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2f3d6f917fb..b223620cd5a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -26,6 +26,7 @@
#include "backref.h"
#include "extent_io.h"
#include "check-integrity.h"
+#include "rcu-string.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -50,7 +51,7 @@ struct scrub_dev;
struct scrub_page {
struct scrub_block *sblock;
struct page *page;
- struct block_device *bdev;
+ struct btrfs_device *dev;
u64 flags; /* extent flags */
u64 generation;
u64 logical;
@@ -86,6 +87,7 @@ struct scrub_block {
unsigned int header_error:1;
unsigned int checksum_error:1;
unsigned int no_io_error_seen:1;
+ unsigned int generation_error:1; /* also sets header_error */
};
};
@@ -319,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
"length %llu, links %u (path: %s)\n", swarn->errstr,
- swarn->logical, swarn->dev->name,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
@@ -331,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
return 0;
err:
- printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
"resolving failed with ret=%d\n", swarn->errstr,
- swarn->logical, swarn->dev->name,
+ swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset, ret);
free_ipath(ipath);
@@ -389,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
do {
ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
&ref_root, &ref_level);
- printk(KERN_WARNING
+ printk_in_rcu(KERN_WARNING
"btrfs: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
- "%llu\n", errstr, swarn.logical, dev->name,
+ "%llu\n", errstr, swarn.logical,
+ rcu_str_deref(dev->name),
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
@@ -579,9 +582,11 @@ out:
spin_lock(&sdev->stat_lock);
++sdev->stat.uncorrectable_errors;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR
+
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
- (unsigned long long)fixup->logical, sdev->dev->name);
+ (unsigned long long)fixup->logical,
+ rcu_str_deref(sdev->dev->name));
}
btrfs_free_path(path);
@@ -675,6 +680,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sdev->stat.read_errors++;
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
@@ -686,6 +693,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sdev->stat.read_errors++;
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +708,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sdev->stat.read_errors++;
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
@@ -725,12 +736,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_unlock(&sdev->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("i/o error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sdev->stat_lock);
sdev->stat.csum_errors++;
spin_unlock(&sdev->stat_lock);
if (__ratelimit(&_rs))
scrub_print_warning("checksum error", sblock_to_check);
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
} else if (sblock_bad->header_error) {
spin_lock(&sdev->stat_lock);
sdev->stat.verify_errors++;
@@ -738,6 +753,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (__ratelimit(&_rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
+ if (sblock_bad->generation_error)
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
if (sdev->readonly)
@@ -919,18 +940,20 @@ corrected_error:
spin_lock(&sdev->stat_lock);
sdev->stat.corrected_errors++;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: fixed up error at logical %llu on dev %s\n",
- (unsigned long long)logical, sdev->dev->name);
+ (unsigned long long)logical,
+ rcu_str_deref(sdev->dev->name));
}
} else {
did_not_correct_error:
spin_lock(&sdev->stat_lock);
sdev->stat.uncorrectable_errors++;
spin_unlock(&sdev->stat_lock);
- printk_ratelimited(KERN_ERR
+ printk_ratelimited_in_rcu(KERN_ERR
"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
- (unsigned long long)logical, sdev->dev->name);
+ (unsigned long long)logical,
+ rcu_str_deref(sdev->dev->name));
}
out:
@@ -998,8 +1021,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
page = sblock->pagev + page_index;
page->logical = logical;
page->physical = bbio->stripes[mirror_index].physical;
- /* for missing devices, bdev is NULL */
- page->bdev = bbio->stripes[mirror_index].dev->bdev;
+ /* for missing devices, dev->bdev is NULL */
+ page->dev = bbio->stripes[mirror_index].dev;
page->mirror_num = mirror_index + 1;
page->page = alloc_page(GFP_NOFS);
if (!page->page) {
@@ -1043,7 +1066,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_page *page = sblock->pagev + page_num;
DECLARE_COMPLETION_ONSTACK(complete);
- if (page->bdev == NULL) {
+ if (page->dev->bdev == NULL) {
page->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
@@ -1053,7 +1076,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
- bio->bi_bdev = page->bdev;
+ bio->bi_bdev = page->dev->bdev;
bio->bi_sector = page->physical >> 9;
bio->bi_end_io = scrub_complete_bio_end_io;
bio->bi_private = &complete;
@@ -1102,11 +1125,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
h = (struct btrfs_header *)mapped_buffer;
if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
- generation != le64_to_cpu(h->generation) ||
memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE))
+ BTRFS_UUID_SIZE)) {
+ sblock->header_error = 1;
+ } else if (generation != le64_to_cpu(h->generation)) {
sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
csum = h->csum;
} else {
if (!have_csum)
@@ -1182,7 +1208,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
- bio->bi_bdev = page_bad->bdev;
+ bio->bi_bdev = page_bad->dev->bdev;
bio->bi_sector = page_bad->physical >> 9;
bio->bi_end_io = scrub_complete_bio_end_io;
bio->bi_private = &complete;
@@ -1196,6 +1222,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
/* this will also unplug the queue */
wait_for_completion(&complete);
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
+ btrfs_dev_stat_inc_and_print(page_bad->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ bio_put(bio);
+ return -EIO;
+ }
bio_put(bio);
}
@@ -1352,7 +1384,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
+ int fail_gen = 0;
+ int fail_cor = 0;
u64 len;
int index;
@@ -1363,13 +1396,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
memcpy(on_disk_csum, s->csum, sdev->csum_size);
if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
- ++fail;
+ ++fail_cor;
if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
- ++fail;
+ ++fail_gen;
if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
- ++fail;
+ ++fail_cor;
len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1427,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
- ++fail;
+ ++fail_cor;
- if (fail) {
+ if (fail_cor + fail_gen) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
@@ -1405,9 +1438,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
spin_lock(&sdev->stat_lock);
++sdev->stat.super_errors;
spin_unlock(&sdev->stat_lock);
+ if (fail_cor)
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ else
+ btrfs_dev_stat_inc_and_print(sdev->dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS);
}
- return fail;
+ return fail_cor + fail_gen;
}
static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1590,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
return -ENOMEM;
}
spage->sblock = sblock;
- spage->bdev = sdev->dev->bdev;
+ spage->dev = sdev->dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5f8fca4195..e23991574fd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,7 @@
#include "version.h"
#include "export.h"
#include "compression.h"
+#include "rcu-string.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -188,7 +189,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_start(args, fmt);
if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
- strncpy(lvl, fmt, 3);
+ memcpy(lvl, fmt, 3);
+ lvl[3] = '\0';
fmt += 3;
type = logtypes[fmt[1] - '0'];
} else
@@ -435,11 +437,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_thread_pool:
intarg = 0;
match_int(&args[0], &intarg);
- if (intarg) {
+ if (intarg)
info->thread_pool_size = intarg;
- printk(KERN_INFO "btrfs: thread pool %d\n",
- info->thread_pool_size);
- }
break;
case Opt_max_inline:
num = match_strdup(&args[0]);
@@ -769,7 +768,7 @@ static int btrfs_fill_super(struct super_block *sb,
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
-
+ sb->s_flags |= MS_I_VERSION;
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
printk("btrfs: open_ctree failed\n");
@@ -925,63 +924,48 @@ static inline int is_subvolume_inode(struct inode *inode)
*/
static char *setup_root_args(char *args)
{
- unsigned copied = 0;
- unsigned len = strlen(args) + 2;
- char *pos;
- char *ret;
+ unsigned len = strlen(args) + 2 + 1;
+ char *src, *dst, *buf;
/*
- * We need the same args as before, but minus
- *
- * subvol=a
- *
- * and add
+ * We need the same args as before, but with this substitution:
+ * s!subvol=[^,]+!subvolid=0!
*
- * subvolid=0
- *
- * which is a difference of 2 characters, so we allocate strlen(args) +
- * 2 characters.
+ * Since the replacement string is up to 2 bytes longer than the
+ * original, allocate strlen(args) + 2 + 1 bytes.
*/
- ret = kzalloc(len * sizeof(char), GFP_NOFS);
- if (!ret)
- return NULL;
- pos = strstr(args, "subvol=");
+ src = strstr(args, "subvol=");
/* This shouldn't happen, but just in case.. */
- if (!pos) {
- kfree(ret);
+ if (!src)
+ return NULL;
+
+ buf = dst = kmalloc(len, GFP_NOFS);
+ if (!buf)
return NULL;
- }
/*
- * The subvol=<> arg is not at the front of the string, copy everybody
- * up to that into ret.
+ * If the subvol= arg is not at the start of the string,
+ * copy whatever precedes it into buf.
*/
- if (pos != args) {
- *pos = '\0';
- strcpy(ret, args);
- copied += strlen(args);
- pos++;
+ if (src != args) {
+ *src++ = '\0';
+ strcpy(buf, args);
+ dst += strlen(args);
}
- strncpy(ret + copied, "subvolid=0", len - copied);
-
- /* Length of subvolid=0 */
- copied += 10;
+ strcpy(dst, "subvolid=0");
+ dst += strlen("subvolid=0");
/*
- * If there is no , after the subvol= option then we know there's no
- * other options and we can just return.
+ * If there is a "," after the original subvol=... string,
+ * copy that suffix into our buffer. Otherwise, we're done.
*/
- pos = strchr(pos, ',');
- if (!pos)
- return ret;
+ src = strchr(src, ',');
+ if (src)
+ strcpy(dst, src);
- /* Copy the rest of the arguments into our buffer */
- strncpy(ret + copied, pos, len - copied);
- copied += strlen(pos);
-
- return ret;
+ return buf;
}
static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1118,6 +1102,40 @@ error_fs_info:
return ERR_PTR(error);
}
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+ spin_lock_irq(&workers->lock);
+ workers->max_workers = new_limit;
+ spin_unlock_irq(&workers->lock);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+ int new_pool_size, int old_pool_size)
+{
+ if (new_pool_size == old_pool_size)
+ return;
+
+ fs_info->thread_pool_size = new_pool_size;
+
+ printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+ old_pool_size, new_pool_size);
+
+ btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
+ btrfs_set_max_workers(&fs_info->workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
+ btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+}
+
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1155,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
+ btrfs_resize_thread_pool(fs_info,
+ fs_info->thread_pool_size, old_thread_pool_size);
+
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
@@ -1166,6 +1187,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto restore;
+
sb->s_flags &= ~MS_RDONLY;
}
@@ -1180,7 +1205,8 @@ restore:
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
fs_info->alloc_start = old_alloc_start;
- fs_info->thread_pool_size = old_thread_pool_size;
+ btrfs_resize_thread_pool(fs_info,
+ old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
return ret;
}
@@ -1461,12 +1487,44 @@ static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
"error %d\n", btrfs_ino(inode), ret);
}
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+ struct btrfs_fs_devices *cur_devices;
+ struct btrfs_device *dev, *first_dev = NULL;
+ struct list_head *head;
+ struct rcu_string *name;
+
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ cur_devices = fs_info->fs_devices;
+ while (cur_devices) {
+ head = &cur_devices->devices;
+ list_for_each_entry(dev, head, dev_list) {
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
+ }
+ cur_devices = cur_devices->seed;
+ }
+
+ if (first_dev) {
+ rcu_read_lock();
+ name = rcu_dereference(first_dev->name);
+ seq_escape(m, name->str, " \t\n\\");
+ rcu_read_unlock();
+ } else {
+ WARN_ON(1);
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return 0;
+}
+
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
+ .show_devname = btrfs_show_devname,
.write_inode = btrfs_write_inode,
.dirty_inode = btrfs_fs_dirty_inode,
.alloc_inode = btrfs_alloc_inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 36422254ef6..b72b068183e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
#include "locking.h"
#include "tree-log.h"
#include "inode-map.h"
+#include "volumes.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -55,49 +56,54 @@ static noinline void switch_commit_root(struct btrfs_root *root)
static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
struct btrfs_transaction *cur_trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- spin_unlock(&root->fs_info->trans_lock);
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
- if (root->fs_info->trans_no_join) {
+ if (fs_info->trans_no_join) {
if (!nofail) {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
}
- cur_trans = root->fs_info->running_transaction;
+ cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (cur_trans->aborted) {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
atomic_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
- spin_lock(&root->fs_info->trans_lock);
- if (root->fs_info->running_transaction) {
+ spin_lock(&fs_info->trans_lock);
+ if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the trans_no_join checks above
*/
kmem_cache_free(btrfs_transaction_cachep, cur_trans);
- cur_trans = root->fs_info->running_transaction;
+ cur_trans = fs_info->running_transaction;
goto loop;
+ } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ spin_unlock(&root->fs_info->trans_lock);
+ kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+ return -EROFS;
}
atomic_set(&cur_trans->num_writers, 1);
@@ -121,20 +127,38 @@ loop:
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
cur_trans->delayed_refs.seq = 1;
+
+ /*
+ * although the tree mod log is per file system and not per transaction,
+ * the log must never go across transaction boundaries.
+ */
+ smp_mb();
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+ "creating a fresh transaction\n");
+ WARN_ON(1);
+ }
+ if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
+ printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+ "creating a fresh transaction\n");
+ WARN_ON(1);
+ }
+ atomic_set(&fs_info->tree_mod_seq, 0);
+
init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
- list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+ list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
- root->fs_info->btree_inode->i_mapping);
- root->fs_info->generation++;
- cur_trans->transid = root->fs_info->generation;
- root->fs_info->running_transaction = cur_trans;
+ fs_info->btree_inode->i_mapping);
+ fs_info->generation++;
+ cur_trans->transid = fs_info->generation;
+ fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
@@ -758,6 +782,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_run_dev_stats(trans, root->fs_info);
+ BUG_ON(ret);
+
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
@@ -1190,14 +1217,20 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
static void cleanup_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, int err)
{
struct btrfs_transaction *cur_trans = trans->transaction;
WARN_ON(trans->use_count > 1);
+ btrfs_abort_transaction(trans, root, err);
+
spin_lock(&root->fs_info->trans_lock);
list_del_init(&cur_trans->list);
+ if (cur_trans == root->fs_info->running_transaction) {
+ root->fs_info->running_transaction = NULL;
+ root->fs_info->trans_no_join = 0;
+ }
spin_unlock(&root->fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, root);
@@ -1503,7 +1536,7 @@ cleanup_transaction:
// WARN_ON(1);
if (current->journal_info == trans)
current->journal_info = NULL;
- cleanup_transaction(trans, root);
+ cleanup_transaction(trans, root, ret);
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eb1ae908582..8abeae4224f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -690,6 +690,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
kfree(name);
iput(inode);
+
+ btrfs_run_delayed_items(trans, root);
return ret;
}
@@ -895,6 +897,7 @@ again:
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
+ btrfs_run_delayed_items(trans, root);
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
@@ -1475,6 +1478,9 @@ again:
ret = btrfs_unlink_inode(trans, root, dir, inode,
name, name_len);
BUG_ON(ret);
+
+ btrfs_run_delayed_items(trans, root);
+
kfree(name);
iput(inode);
@@ -1628,7 +1634,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen);
+ if (ret)
+ return ret;
level = btrfs_header_level(eb);
@@ -1749,7 +1757,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
@@ -1766,7 +1778,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
WARN_ON(*level <= 0);
if (path->nodes[*level-1])
@@ -2657,6 +2673,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
btrfs_release_path(path);
+ if (ret > 0)
+ ret = 0;
return ret;
}
@@ -3028,21 +3046,6 @@ out:
return ret;
}
-static int inode_in_log(struct btrfs_trans_handle *trans,
- struct inode *inode)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
-
- mutex_lock(&root->log_mutex);
- if (BTRFS_I(inode)->logged_trans == trans->transid &&
- BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
- ret = 1;
- mutex_unlock(&root->log_mutex);
- return ret;
-}
-
-
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -3083,7 +3086,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (inode_in_log(trans, inode)) {
+ if (btrfs_inode_in_log(inode, trans->transid)) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 12f5147bd2b..ab942f46b3d 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
*
* ulist = ulist_alloc();
* ulist_add(ulist, root);
- * elem = NULL;
+ * ULIST_ITER_INIT(&uiter);
*
- * while ((elem = ulist_next(ulist, elem)) {
+ * while ((elem = ulist_next(ulist, &uiter)) {
* for (all child nodes n in elem)
* ulist_add(ulist, n);
* do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
*
* The allocated ulist will be returned in an initialized state.
*/
-struct ulist *ulist_alloc(unsigned long gfp_mask)
+struct ulist *ulist_alloc(gfp_t gfp_mask)
{
struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
* unaltered.
*/
int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
- unsigned long gfp_mask)
+ gfp_t gfp_mask)
+{
+ return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long *old_aux, gfp_t gfp_mask)
{
int i;
for (i = 0; i < ulist->nnodes; ++i) {
- if (ulist->nodes[i].val == val)
+ if (ulist->nodes[i].val == val) {
+ if (old_aux)
+ *old_aux = ulist->nodes[i].aux;
return 0;
+ }
}
if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
- * @prev: previously returned element or %NULL to start iteration
+ * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
*
* Note: locking must be provided by the caller. In case of rwlocks only read
* locking is needed
*
- * This function is used to iterate an ulist. The iteration is started with
- * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
* end is reached. No guarantee is made with respect to the order in which
* the elements are returned. They might neither be returned in order of
* addition nor in ascending order.
* It is allowed to call ulist_add during an enumeration. Newly added items
* are guaranteed to show up in the running enumeration.
*/
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
{
- int next;
-
if (ulist->nnodes == 0)
return NULL;
-
- if (!prev)
- return &ulist->nodes[0];
-
- next = (prev - ulist->nodes) + 1;
- if (next < 0 || next >= ulist->nnodes)
+ if (uiter->i < 0 || uiter->i >= ulist->nnodes)
return NULL;
- return &ulist->nodes[next];
+ return &ulist->nodes[uiter->i++];
}
EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 2e25dec58ec..21bdc8ec813 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
*/
#define ULIST_SIZE 16
+struct ulist_iterator {
+ int i;
+};
+
/*
* element of the list
*/
@@ -59,10 +63,15 @@ struct ulist {
void ulist_init(struct ulist *ulist);
void ulist_fini(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
-struct ulist *ulist_alloc(unsigned long gfp_mask);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
- unsigned long gfp_mask);
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+ gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+ struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1411b99555a..ecaad40e7ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
@@ -34,11 +35,14 @@
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
+#include "rcu-string.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -61,7 +65,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
kfree(fs_devices);
@@ -331,8 +335,8 @@ static noinline int device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
+ struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
- char *name;
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
@@ -361,15 +365,18 @@ static noinline int device_list_add(const char *path,
return -ENOMEM;
}
device->devid = devid;
+ device->dev_stats_valid = 0;
device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
spin_lock_init(&device->io_lock);
- device->name = kstrdup(path, GFP_NOFS);
- if (!device->name) {
+
+ name = rcu_string_strdup(path, GFP_NOFS);
+ if (!name) {
kfree(device);
return -ENOMEM;
}
+ rcu_assign_pointer(device->name, name);
INIT_LIST_HEAD(&device->dev_alloc_list);
/* init readahead state */
@@ -386,12 +393,12 @@ static noinline int device_list_add(const char *path,
device->fs_devices = fs_devices;
fs_devices->num_devices++;
- } else if (!device->name || strcmp(device->name, path)) {
- name = kstrdup(path, GFP_NOFS);
+ } else if (!device->name || strcmp(device->name->str, path)) {
+ name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
- kfree(device->name);
- device->name = name;
+ rcu_string_free(device->name);
+ rcu_assign_pointer(device->name, name);
if (device->missing) {
fs_devices->missing_devices--;
device->missing = 0;
@@ -426,15 +433,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
/* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+ struct rcu_string *name;
+
device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device)
goto error;
- device->name = kstrdup(orig_dev->name, GFP_NOFS);
- if (!device->name) {
+ /*
+ * This is ok to do without rcu read locked because we hold the
+ * uuid mutex so nothing we touch in here is going to disappear.
+ */
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
kfree(device);
goto error;
}
+ rcu_assign_pointer(device->name, name);
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
@@ -487,7 +501,7 @@ again:
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
@@ -512,7 +526,7 @@ static void __free_device(struct work_struct *work)
if (device->bdev)
blkdev_put(device->bdev, device->mode);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
}
@@ -536,6 +550,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
+ struct rcu_string *name;
if (device->bdev)
fs_devices->open_devices--;
@@ -551,8 +566,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
BUG_ON(!new_device); /* -ENOMEM */
memcpy(new_device, device, sizeof(*new_device));
- new_device->name = kstrdup(device->name, GFP_NOFS);
- BUG_ON(device->name && !new_device->name); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(device->name && !name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
new_device->bdev = NULL;
new_device->writeable = 0;
new_device->in_fs_metadata = 0;
@@ -617,9 +635,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
if (!device->name)
continue;
- bdev = blkdev_get_by_path(device->name, flags, holder);
+ bdev = blkdev_get_by_path(device->name->str, flags, holder);
if (IS_ERR(bdev)) {
- printk(KERN_INFO "open %s failed\n", device->name);
+ printk(KERN_INFO "open %s failed\n", device->name->str);
goto error;
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -1628,12 +1646,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
struct block_device *bdev;
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
+ struct rcu_string *name;
u64 total_bytes;
int seeding_dev = 0;
int ret = 0;
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
- return -EINVAL;
+ return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
@@ -1667,23 +1686,24 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- device->name = kstrdup(device_path, GFP_NOFS);
- if (!device->name) {
+ name = rcu_string_strdup(device_path, GFP_NOFS);
+ if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
+ rcu_assign_pointer(device->name, name);
ret = find_next_devid(root, &device->devid);
if (ret) {
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
goto error;
}
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
ret = PTR_ERR(trans);
goto error;
@@ -1792,7 +1812,7 @@ error_trans:
unlock_chunks(root);
btrfs_abort_transaction(trans, root, ret);
btrfs_end_transaction(trans, root);
- kfree(device->name);
+ rcu_string_free(device->name);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2825,31 +2845,48 @@ out:
static int balance_kthread(void *data)
{
- struct btrfs_balance_control *bctl =
- (struct btrfs_balance_control *)data;
- struct btrfs_fs_info *fs_info = bctl->fs_info;
+ struct btrfs_fs_info *fs_info = data;
int ret = 0;
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
- set_balance_control(bctl);
-
- if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
- printk(KERN_INFO "btrfs: force skipping balance\n");
- } else {
+ if (fs_info->balance_ctl) {
printk(KERN_INFO "btrfs: continuing balance\n");
- ret = btrfs_balance(bctl, NULL);
+ ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+
return ret;
}
-int btrfs_recover_balance(struct btrfs_root *tree_root)
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
+
+ spin_lock(&fs_info->balance_lock);
+ if (!fs_info->balance_ctl) {
+ spin_unlock(&fs_info->balance_lock);
+ return 0;
+ }
+ spin_unlock(&fs_info->balance_lock);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ printk(KERN_INFO "btrfs: force skipping balance\n");
+ return 0;
+ }
+
+ tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+ if (IS_ERR(tsk))
+ return PTR_ERR(tsk);
+
+ return 0;
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
@@ -2862,29 +2899,30 @@ int btrfs_recover_balance(struct btrfs_root *tree_root)
if (!path)
return -ENOMEM;
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
- if (!bctl) {
- ret = -ENOMEM;
- goto out;
- }
-
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out_bctl;
+ goto out;
if (ret > 0) { /* ret = -ENOENT; */
ret = 0;
- goto out_bctl;
+ goto out;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
- bctl->fs_info = tree_root->fs_info;
- bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+ bctl->fs_info = fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
btrfs_balance_data(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
@@ -2893,14 +2931,13 @@ int btrfs_recover_balance(struct btrfs_root *tree_root)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
- if (IS_ERR(tsk))
- ret = PTR_ERR(tsk);
- else
- goto out;
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
-out_bctl:
- kfree(bctl);
+ set_balance_control(bctl);
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
out:
btrfs_free_path(path);
return ret;
@@ -4001,13 +4038,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+static void *merge_stripe_index_into_bio_private(void *bi_private,
+ unsigned int stripe_index)
+{
+ /*
+ * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
+ * at most 1.
+ * The alternative solution (instead of stealing bits from the
+ * pointer) would be to allocate an intermediate structure
+ * that contains the old private pointer plus the stripe_index.
+ */
+ BUG_ON((((uintptr_t)bi_private) & 3) != 0);
+ BUG_ON(stripe_index > 3);
+ return (void *)(((uintptr_t)bi_private) | stripe_index);
+}
+
+static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
+{
+ return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
+}
+
+static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
+{
+ return (unsigned int)((uintptr_t)bi_private) & 3;
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
int is_orig_bio = 0;
- if (err)
+ if (err) {
atomic_inc(&bbio->error);
+ if (err == -EIO || err == -EREMOTEIO) {
+ unsigned int stripe_index =
+ extract_stripe_index_from_bio_private(
+ bio->bi_private);
+ struct btrfs_device *dev;
+
+ BUG_ON(stripe_index >= bbio->num_stripes);
+ dev = bbio->stripes[stripe_index].dev;
+ if (dev->bdev) {
+ if (bio->bi_rw & WRITE)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ else
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_READ_ERRS);
+ if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+ btrfs_dev_stat_inc(dev,
+ BTRFS_DEV_STAT_FLUSH_ERRS);
+ btrfs_dev_stat_print_on_error(dev);
+ }
+ }
+ }
if (bio == bbio->orig_bio)
is_orig_bio = 1;
@@ -4149,14 +4233,23 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bio = first_bio;
}
bio->bi_private = bbio;
+ bio->bi_private = merge_stripe_index_into_bio_private(
+ bio->bi_private, (unsigned int)dev_nr);
bio->bi_end_io = btrfs_end_bio;
bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
dev = bbio->stripes[dev_nr].dev;
if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+#ifdef DEBUG
+ struct rcu_string *name;
+
+ rcu_read_lock();
+ name = rcu_dereference(dev->name);
pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
(u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
- dev->name, dev->devid, bio->bi_size);
+ name->str, dev->devid, bio->bi_size);
+ rcu_read_unlock();
+#endif
bio->bi_bdev = dev->bdev;
if (async_submit)
schedule_bio(root, dev, rw, bio);
@@ -4509,6 +4602,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
return ret;
}
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+ u64 logical, int mirror_num)
+{
+ struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ int ret;
+ u64 map_length = 0;
+ struct btrfs_bio *bbio = NULL;
+ struct btrfs_device *device;
+
+ BUG_ON(mirror_num == 0);
+ ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
+ mirror_num);
+ if (ret) {
+ BUG_ON(bbio != NULL);
+ return NULL;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
+ device = bbio->stripes[mirror_num - 1].dev;
+ kfree(bbio);
+ return device;
+}
+
int btrfs_read_chunk_tree(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -4583,3 +4698,231 @@ error:
btrfs_free_path(path);
return ret;
}
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct extent_buffer *eb;
+ int slot;
+ int ret = 0;
+ struct btrfs_device *device;
+ struct btrfs_path *path = NULL;
+ int i;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ int item_size;
+ struct btrfs_dev_stats_item *ptr;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+ if (ret) {
+ printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+ rcu_str_deref(device->name),
+ (unsigned long long)device->devid);
+ __btrfs_reset_dev_stats(device);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ continue;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot,
+ struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_reset(device, i);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dev_root,
+ struct btrfs_device *device)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_dev_stats_item *ptr;
+ int ret;
+ int i;
+
+ key.objectid = 0;
+ key.type = BTRFS_DEV_STATS_KEY;
+ key.offset = device->devid;
+
+ path = btrfs_alloc_path();
+ BUG_ON(!path);
+ ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+ if (ret < 0) {
+ printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+ ret, rcu_str_deref(device->name));
+ goto out;
+ }
+
+ if (ret == 0 &&
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ /* need to delete old one and insert a new one */
+ ret = btrfs_del_item(trans, dev_root, path);
+ if (ret != 0) {
+ printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ ret = 1;
+ }
+
+ if (ret == 1) {
+ /* need to insert a new item */
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, dev_root, path,
+ &key, sizeof(*ptr));
+ if (ret < 0) {
+ printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+ rcu_str_deref(device->name), ret);
+ goto out;
+ }
+ }
+
+ eb = path->nodes[0];
+ ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_set_dev_stats_value(eb, ptr, i,
+ btrfs_dev_stat_read(device, i));
+ btrfs_mark_buffer_dirty(eb);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->dev_stats_valid || !device->dev_stats_dirty)
+ continue;
+
+ ret = update_dev_stat_item(trans, dev_root, device);
+ if (!ret)
+ device->dev_stats_dirty = 0;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+ btrfs_dev_stat_inc(dev, index);
+ btrfs_dev_stat_print_on_error(dev);
+}
+
+void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+ if (!dev->dev_stats_valid)
+ return;
+ printk_ratelimited_in_rcu(KERN_ERR
+ "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev,
+ BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+ printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ rcu_str_deref(dev->name),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats,
+ int reset_after_read)
+{
+ struct btrfs_device *dev;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ int i;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!dev) {
+ printk(KERN_WARNING
+ "btrfs: get dev_stats failed, device not found\n");
+ return -ENODEV;
+ } else if (!dev->dev_stats_valid) {
+ printk(KERN_WARNING
+ "btrfs: get dev_stats failed, not yet valid\n");
+ return -ENODEV;
+ } else if (reset_after_read) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (stats->nr_items > i)
+ stats->values[i] =
+ btrfs_dev_stat_read_and_reset(dev, i);
+ else
+ btrfs_dev_stat_reset(dev, i);
+ }
+ } else {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ if (stats->nr_items > i)
+ stats->values[i] = btrfs_dev_stat_read(dev, i);
+ }
+ if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+ stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bb6b03f97aa..95f6637614d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/sort.h>
#include "async-thread.h"
+#include "ioctl.h"
#define BTRFS_STRIPE_LEN (64 * 1024)
@@ -57,7 +58,7 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
- char *name;
+ struct rcu_string *name;
/* the internal btrfs device id */
u64 devid;
@@ -106,6 +107,11 @@ struct btrfs_device {
struct completion flush_wait;
int nobarriers;
+ /* disk I/O failure stats. For detailed description refer to
+ * enum btrfs_dev_stat_values in ioctl.h */
+ int dev_stats_valid;
+ int dev_stats_dirty; /* counters need to be written to disk */
+ atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
};
struct btrfs_fs_devices {
@@ -275,10 +281,57 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs);
-int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+ u64 logical, int mirror_num);
+void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+ struct btrfs_ioctl_get_dev_stats *stats,
+ int reset_after_read);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+ int index)
+{
+ atomic_inc(dev->dev_stat_values + index);
+ dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+ int index)
+{
+ return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+ int index)
+{
+ int ret;
+
+ ret = atomic_xchg(dev->dev_stat_values + index, 0);
+ dev->dev_stats_dirty = 1;
+ return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+ int index, unsigned long val)
+{
+ atomic_set(dev->dev_stat_values + index, val);
+ dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+ int index)
+{
+ btrfs_dev_stat_set(dev, index, 0);
+}
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e7a5659087e..3f4e2d69e83 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (ret)
goto out;
+ inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0bc1bed6c4e..c7062c896d7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3156,7 +3156,7 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data)
/*
* Buffer-head allocation
*/
-static struct kmem_cache *bh_cachep;
+static struct kmem_cache *bh_cachep __read_mostly;
/*
* Once the number of bh's in the machine exceeds this level, we start
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 173b1d22e59..8b67304e4b8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -54,7 +54,12 @@
(CONGESTION_ON_THRESH(congestion_kb) - \
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
-
+static inline struct ceph_snap_context *page_snap_context(struct page *page)
+{
+ if (PagePrivate(page))
+ return (void *)page->private;
+ return NULL;
+}
/*
* Dirty a page. Optimistically adjust accounting, on the assumption
@@ -142,10 +147,9 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
{
struct inode *inode;
struct ceph_inode_info *ci;
- struct ceph_snap_context *snapc = (void *)page->private;
+ struct ceph_snap_context *snapc = page_snap_context(page);
BUG_ON(!PageLocked(page));
- BUG_ON(!page->private);
BUG_ON(!PagePrivate(page));
BUG_ON(!page->mapping);
@@ -182,7 +186,6 @@ static int ceph_releasepage(struct page *page, gfp_t g)
struct inode *inode = page->mapping ? page->mapping->host : NULL;
dout("%p releasepage %p idx %lu\n", inode, page, page->index);
WARN_ON(PageDirty(page));
- WARN_ON(page->private);
WARN_ON(PagePrivate(page));
return 0;
}
@@ -443,7 +446,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
osdc = &fsc->client->osdc;
/* verify this is a writeable snap context */
- snapc = (void *)page->private;
+ snapc = page_snap_context(page);
if (snapc == NULL) {
dout("writepage %p page %p not dirty?\n", inode, page);
goto out;
@@ -451,7 +454,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
oldest = get_oldest_context(inode, &snap_size);
if (snapc->seq > oldest->seq) {
dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, (void *)page->private);
+ inode, page, snapc);
/* we should only noop if called by kswapd */
WARN_ON((current->flags & PF_MEMALLOC) == 0);
ceph_put_snap_context(oldest);
@@ -591,7 +594,7 @@ static void writepages_finish(struct ceph_osd_request *req,
clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
- ceph_put_snap_context((void *)page->private);
+ ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
dout("unlocking %d %p\n", i, page);
@@ -795,7 +798,7 @@ get_more_pages:
}
/* only if matching snap context */
- pgsnapc = (void *)page->private;
+ pgsnapc = page_snap_context(page);
if (pgsnapc->seq > snapc->seq) {
dout("page snapc %p %lld > oldest %p %lld\n",
pgsnapc, pgsnapc->seq, snapc, snapc->seq);
@@ -984,7 +987,7 @@ retry_locked:
BUG_ON(!ci->i_snap_realm);
down_read(&mdsc->snap_rwsem);
BUG_ON(!ci->i_snap_realm->cached_context);
- snapc = (void *)page->private;
+ snapc = page_snap_context(page);
if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fbb2a643ef1..8e1b60e557b 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -40,38 +40,49 @@ struct ceph_nfs_confh {
u32 parent_name_hash;
} __attribute__ ((packed));
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
- int connectable)
+/*
+ * The presence of @parent_inode here tells us whether NFS wants a
+ * connectable file handle. However, we want to make a connectionable
+ * file handle unconditionally so that the MDS gets as much of a hint
+ * as possible. That means we only use @parent_dentry to indicate
+ * whether nfsd wants a connectable fh, and whether we should indicate
+ * failure from a too-small @max_len.
+ */
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+ struct inode *parent_inode)
{
int type;
struct ceph_nfs_fh *fh = (void *)rawfh;
struct ceph_nfs_confh *cfh = (void *)rawfh;
- struct dentry *parent;
- struct inode *inode = dentry->d_inode;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
+ struct dentry *dentry = d_find_alias(inode);
+ struct dentry *parent;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
- if (*max_len >= connected_handle_length) {
+ /* if we found an alias, generate a connectable fh */
+ if (*max_len >= connected_handle_length && dentry) {
dout("encode_fh %p connectable\n", dentry);
- cfh->ino = ceph_ino(dentry->d_inode);
+ spin_lock(&dentry->d_lock);
+ parent = dentry->d_parent;
+ cfh->ino = ceph_ino(inode);
cfh->parent_ino = ceph_ino(parent->d_inode);
cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
dentry);
*max_len = connected_handle_length;
type = 2;
+ spin_unlock(&dentry->d_lock);
} else if (*max_len >= handle_length) {
- if (connectable) {
+ if (parent_inode) {
+ /* nfsd wants connectable */
*max_len = connected_handle_length;
type = 255;
} else {
dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(dentry->d_inode);
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
type = 1;
}
@@ -79,7 +90,6 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
*max_len = handle_length;
type = 255;
}
- spin_unlock(&dentry->d_lock);
return type;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed72428d9c7..988d4f302e4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -54,7 +54,6 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
req->r_fmode = ceph_flags_to_mode(flags);
req->r_args.open.flags = cpu_to_le32(flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
- req->r_args.open.preferred = cpu_to_le32(-1);
out:
return req;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 790914a598d..8e3fb69fbe6 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -26,8 +26,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
l.object_size = ceph_file_layout_object_size(ci->i_layout);
l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
- l.preferred_osd =
- (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+ l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
}
@@ -35,6 +34,32 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
return err;
}
+static long __validate_layout(struct ceph_mds_client *mdsc,
+ struct ceph_ioctl_layout *l)
+{
+ int i, err;
+
+ /* validate striping parameters */
+ if ((l->object_size & ~PAGE_MASK) ||
+ (l->stripe_unit & ~PAGE_MASK) ||
+ ((unsigned)l->object_size % (unsigned)l->stripe_unit))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+
+ return 0;
+}
+
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file->f_dentry->d_inode;
@@ -44,52 +69,40 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
struct ceph_ioctl_layout l;
struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
struct ceph_ioctl_layout nl;
- int err, i;
+ int err;
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
/* validate changed params against current layout */
err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
- if (!err) {
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
- nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
- nl.preferred_osd =
- (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
- } else
+ if (err)
return err;
+ memset(&nl, 0, sizeof(nl));
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
+ else
+ nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
+ else
+ nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
if (l.object_size)
nl.object_size = l.object_size;
+ else
+ nl.object_size = ceph_file_layout_object_size(ci->i_layout);
if (l.data_pool)
nl.data_pool = l.data_pool;
- if (l.preferred_osd)
- nl.preferred_osd = l.preferred_osd;
+ else
+ nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
- if ((nl.object_size & ~PAGE_MASK) ||
- (nl.stripe_unit & ~PAGE_MASK) ||
- ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
- return -EINVAL;
+ /* this is obsolete, and always -1 */
+ nl.preferred_osd = le64_to_cpu(-1);
- /* make sure it's a valid data pool */
- if (l.data_pool > 0) {
- mutex_lock(&mdsc->mutex);
- err = -EINVAL;
- for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
- if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
- err = 0;
- break;
- }
- mutex_unlock(&mdsc->mutex);
- if (err)
- return err;
- }
+ err = __validate_layout(mdsc, &nl);
+ if (err)
+ return err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
USE_AUTH_MDS);
@@ -106,8 +119,6 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
req->r_args.setlayout.layout.fl_object_size =
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- req->r_args.setlayout.layout.fl_pg_preferred =
- cpu_to_le32(l.preferred_osd);
parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
@@ -127,33 +138,16 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
struct inode *inode = file->f_dentry->d_inode;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
- int err, i;
+ int err;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
return -EFAULT;
- if ((l.object_size & ~PAGE_MASK) ||
- (l.stripe_unit & ~PAGE_MASK) ||
- !l.stripe_unit ||
- (l.object_size &&
- (unsigned)l.object_size % (unsigned)l.stripe_unit))
- return -EINVAL;
-
- /* make sure it's a valid data pool */
- if (l.data_pool > 0) {
- mutex_lock(&mdsc->mutex);
- err = -EINVAL;
- for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
- if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
- err = 0;
- break;
- }
- mutex_unlock(&mdsc->mutex);
- if (err)
- return err;
- }
+ err = __validate_layout(mdsc, &l);
+ if (err)
+ return err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
USE_AUTH_MDS);
@@ -171,8 +165,6 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool =
cpu_to_le32(l.data_pool);
- req->r_args.setlayout.layout.fl_pg_preferred =
- cpu_to_le32(l.preferred_osd);
err = ceph_mdsc_do_request(mdsc, inode, req);
ceph_mdsc_put_request(req);
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index be4a6048733..c77028afb1e 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -34,6 +34,8 @@
struct ceph_ioctl_layout {
__u64 stripe_unit, stripe_count, object_size;
__u64 data_pool;
+
+ /* obsolete. new values ignored, always return -1 */
__s64 preferred_osd;
};
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 89971e137aa..200bc87eceb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -334,10 +334,10 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
dout("mdsc put_session %p %d -> %d\n", s,
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) {
- if (s->s_authorizer)
+ if (s->s_auth.authorizer)
s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
s->s_mdsc->fsc->client->monc.auth,
- s->s_authorizer);
+ s->s_auth.authorizer);
kfree(s);
}
}
@@ -3395,39 +3395,33 @@ out:
/*
* authentication
*/
-static int get_authorizer(struct ceph_connection *con,
- void **buf, int *len, int *proto,
- void **reply_buf, int *reply_len, int force_new)
+
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately. Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+ int *proto, int force_new)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- int ret = 0;
-
- if (force_new && s->s_authorizer) {
- ac->ops->destroy_authorizer(ac, s->s_authorizer);
- s->s_authorizer = NULL;
- }
- if (s->s_authorizer == NULL) {
- if (ac->ops->create_authorizer) {
- ret = ac->ops->create_authorizer(
- ac, CEPH_ENTITY_TYPE_MDS,
- &s->s_authorizer,
- &s->s_authorizer_buf,
- &s->s_authorizer_buf_len,
- &s->s_authorizer_reply_buf,
- &s->s_authorizer_reply_buf_len);
- if (ret)
- return ret;
- }
- }
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ if (force_new && auth->authorizer) {
+ if (ac->ops && ac->ops->destroy_authorizer)
+ ac->ops->destroy_authorizer(ac, auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
+ int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ }
*proto = ac->protocol;
- *buf = s->s_authorizer_buf;
- *len = s->s_authorizer_buf_len;
- *reply_buf = s->s_authorizer_reply_buf;
- *reply_len = s->s_authorizer_reply_buf_len;
- return 0;
+
+ return auth;
}
@@ -3437,7 +3431,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+ return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len);
}
static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 8c7c04ebb59..dd26846dd71 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -11,6 +11,7 @@
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/auth.h>
/*
* Some lock dependencies:
@@ -113,9 +114,7 @@ struct ceph_mds_session {
struct ceph_connection s_con;
- struct ceph_authorizer *s_authorizer;
- void *s_authorizer_buf, *s_authorizer_reply_buf;
- size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
+ struct ceph_auth_handshake s_auth;
/* protected by s_gen_ttl_lock */
spinlock_t s_gen_ttl_lock;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f04c0961f99..e5206fc7656 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
/* alloc new snap context */
err = -ENOMEM;
- if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
+ if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
goto fail;
snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
if (!snapc)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 35b86331d8a..785cb3057c9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -118,15 +118,6 @@ static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-
- if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
- val += ret;
- size -= ret;
- ret += snprintf(val, size, "preferred_osd=%lld\n",
- (unsigned long long)ceph_file_layout_pg_preferred(
- ci->i_layout));
- }
-
return ret;
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2b243af70aa..a08306a8bec 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -158,3 +158,23 @@ config CIFS_NFSD_EXPORT
depends on CIFS && EXPERIMENTAL && BROKEN
help
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+
+config CIFS_SMB2
+ bool "SMB2 network file system support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && INET && BROKEN
+ select NLS
+ select KEYS
+ select FSCACHE
+ select DNS_RESOLVER
+
+ help
+ This enables experimental support for the SMB2 (Server Message Block
+ version 2) protocol. The SMB2 protocol is the successor to the
+ popular CIFS and SMB network file sharing protocols. SMB2 is the
+ native file sharing mechanism for recent versions of Windows
+ operating systems (since Vista). SMB2 enablement will eventually
+ allow users better performance, security and features, than would be
+ possible with cifs. Note that smb2 mount options also are simpler
+ (compared to cifs) due to protocol improvements.
+
+ Unless you are a developer or tester, say N.
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 005d524c3a4..4b412754434 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
- readdir.o ioctl.o sess.o export.o
+ readdir.o ioctl.o sess.o export.o smb1ops.o
cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
@@ -15,3 +15,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB2) += smb2ops.o
diff --git a/fs/cifs/README b/fs/cifs/README
index b7d782bab79..22ab7b5b8da 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -608,11 +608,6 @@ Stats Lists summary resource usage information as well as per
in the kernel configuration.
Configuration pseudo-files:
-MultiuserMount If set to one, more than one CIFS session to
- the same server ip address can be established
- if more than one uid accesses the same mount
- point and if the uids user/password mapping
- information is available. (default is 0)
PacketSigningEnabled If set to one, cifs packet signing is enabled
and will be used if the server requires
it. If set to two, cifs packet signing is
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 27046462941..e8140528ca5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -57,19 +57,21 @@ cifs_dump_mem(char *label, void *data, int length)
}
}
-#ifdef CONFIG_CIFS_DEBUG2
void cifs_dump_detail(void *buf)
{
+#ifdef CONFIG_CIFS_DEBUG2
struct smb_hdr *smb = (struct smb_hdr *)buf;
cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb));
+#endif /* CONFIG_CIFS_DEBUG2 */
}
void cifs_dump_mids(struct TCP_Server_Info *server)
{
+#ifdef CONFIG_CIFS_DEBUG2
struct list_head *tmp;
struct mid_q_entry *mid_entry;
@@ -102,8 +104,8 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
}
}
spin_unlock(&GlobalMid_Lock);
-}
#endif /* CONFIG_CIFS_DEBUG2 */
+}
#ifdef CONFIG_PROC_FS
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
@@ -420,7 +422,6 @@ static struct proc_dir_entry *proc_fs_cifs;
static const struct file_operations cifsFYI_proc_fops;
static const struct file_operations cifs_lookup_cache_proc_fops;
static const struct file_operations traceSMB_proc_fops;
-static const struct file_operations cifs_multiuser_mount_proc_fops;
static const struct file_operations cifs_security_flags_proc_fops;
static const struct file_operations cifs_linux_ext_proc_fops;
@@ -440,8 +441,6 @@ cifs_proc_init(void)
proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
&cifs_linux_ext_proc_fops);
- proc_create("MultiuserMount", 0, proc_fs_cifs,
- &cifs_multiuser_mount_proc_fops);
proc_create("SecurityFlags", 0, proc_fs_cifs,
&cifs_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
@@ -460,7 +459,6 @@ cifs_proc_clean(void)
#ifdef CONFIG_CIFS_STATS
remove_proc_entry("Stats", proc_fs_cifs);
#endif
- remove_proc_entry("MultiuserMount", proc_fs_cifs);
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
@@ -617,52 +615,6 @@ static const struct file_operations traceSMB_proc_fops = {
.write = traceSMB_proc_write,
};
-static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%d\n", multiuser_mount);
- return 0;
-}
-
-static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh)
-{
- return single_open(fh, cifs_multiuser_mount_proc_show, NULL);
-}
-
-static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos)
-{
- char c;
- int rc;
- static bool warned;
-
- rc = get_user(c, buffer);
- if (rc)
- return rc;
- if (c == '0' || c == 'n' || c == 'N')
- multiuser_mount = 0;
- else if (c == '1' || c == 'y' || c == 'Y') {
- multiuser_mount = 1;
- if (!warned) {
- warned = true;
- printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
- "mount code is scheduled to be deprecated in "
- "3.5. Please switch to using the multiuser "
- "mount option.");
- }
- }
-
- return count;
-}
-
-static const struct file_operations cifs_multiuser_mount_proc_fops = {
- .owner = THIS_MODULE,
- .open = cifs_multiuser_mount_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = cifs_multiuser_mount_proc_write,
-};
-
static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
{
seq_printf(m, "0x%x\n", global_secflags);
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 566e0ae8dc2..c0c68bb492d 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -24,10 +24,10 @@
#define _H_CIFS_DEBUG
void cifs_dump_mem(char *label, void *data, int length);
-#ifdef CONFIG_CIFS_DEBUG2
-#define DBG2 2
void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
+#ifdef CONFIG_CIFS_DEBUG2
+#define DBG2 2
#else
#define DBG2 0
#endif
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 541ef81f6ae..8b6e344eb0b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -56,7 +56,6 @@ int traceSMB = 0;
bool enable_oplocks = true;
unsigned int linuxExtEnabled = 1;
unsigned int lookupCacheEnabled = 1;
-unsigned int multiuser_mount = 0;
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
@@ -125,7 +124,7 @@ cifs_read_super(struct super_block *sb)
goto out_no_root;
}
- /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
+ /* do that *after* d_make_root() - we want NULL ->d_op for root here */
if (cifs_sb_master_tcon(cifs_sb)->nocase)
sb->s_d_op = &cifs_ci_dentry_ops;
else
@@ -272,7 +271,7 @@ static void
cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ clear_inode(inode);
cifs_fscache_release_inode_cookie(inode);
}
@@ -329,6 +328,19 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
seq_printf(s, "i");
}
+static void
+cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
+{
+ seq_printf(s, ",cache=");
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
+ seq_printf(s, "strict");
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
+ seq_printf(s, "none");
+ else
+ seq_printf(s, "loose");
+}
+
/*
* cifs_show_options() is for displaying mount options in /proc/mounts.
* Not all settable options are displayed but most of the important
@@ -342,7 +354,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
+ seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
cifs_show_security(s, tcon->ses->server);
+ cifs_show_cache_flavor(s, cifs_sb);
seq_printf(s, ",unc=%s", tcon->treeName);
@@ -408,8 +422,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rwpidforward");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
seq_printf(s, ",forcemand");
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
- seq_printf(s, ",directio");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
seq_printf(s, ",nouser_xattr");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
@@ -432,8 +444,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",nostrictsync");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
seq_printf(s, ",noperm");
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
- seq_printf(s, ",strictcache");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
@@ -945,7 +955,6 @@ cifs_init_once(void *inode)
struct cifsInodeInfo *cifsi = inode;
inode_init_once(&cifsi->vfs_inode);
- INIT_LIST_HEAD(&cifsi->llist);
mutex_init(&cifsi->lock_mutex);
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 73fea285b84..6df0cbe1cbc 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -151,6 +151,58 @@ struct cifs_cred {
*****************************************************************
*/
+enum smb_version {
+ Smb_1 = 1,
+ Smb_21,
+};
+
+struct mid_q_entry;
+struct TCP_Server_Info;
+struct cifsFileInfo;
+struct cifs_ses;
+
+struct smb_version_operations {
+ int (*send_cancel)(struct TCP_Server_Info *, void *,
+ struct mid_q_entry *);
+ bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
+ /* setup request: allocate mid, sign message */
+ int (*setup_request)(struct cifs_ses *, struct kvec *, unsigned int,
+ struct mid_q_entry **);
+ /* check response: verify signature, map error */
+ int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
+ bool);
+ void (*add_credits)(struct TCP_Server_Info *, const unsigned int);
+ void (*set_credits)(struct TCP_Server_Info *, const int);
+ int * (*get_credits_field)(struct TCP_Server_Info *);
+ __u64 (*get_next_mid)(struct TCP_Server_Info *);
+ /* data offset from read response message */
+ unsigned int (*read_data_offset)(char *);
+ /* data length from read response message */
+ unsigned int (*read_data_length)(char *);
+ /* map smb to linux error */
+ int (*map_error)(char *, bool);
+ /* find mid corresponding to the response message */
+ struct mid_q_entry * (*find_mid)(struct TCP_Server_Info *, char *);
+ void (*dump_detail)(void *);
+ /* verify the message */
+ int (*check_message)(char *, unsigned int);
+ bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+};
+
+struct smb_version_values {
+ char *version_string;
+ __u32 large_lock_type;
+ __u32 exclusive_lock_type;
+ __u32 shared_lock_type;
+ __u32 unlock_lock_type;
+ size_t header_size;
+ size_t max_header_size;
+ size_t read_rsp_size;
+};
+
+#define HEADER_SIZE(server) (server->vals->header_size)
+#define MAX_HEADER_SIZE(server) (server->vals->max_header_size)
+
struct smb_vol {
char *username;
char *password;
@@ -206,6 +258,8 @@ struct smb_vol {
bool sockopt_tcp_nodelay:1;
unsigned short int port;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
+ struct smb_version_operations *ops;
+ struct smb_version_values *vals;
char *prepath;
struct sockaddr_storage srcaddr; /* allow binding to a local IP */
struct nls_table *local_nls;
@@ -243,6 +297,8 @@ struct TCP_Server_Info {
int srv_count; /* reference counter */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
+ struct smb_version_operations *ops;
+ struct smb_version_values *vals;
enum statusEnum tcpStatus; /* what we think the status is */
char *hostname; /* hostname portion of UNC string */
struct socket *ssocket;
@@ -322,16 +378,6 @@ in_flight(struct TCP_Server_Info *server)
return num;
}
-static inline int*
-get_credits_field(struct TCP_Server_Info *server)
-{
- /*
- * This will change to switch statement when we reserve slots for echos
- * and oplock breaks.
- */
- return &server->credits;
-}
-
static inline bool
has_credits(struct TCP_Server_Info *server, int *credits)
{
@@ -342,16 +388,22 @@ has_credits(struct TCP_Server_Info *server, int *credits)
return num > 0;
}
-static inline size_t
-header_size(void)
+static inline void
+add_credits(struct TCP_Server_Info *server, const unsigned int add)
{
- return sizeof(struct smb_hdr);
+ server->ops->add_credits(server, add);
}
-static inline size_t
-max_header_size(void)
+static inline void
+set_credits(struct TCP_Server_Info *server, const int val)
{
- return MAX_CIFS_HDR_SIZE;
+ server->ops->set_credits(server, val);
+}
+
+static inline __u64
+get_next_mid(struct TCP_Server_Info *server)
+{
+ return server->ops->get_next_mid(server);
}
/*
@@ -548,8 +600,7 @@ struct cifsLockInfo {
__u64 offset;
__u64 length;
__u32 pid;
- __u8 type;
- __u16 netfid;
+ __u32 type;
};
/*
@@ -574,6 +625,10 @@ struct cifs_search_info {
struct cifsFileInfo {
struct list_head tlist; /* pointer to next fid owned by tcon */
struct list_head flist; /* next fid (file instance) for this inode */
+ struct list_head llist; /*
+ * brlocks held by this fid, protected by
+ * lock_mutex from cifsInodeInfo structure
+ */
unsigned int uid; /* allows finding which FileInfo structure */
__u32 pid; /* process id who opened file */
__u16 netfid; /* file id from remote */
@@ -616,9 +671,12 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
*/
struct cifsInodeInfo {
- struct list_head llist; /* brlocks for this inode */
bool can_cache_brlcks;
- struct mutex lock_mutex; /* protect two fields above */
+ struct mutex lock_mutex; /*
+ * protect the field above and llist
+ * from every cifsFileInfo structure
+ * from openFileList
+ */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
@@ -704,7 +762,6 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
#endif
-struct mid_q_entry;
/*
* This is the prototype for the mid receive function. This function is for
@@ -1043,12 +1100,7 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
-GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
- to be established on existing mount if we
- have the uid/password or Kerberos credential
- or equivalent for current user */
-/* enable or disable oplocks */
-GLOBAL_EXTERN bool enable_oplocks;
+GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
GLOBAL_EXTERN unsigned int lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
@@ -1075,4 +1127,11 @@ void cifs_oplock_break(struct work_struct *work);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
+/* Operations for different SMB versions */
+#define SMB1_VERSION_STRING "1.0"
+extern struct smb_version_operations smb1_operations;
+extern struct smb_version_values smb1_values;
+#define SMB21_VERSION_STRING "2.1"
+extern struct smb_version_operations smb21_operations;
+extern struct smb_version_values smb21_values;
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 97f5d0371cb..0a6cbfe2761 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,6 +78,8 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
int * /* bytes returned */ , const int long_op);
extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
char *in_buf, int flags);
+extern int cifs_setup_request(struct cifs_ses *, struct kvec *, unsigned int,
+ struct mid_q_entry **);
extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
@@ -88,9 +90,6 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *in_buf ,
struct smb_hdr *out_buf,
int *bytes_returned);
-extern void cifs_add_credits(struct TCP_Server_Info *server,
- const unsigned int add);
-extern void cifs_set_credits(struct TCP_Server_Info *server, const int val);
extern int checkSMB(char *buf, unsigned int length);
extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
extern bool backup_cred(struct cifs_sb_info *);
@@ -115,7 +114,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
void **request_buf);
extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp);
-extern __u64 GetNextMid(struct TCP_Server_Info *server);
extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
@@ -466,6 +464,9 @@ extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
/* asynchronous read support */
struct cifs_readdata {
+ struct kref refcount;
+ struct list_head list;
+ struct completion done;
struct cifsFileInfo *cfile;
struct address_space *mapping;
__u64 offset;
@@ -474,12 +475,13 @@ struct cifs_readdata {
int result;
struct list_head pages;
struct work_struct work;
+ int (*marshal_iov) (struct cifs_readdata *rdata,
+ unsigned int remaining);
unsigned int nr_iov;
struct kvec iov[1];
};
-struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
-void cifs_readdata_free(struct cifs_readdata *rdata);
+void cifs_readdata_release(struct kref *refcount);
int cifs_async_readv(struct cifs_readdata *rdata);
/* asynchronous write support */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6b79efd7d75..4ee522b3f66 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -86,8 +86,31 @@ static struct {
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */
-/* Forward declarations */
-static void cifs_readv_complete(struct work_struct *work);
+#ifdef CONFIG_HIGHMEM
+/*
+ * On arches that have high memory, kmap address space is limited. By
+ * serializing the kmap operations on those arches, we ensure that we don't
+ * end up with a bunch of threads in writeback with partially mapped page
+ * arrays, stuck waiting for kmap to come back. That situation prevents
+ * progress and can deadlock.
+ */
+static DEFINE_MUTEX(cifs_kmap_mutex);
+
+static inline void
+cifs_kmap_lock(void)
+{
+ mutex_lock(&cifs_kmap_mutex);
+}
+
+static inline void
+cifs_kmap_unlock(void)
+{
+ mutex_unlock(&cifs_kmap_mutex);
+}
+#else /* !CONFIG_HIGHMEM */
+#define cifs_kmap_lock() do { ; } while(0)
+#define cifs_kmap_unlock() do { ; } while(0)
+#endif /* CONFIG_HIGHMEM */
/* Mark as invalid, all open files on tree connections since they
were closed when session to server was lost */
@@ -269,7 +292,7 @@ small_smb_init_no_tc(const int smb_command, const int wct,
return rc;
buffer = (struct smb_hdr *)*request_buf;
- buffer->Mid = GetNextMid(ses->server);
+ buffer->Mid = get_next_mid(ses->server);
if (ses->capabilities & CAP_UNICODE)
buffer->Flags2 |= SMBFLG2_UNICODE;
if (ses->capabilities & CAP_STATUS32)
@@ -403,7 +426,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
cFYI(1, "secFlags 0x%x", secFlags);
- pSMB->hdr.Mid = GetNextMid(server);
+ pSMB->hdr.Mid = get_next_mid(server);
pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
@@ -461,7 +484,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
server->maxReq = min_t(unsigned int,
le16_to_cpu(rsp->MaxMpxCount),
cifs_max_pending);
- cifs_set_credits(server, server->maxReq);
+ set_credits(server, server->maxReq);
server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
/* even though we do not use raw we might as well set this
@@ -569,7 +592,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
little endian */
server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
cifs_max_pending);
- cifs_set_credits(server, server->maxReq);
+ set_credits(server, server->maxReq);
/* probably no need to store and check maxvcs */
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
@@ -721,7 +744,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = mid->callback_data;
DeleteMidQEntry(mid);
- cifs_add_credits(server, 1);
+ add_credits(server, 1);
}
int
@@ -783,7 +806,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses)
return rc;
}
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
if (ses->server->sec_mode &
(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
@@ -1385,28 +1408,6 @@ openRetry:
return rc;
}
-struct cifs_readdata *
-cifs_readdata_alloc(unsigned int nr_pages)
-{
- struct cifs_readdata *rdata;
-
- /* readdata + 1 kvec for each page */
- rdata = kzalloc(sizeof(*rdata) +
- sizeof(struct kvec) * nr_pages, GFP_KERNEL);
- if (rdata != NULL) {
- INIT_WORK(&rdata->work, cifs_readv_complete);
- INIT_LIST_HEAD(&rdata->pages);
- }
- return rdata;
-}
-
-void
-cifs_readdata_free(struct cifs_readdata *rdata)
-{
- cifsFileInfo_put(rdata->cfile);
- kfree(rdata);
-}
-
/*
* Discard any remaining data in the current SMB. To do this, we borrow the
* current bigbuf.
@@ -1423,7 +1424,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
length = cifs_read_from_socket(server, server->bigbuf,
min_t(unsigned int, remaining,
- CIFSMaxBufSize + max_header_size()));
+ CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
if (length < 0)
return length;
server->total_read += length;
@@ -1434,38 +1435,14 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return 0;
}
-static inline size_t
-read_rsp_size(void)
-{
- return sizeof(READ_RSP);
-}
-
-static inline unsigned int
-read_data_offset(char *buf)
-{
- READ_RSP *rsp = (READ_RSP *)buf;
- return le16_to_cpu(rsp->DataOffset);
-}
-
-static inline unsigned int
-read_data_length(char *buf)
-{
- READ_RSP *rsp = (READ_RSP *)buf;
- return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
- le16_to_cpu(rsp->DataLength);
-}
-
static int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
int length, len;
- unsigned int data_offset, remaining, data_len;
+ unsigned int data_offset, data_len;
struct cifs_readdata *rdata = mid->callback_data;
char *buf = server->smallbuf;
unsigned int buflen = get_rfc1002_length(buf) + 4;
- u64 eof;
- pgoff_t eof_index;
- struct page *page, *tpage;
cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__,
mid->mid, rdata->offset, rdata->bytes);
@@ -1475,9 +1452,10 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
* can if there's not enough data. At this point, we've read down to
* the Mid.
*/
- len = min_t(unsigned int, buflen, read_rsp_size()) - header_size() + 1;
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
+ HEADER_SIZE(server) + 1;
- rdata->iov[0].iov_base = buf + header_size() - 1;
+ rdata->iov[0].iov_base = buf + HEADER_SIZE(server) - 1;
rdata->iov[0].iov_len = len;
length = cifs_readv_from_socket(server, rdata->iov, 1, len);
@@ -1486,7 +1464,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->total_read += length;
/* Was the SMB read successful? */
- rdata->result = map_smb_to_linux_error(buf, false);
+ rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
cFYI(1, "%s: server returned error %d", __func__,
rdata->result);
@@ -1494,14 +1472,15 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
/* Is there enough to get to the rest of the READ_RSP header? */
- if (server->total_read < read_rsp_size()) {
+ if (server->total_read < server->vals->read_rsp_size) {
cFYI(1, "%s: server returned short header. got=%u expected=%zu",
- __func__, server->total_read, read_rsp_size());
+ __func__, server->total_read,
+ server->vals->read_rsp_size);
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
}
- data_offset = read_data_offset(buf) + 4;
+ data_offset = server->ops->read_data_offset(buf) + 4;
if (data_offset < server->total_read) {
/*
* win2k8 sometimes sends an offset of 0 when the read
@@ -1540,7 +1519,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
rdata->iov[0].iov_base, rdata->iov[0].iov_len);
/* how much data is in the response? */
- data_len = read_data_length(buf);
+ data_len = server->ops->read_data_length(buf);
if (data_offset + data_len > buflen) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
@@ -1548,64 +1527,10 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
/* marshal up the page array */
- len = 0;
- remaining = data_len;
- rdata->nr_iov = 1;
-
- /* determine the eof that the server (probably) has */
- eof = CIFS_I(rdata->mapping->host)->server_eof;
- eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
- cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
-
- list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
- if (remaining >= PAGE_CACHE_SIZE) {
- /* enough data to fill the page */
- rdata->iov[rdata->nr_iov].iov_base = kmap(page);
- rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
- cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
- rdata->nr_iov, page->index,
- rdata->iov[rdata->nr_iov].iov_base,
- rdata->iov[rdata->nr_iov].iov_len);
- ++rdata->nr_iov;
- len += PAGE_CACHE_SIZE;
- remaining -= PAGE_CACHE_SIZE;
- } else if (remaining > 0) {
- /* enough for partial page, fill and zero the rest */
- rdata->iov[rdata->nr_iov].iov_base = kmap(page);
- rdata->iov[rdata->nr_iov].iov_len = remaining;
- cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
- rdata->nr_iov, page->index,
- rdata->iov[rdata->nr_iov].iov_base,
- rdata->iov[rdata->nr_iov].iov_len);
- memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
- '\0', PAGE_CACHE_SIZE - remaining);
- ++rdata->nr_iov;
- len += remaining;
- remaining = 0;
- } else if (page->index > eof_index) {
- /*
- * The VFS will not try to do readahead past the
- * i_size, but it's possible that we have outstanding
- * writes with gaps in the middle and the i_size hasn't
- * caught up yet. Populate those with zeroed out pages
- * to prevent the VFS from repeatedly attempting to
- * fill them until the writes are flushed.
- */
- zero_user(page, 0, PAGE_CACHE_SIZE);
- list_del(&page->lru);
- lru_cache_add_file(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- page_cache_release(page);
- } else {
- /* no need to hold page hostage */
- list_del(&page->lru);
- lru_cache_add_file(page);
- unlock_page(page);
- page_cache_release(page);
- }
- }
+ cifs_kmap_lock();
+ len = rdata->marshal_iov(rdata, data_len);
+ cifs_kmap_unlock();
+ data_len -= len;
/* issue the read if we have any iovecs left to fill */
if (rdata->nr_iov > 1) {
@@ -1621,7 +1546,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
rdata->bytes = length;
cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read,
- buflen, remaining);
+ buflen, data_len);
/* discard anything left over */
if (server->total_read < buflen)
@@ -1632,33 +1557,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
static void
-cifs_readv_complete(struct work_struct *work)
-{
- struct cifs_readdata *rdata = container_of(work,
- struct cifs_readdata, work);
- struct page *page, *tpage;
-
- list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
- list_del(&page->lru);
- lru_cache_add_file(page);
-
- if (rdata->result == 0) {
- kunmap(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
- }
-
- unlock_page(page);
-
- if (rdata->result == 0)
- cifs_readpage_to_fscache(rdata->mapping->host, page);
-
- page_cache_release(page);
- }
- cifs_readdata_free(rdata);
-}
-
-static void
cifs_readv_callback(struct mid_q_entry *mid)
{
struct cifs_readdata *rdata = mid->callback_data;
@@ -1691,7 +1589,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &rdata->work);
DeleteMidQEntry(mid);
- cifs_add_credits(server, 1);
+ add_credits(server, 1);
}
/* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -1744,12 +1642,15 @@ cifs_async_readv(struct cifs_readdata *rdata)
rdata->iov[0].iov_base = smb;
rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+ kref_get(&rdata->refcount);
rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
cifs_readv_receive, cifs_readv_callback,
rdata, false);
if (rc == 0)
cifs_stats_inc(&tcon->num_reads);
+ else
+ kref_put(&rdata->refcount, cifs_readdata_release);
cifs_small_buf_release(smb);
return rc;
@@ -2135,7 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
queue_work(cifsiod_wq, &wdata->work);
DeleteMidQEntry(mid);
- cifs_add_credits(tcon->ses->server, 1);
+ add_credits(tcon->ses->server, 1);
}
/* cifs_async_writev - send an async write, and set up mid to handle result */
@@ -2194,7 +2095,9 @@ cifs_async_writev(struct cifs_writedata *wdata)
* and set the iov_len properly for each one. It may also set
* wdata->bytes too.
*/
+ cifs_kmap_lock();
wdata->marshal_iov(iov, wdata);
+ cifs_kmap_unlock();
cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
@@ -4887,7 +4790,7 @@ getDFSRetry:
/* server pointer checked in called function,
but should never be null here anyway */
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
pSMB->hdr.Tid = ses->ipc_tid;
pSMB->hdr.Uid = ses->Suid;
if (ses->capabilities & CAP_STATUS32)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 981cc62a34d..94b7788c318 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/connect.c
*
- * Copyright (C) International Business Machines Corp., 2002,2009
+ * Copyright (C) International Business Machines Corp., 2002,2011
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -102,7 +102,7 @@ enum {
Opt_srcaddr, Opt_prefixpath,
Opt_iocharset, Opt_sockopt,
Opt_netbiosname, Opt_servern,
- Opt_ver, Opt_sec,
+ Opt_ver, Opt_vers, Opt_sec, Opt_cache,
/* Mount options to be ignored */
Opt_ignore,
@@ -210,9 +210,9 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_netbiosname, "netbiosname=%s" },
{ Opt_servern, "servern=%s" },
{ Opt_ver, "ver=%s" },
- { Opt_ver, "vers=%s" },
- { Opt_ver, "version=%s" },
+ { Opt_vers, "vers=%s" },
{ Opt_sec, "sec=%s" },
+ { Opt_cache, "cache=%s" },
{ Opt_ignore, "cred" },
{ Opt_ignore, "credentials" },
@@ -261,6 +261,26 @@ static const match_table_t cifs_secflavor_tokens = {
{ Opt_sec_err, NULL }
};
+/* cache flavors */
+enum {
+ Opt_cache_loose,
+ Opt_cache_strict,
+ Opt_cache_none,
+ Opt_cache_err
+};
+
+static const match_table_t cifs_cacheflavor_tokens = {
+ { Opt_cache_loose, "loose" },
+ { Opt_cache_strict, "strict" },
+ { Opt_cache_none, "none" },
+ { Opt_cache_err, NULL }
+};
+
+static const match_table_t cifs_smb_version_tokens = {
+ { Smb_1, SMB1_VERSION_STRING },
+ { Smb_21, SMB21_VERSION_STRING },
+};
+
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -549,7 +569,7 @@ allocate_buffers(struct TCP_Server_Info *server)
}
} else if (server->large_buf) {
/* we are reusing a dirty large buf, clear its start */
- memset(server->bigbuf, 0, header_size());
+ memset(server->bigbuf, 0, HEADER_SIZE(server));
}
if (!server->smallbuf) {
@@ -563,7 +583,7 @@ allocate_buffers(struct TCP_Server_Info *server)
/* beginning of smb buffer is cleared in our buf_get */
} else {
/* if existing small buf clear beginning */
- memset(server->smallbuf, 0, header_size());
+ memset(server->smallbuf, 0, HEADER_SIZE(server));
}
return true;
@@ -764,25 +784,6 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
return false;
}
-static struct mid_q_entry *
-find_mid(struct TCP_Server_Info *server, char *buffer)
-{
- struct smb_hdr *buf = (struct smb_hdr *)buffer;
- struct mid_q_entry *mid;
-
- spin_lock(&GlobalMid_Lock);
- list_for_each_entry(mid, &server->pending_mid_q, qhead) {
- if (mid->mid == buf->Mid &&
- mid->mid_state == MID_REQUEST_SUBMITTED &&
- le16_to_cpu(mid->command) == buf->Command) {
- spin_unlock(&GlobalMid_Lock);
- return mid;
- }
- }
- spin_unlock(&GlobalMid_Lock);
- return NULL;
-}
-
void
dequeue_mid(struct mid_q_entry *mid, bool malformed)
{
@@ -934,7 +935,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
unsigned int pdu_length = get_rfc1002_length(buf);
/* make sure this will fit in a large buffer */
- if (pdu_length > CIFSMaxBufSize + max_header_size() - 4) {
+ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
cERROR(1, "SMB response too long (%u bytes)",
pdu_length);
cifs_reconnect(server);
@@ -950,8 +951,8 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
}
/* now read the rest */
- length = cifs_read_from_socket(server, buf + header_size() - 1,
- pdu_length - header_size() + 1 + 4);
+ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
+ pdu_length - HEADER_SIZE(server) + 1 + 4);
if (length < 0)
return length;
server->total_read += length;
@@ -967,7 +968,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
* 48 bytes is enough to display the header and a little bit
* into the payload for debugging purposes.
*/
- length = checkSMB(buf, server->total_read);
+ length = server->ops->check_message(buf, server->total_read);
if (length != 0)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
@@ -1025,7 +1026,7 @@ cifs_demultiplex_thread(void *p)
continue;
/* make sure we have enough to get to the MID */
- if (pdu_length < header_size() - 1 - 4) {
+ if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
cERROR(1, "SMB response too short (%u bytes)",
pdu_length);
cifs_reconnect(server);
@@ -1035,12 +1036,12 @@ cifs_demultiplex_thread(void *p)
/* read down to the MID */
length = cifs_read_from_socket(server, buf + 4,
- header_size() - 1 - 4);
+ HEADER_SIZE(server) - 1 - 4);
if (length < 0)
continue;
server->total_read += length;
- mid_entry = find_mid(server, buf);
+ mid_entry = server->ops->find_mid(server, buf);
if (!mid_entry || !mid_entry->receive)
length = standard_receive3(server, mid_entry);
@@ -1057,12 +1058,15 @@ cifs_demultiplex_thread(void *p)
if (mid_entry != NULL) {
if (!mid_entry->multiRsp || mid_entry->multiEnd)
mid_entry->callback(mid_entry);
- } else if (!is_valid_oplock_break(buf, server)) {
+ } else if (!server->ops->is_oplock_break ||
+ !server->ops->is_oplock_break(buf, server)) {
cERROR(1, "No task to wake, unknown frame received! "
"NumMids %d", atomic_read(&midCount));
- cifs_dump_mem("Received Data is: ", buf, header_size());
+ cifs_dump_mem("Received Data is: ", buf,
+ HEADER_SIZE(server));
#ifdef CONFIG_CIFS_DEBUG2
- cifs_dump_detail(buf);
+ if (server->ops->dump_detail)
+ server->ops->dump_detail(buf);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
@@ -1186,6 +1190,54 @@ static int cifs_parse_security_flavors(char *value,
}
static int
+cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_cacheflavor_tokens, args)) {
+ case Opt_cache_loose:
+ vol->direct_io = false;
+ vol->strict_io = false;
+ break;
+ case Opt_cache_strict:
+ vol->direct_io = false;
+ vol->strict_io = true;
+ break;
+ case Opt_cache_none:
+ vol->direct_io = true;
+ vol->strict_io = false;
+ break;
+ default:
+ cERROR(1, "bad cache= option: %s", value);
+ return 1;
+ }
+ return 0;
+}
+
+static int
+cifs_parse_smb_version(char *value, struct smb_vol *vol)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, cifs_smb_version_tokens, args)) {
+ case Smb_1:
+ vol->ops = &smb1_operations;
+ vol->vals = &smb1_values;
+ break;
+#ifdef CONFIG_CIFS_SMB2
+ case Smb_21:
+ vol->ops = &smb21_operations;
+ vol->vals = &smb21_values;
+ break;
+#endif
+ default:
+ cERROR(1, "Unknown vers= option specified: %s", value);
+ return 1;
+ }
+ return 0;
+}
+
+static int
cifs_parse_mount_options(const char *mountdata, const char *devname,
struct smb_vol *vol)
{
@@ -1203,6 +1255,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
char *string = NULL;
char *tmp_end, *value;
char delim;
+ bool cache_specified = false;
+ static bool cache_warned = false;
separator[0] = ',';
separator[1] = 0;
@@ -1236,6 +1290,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->actimeo = CIFS_DEF_ACTIMEO;
+ /* FIXME: add autonegotiation -- for now, SMB1 is default */
+ vol->ops = &smb1_operations;
+ vol->vals = &smb1_values;
+
if (!mountdata)
goto cifs_parse_mount_err;
@@ -1414,10 +1472,20 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->seal = 1;
break;
case Opt_direct:
- vol->direct_io = 1;
+ cache_specified = true;
+ vol->direct_io = true;
+ vol->strict_io = false;
+ cERROR(1, "The \"directio\" option will be removed in "
+ "3.7. Please switch to the \"cache=none\" "
+ "option.");
break;
case Opt_strictcache:
- vol->strict_io = 1;
+ cache_specified = true;
+ vol->direct_io = false;
+ vol->strict_io = true;
+ cERROR(1, "The \"strictcache\" option will be removed "
+ "in 3.7. Please switch to the \"cache=strict\" "
+ "option.");
break;
case Opt_noac:
printk(KERN_WARNING "CIFS: Mount option noac not "
@@ -1823,8 +1891,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (string == NULL)
goto out_nomem;
- if (strnicmp(string, "cifs", 4) == 0 ||
- strnicmp(string, "1", 1) == 0) {
+ if (strnicmp(string, "1", 1) == 0) {
/* This is the default */
break;
}
@@ -1832,6 +1899,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
printk(KERN_WARNING "CIFS: Invalid version"
" specified\n");
goto cifs_parse_mount_err;
+ case Opt_vers:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (cifs_parse_smb_version(string, vol) != 0)
+ goto cifs_parse_mount_err;
+ break;
case Opt_sec:
string = match_strdup(args);
if (string == NULL)
@@ -1840,6 +1915,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
if (cifs_parse_security_flavors(string, vol) != 0)
goto cifs_parse_mount_err;
break;
+ case Opt_cache:
+ cache_specified = true;
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+
+ if (cifs_parse_cache_flavor(string, vol) != 0)
+ goto cifs_parse_mount_err;
+ break;
default:
/*
* An option we don't recognize. Save it off for later
@@ -1883,6 +1967,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
"specified with no gid= option.\n");
+ /* FIXME: remove this block in 3.7 */
+ if (!cache_specified && !cache_warned) {
+ cache_warned = true;
+ printk(KERN_NOTICE "CIFS: no cache= option specified, using "
+ "\"cache=loose\". This default will change "
+ "to \"cache=strict\" in 3.7.\n");
+ }
+
kfree(mountdata_copy);
return 0;
@@ -2043,6 +2135,9 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
struct smb_vol *vol)
{
+ if ((server->vals != vol->vals) || (server->ops != vol->ops))
+ return 0;
+
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
return 0;
@@ -2165,6 +2260,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
goto out_err;
}
+ tcp_ses->ops = volume_info->ops;
+ tcp_ses->vals = volume_info->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
tcp_ses->hostname = extract_hostname(volume_info->UNC);
if (IS_ERR(tcp_ses->hostname)) {
@@ -3348,6 +3445,18 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
+/*
+ * On hosts with high memory, we can't currently support wsize/rsize that are
+ * larger than we can kmap at once. Cap the rsize/wsize at
+ * LAST_PKMAP * PAGE_SIZE. We'll never be able to fill a read or write request
+ * larger than that anyway.
+ */
+#ifdef CONFIG_HIGHMEM
+#define CIFS_KMAP_SIZE_LIMIT (LAST_PKMAP * PAGE_CACHE_SIZE)
+#else /* CONFIG_HIGHMEM */
+#define CIFS_KMAP_SIZE_LIMIT (1<<24)
+#endif /* CONFIG_HIGHMEM */
+
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
{
@@ -3378,6 +3487,9 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
wsize = min_t(unsigned int, wsize,
server->maxBuf - sizeof(WRITE_REQ) + 4);
+ /* limit to the amount that we can kmap at once */
+ wsize = min_t(unsigned int, wsize, CIFS_KMAP_SIZE_LIMIT);
+
/* hard limit of CIFS_MAX_WSIZE */
wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE);
@@ -3398,18 +3510,15 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
* MS-CIFS indicates that servers are only limited by the client's
* bufsize for reads, testing against win98se shows that it throws
* INVALID_PARAMETER errors if you try to request too large a read.
+ * OS/2 just sends back short reads.
*
- * If the server advertises a MaxBufferSize of less than one page,
- * assume that it also can't satisfy reads larger than that either.
- *
- * FIXME: Is there a better heuristic for this?
+ * If the server doesn't advertise CAP_LARGE_READ_X, then assume that
+ * it can't handle a read request larger than its MaxBufferSize either.
*/
if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
defsize = CIFS_DEFAULT_IOSIZE;
else if (server->capabilities & CAP_LARGE_READ_X)
defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
- else if (server->maxBuf >= PAGE_CACHE_SIZE)
- defsize = CIFSMaxBufSize;
else
defsize = server->maxBuf - sizeof(READ_RSP);
@@ -3422,6 +3531,9 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
if (!(server->capabilities & CAP_LARGE_READ_X))
rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+ /* limit to the amount that we can kmap at once */
+ rsize = min_t(unsigned int, rsize, CIFS_KMAP_SIZE_LIMIT);
+
/* hard limit of CIFS_MAX_RSIZE */
rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
@@ -3571,6 +3683,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
if (cifs_parse_mount_options(mount_data, devname, volume_info))
return -EINVAL;
+
if (volume_info->nullauth) {
cFYI(1, "Anonymous login");
kfree(volume_info->username);
@@ -3844,7 +3957,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
NULL /*no tid */ , 4 /*wct */ );
- smb_buffer->Mid = GetNextMid(ses->server);
+ smb_buffer->Mid = get_next_mid(ses->server);
smb_buffer->Uid = ses->Suid;
pSMB = (TCONX_REQ *) smb_buffer;
pSMBr = (TCONX_RSP *) smb_buffer_response;
@@ -4012,11 +4125,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
if (server->maxBuf != 0)
return 0;
- cifs_set_credits(server, 1);
+ set_credits(server, 1);
rc = CIFSSMBNegotiate(xid, ses);
if (rc == -EAGAIN) {
/* retry only once on 1st time connection */
- cifs_set_credits(server, 1);
+ set_credits(server, 1);
rc = CIFSSMBNegotiate(xid, ses);
if (rc == -EAGAIN)
rc = -EHOSTDOWN;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e7ebb5a2ed1..513adbc211d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -264,6 +264,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
pCifsFile->tlink = cifs_get_tlink(tlink);
mutex_init(&pCifsFile->fh_mutex);
INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
+ INIT_LIST_HEAD(&pCifsFile->llist);
spin_lock(&cifs_file_list_lock);
list_add(&pCifsFile->tlist, &(tlink_tcon(tlink)->openFileList));
@@ -334,9 +335,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
* is closed anyway.
*/
mutex_lock(&cifsi->lock_mutex);
- list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) {
- if (li->netfid != cifs_file->netfid)
- continue;
+ list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) {
list_del(&li->llist);
cifs_del_lock_waiters(li);
kfree(li);
@@ -645,7 +644,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
}
static struct cifsLockInfo *
-cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
+cifs_lock_init(__u64 offset, __u64 length, __u8 type)
{
struct cifsLockInfo *lock =
kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
@@ -654,7 +653,6 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
lock->offset = offset;
lock->length = length;
lock->type = type;
- lock->netfid = netfid;
lock->pid = current->tgid;
INIT_LIST_HEAD(&lock->blist);
init_waitqueue_head(&lock->block_q);
@@ -672,19 +670,20 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
}
static bool
-__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
- __u64 length, __u8 type, __u16 netfid,
- struct cifsLockInfo **conf_lock)
+cifs_find_fid_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
+ __u64 length, __u8 type, struct cifsFileInfo *cur,
+ struct cifsLockInfo **conf_lock)
{
- struct cifsLockInfo *li, *tmp;
+ struct cifsLockInfo *li;
+ struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
- list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ list_for_each_entry(li, &cfile->llist, llist) {
if (offset + length <= li->offset ||
offset >= li->offset + li->length)
continue;
- else if ((type & LOCKING_ANDX_SHARED_LOCK) &&
- ((netfid == li->netfid && current->tgid == li->pid) ||
- type == li->type))
+ else if ((type & server->vals->shared_lock_type) &&
+ ((server->ops->compare_fids(cur, cfile) &&
+ current->tgid == li->pid) || type == li->type))
continue;
else {
*conf_lock = li;
@@ -695,11 +694,23 @@ __cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
}
static bool
-cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
- struct cifsLockInfo **conf_lock)
+cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+ __u8 type, struct cifsLockInfo **conf_lock)
{
- return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
- lock->type, lock->netfid, conf_lock);
+ bool rc = false;
+ struct cifsFileInfo *fid, *tmp;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+
+ spin_lock(&cifs_file_list_lock);
+ list_for_each_entry_safe(fid, tmp, &cinode->openFileList, flist) {
+ rc = cifs_find_fid_lock_conflict(fid, offset, length, type,
+ cfile, conf_lock);
+ if (rc)
+ break;
+ }
+ spin_unlock(&cifs_file_list_lock);
+
+ return rc;
}
/*
@@ -710,22 +721,24 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
* the server or 1 otherwise.
*/
static int
-cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
- __u8 type, __u16 netfid, struct file_lock *flock)
+cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
+ __u8 type, struct file_lock *flock)
{
int rc = 0;
struct cifsLockInfo *conf_lock;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
bool exist;
mutex_lock(&cinode->lock_mutex);
- exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
- &conf_lock);
+ exist = cifs_find_lock_conflict(cfile, offset, length, type,
+ &conf_lock);
if (exist) {
flock->fl_start = conf_lock->offset;
flock->fl_end = conf_lock->offset + conf_lock->length - 1;
flock->fl_pid = conf_lock->pid;
- if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK)
+ if (conf_lock->type & server->vals->shared_lock_type)
flock->fl_type = F_RDLCK;
else
flock->fl_type = F_WRLCK;
@@ -739,10 +752,11 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
}
static void
-cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
+cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
{
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
mutex_lock(&cinode->lock_mutex);
- list_add_tail(&lock->llist, &cinode->llist);
+ list_add_tail(&lock->llist, &cfile->llist);
mutex_unlock(&cinode->lock_mutex);
}
@@ -753,10 +767,11 @@ cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
* 3) -EACCESS, if there is a lock that prevents us and wait is false.
*/
static int
-cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
+cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
bool wait)
{
struct cifsLockInfo *conf_lock;
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
bool exist;
int rc = 0;
@@ -764,9 +779,10 @@ try_again:
exist = false;
mutex_lock(&cinode->lock_mutex);
- exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
+ exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
+ lock->type, &conf_lock);
if (!exist && cinode->can_cache_brlcks) {
- list_add_tail(&lock->llist, &cinode->llist);
+ list_add_tail(&lock->llist, &cfile->llist);
mutex_unlock(&cinode->lock_mutex);
return rc;
}
@@ -860,7 +876,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
struct cifsLockInfo *li, *tmp;
struct cifs_tcon *tcon;
struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
- unsigned int num, max_num;
+ unsigned int num, max_num, max_buf;
LOCKING_ANDX_RANGE *buf, *cur;
int types[] = {LOCKING_ANDX_LARGE_FILES,
LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
@@ -876,8 +892,19 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
return rc;
}
- max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
- sizeof(LOCKING_ANDX_RANGE);
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf) {
+ mutex_unlock(&cinode->lock_mutex);
+ FreeXid(xid);
+ return -EINVAL;
+ }
+
+ max_num = (max_buf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf) {
mutex_unlock(&cinode->lock_mutex);
@@ -888,7 +915,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
for (i = 0; i < 2; i++) {
cur = buf;
num = 0;
- list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ list_for_each_entry_safe(li, tmp, &cfile->llist, llist) {
if (li->type != types[i])
continue;
cur->Pid = cpu_to_le16(li->pid);
@@ -898,7 +925,8 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
if (++num == max_num) {
stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
- li->type, 0, num, buf);
+ (__u8)li->type, 0, num,
+ buf);
if (stored_rc)
rc = stored_rc;
cur = buf;
@@ -909,7 +937,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
if (num) {
stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
- types[i], 0, num, buf);
+ (__u8)types[i], 0, num, buf);
if (stored_rc)
rc = stored_rc;
}
@@ -1053,8 +1081,8 @@ cifs_push_locks(struct cifsFileInfo *cfile)
}
static void
-cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
- bool *wait_flag)
+cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
+ bool *wait_flag, struct TCP_Server_Info *server)
{
if (flock->fl_flags & FL_POSIX)
cFYI(1, "Posix");
@@ -1073,38 +1101,50 @@ cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock,
(~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
- *type = LOCKING_ANDX_LARGE_FILES;
+ *type = server->vals->large_lock_type;
if (flock->fl_type == F_WRLCK) {
cFYI(1, "F_WRLCK ");
+ *type |= server->vals->exclusive_lock_type;
*lock = 1;
} else if (flock->fl_type == F_UNLCK) {
cFYI(1, "F_UNLCK");
+ *type |= server->vals->unlock_lock_type;
*unlock = 1;
/* Check if unlock includes more than one lock range */
} else if (flock->fl_type == F_RDLCK) {
cFYI(1, "F_RDLCK");
- *type |= LOCKING_ANDX_SHARED_LOCK;
+ *type |= server->vals->shared_lock_type;
*lock = 1;
} else if (flock->fl_type == F_EXLCK) {
cFYI(1, "F_EXLCK");
+ *type |= server->vals->exclusive_lock_type;
*lock = 1;
} else if (flock->fl_type == F_SHLCK) {
cFYI(1, "F_SHLCK");
- *type |= LOCKING_ANDX_SHARED_LOCK;
+ *type |= server->vals->shared_lock_type;
*lock = 1;
} else
cFYI(1, "Unknown type of lock");
}
static int
-cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
+cifs_mandatory_lock(int xid, struct cifsFileInfo *cfile, __u64 offset,
+ __u64 length, __u32 type, int lock, int unlock, bool wait)
+{
+ return CIFSSMBLock(xid, tlink_tcon(cfile->tlink), cfile->netfid,
+ current->tgid, length, offset, unlock, lock,
+ (__u8)type, wait, 0);
+}
+
+static int
+cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
bool wait_flag, bool posix_lck, int xid)
{
int rc = 0;
__u64 length = 1 + flock->fl_end - flock->fl_start;
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct TCP_Server_Info *server = tcon->ses->server;
__u16 netfid = cfile->netfid;
if (posix_lck) {
@@ -1114,7 +1154,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
if (!rc)
return rc;
- if (type & LOCKING_ANDX_SHARED_LOCK)
+ if (type & server->vals->shared_lock_type)
posix_lock_type = CIFS_RDLCK;
else
posix_lock_type = CIFS_WRLCK;
@@ -1124,38 +1164,35 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
return rc;
}
- rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid,
- flock);
+ rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock);
if (!rc)
return rc;
/* BB we could chain these into one lock request BB */
- rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
- flock->fl_start, 0, 1, type, 0, 0);
+ rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length, type,
+ 1, 0, false);
if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
- length, flock->fl_start, 1, 0,
- type, 0, 0);
+ rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+ type, 0, 1, false);
flock->fl_type = F_UNLCK;
if (rc != 0)
cERROR(1, "Error unlocking previously locked "
- "range %d during test of lock", rc);
+ "range %d during test of lock", rc);
return 0;
}
- if (type & LOCKING_ANDX_SHARED_LOCK) {
+ if (type & server->vals->shared_lock_type) {
flock->fl_type = F_WRLCK;
return 0;
}
- rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
- flock->fl_start, 0, 1,
- type | LOCKING_ANDX_SHARED_LOCK, 0, 0);
+ rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+ type | server->vals->shared_lock_type, 1, 0,
+ false);
if (rc == 0) {
- rc = CIFSSMBLock(xid, tcon, netfid, current->tgid,
- length, flock->fl_start, 1, 0,
- type | LOCKING_ANDX_SHARED_LOCK,
- 0, 0);
+ rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+ type | server->vals->shared_lock_type,
+ 0, 1, false);
flock->fl_type = F_RDLCK;
if (rc != 0)
cERROR(1, "Error unlocking previously locked "
@@ -1192,7 +1229,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
int types[] = {LOCKING_ANDX_LARGE_FILES,
LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES};
unsigned int i;
- unsigned int max_num, num;
+ unsigned int max_num, num, max_buf;
LOCKING_ANDX_RANGE *buf, *cur;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
@@ -1202,8 +1239,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
INIT_LIST_HEAD(&tmp_llist);
- max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) /
- sizeof(LOCKING_ANDX_RANGE);
+ /*
+ * Accessing maxBuf is racy with cifs_reconnect - need to store value
+ * and check it for zero before using.
+ */
+ max_buf = tcon->ses->server->maxBuf;
+ if (!max_buf)
+ return -EINVAL;
+
+ max_num = (max_buf - sizeof(struct smb_hdr)) /
+ sizeof(LOCKING_ANDX_RANGE);
buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1212,71 +1257,64 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
for (i = 0; i < 2; i++) {
cur = buf;
num = 0;
- list_for_each_entry_safe(li, tmp, &cinode->llist, llist) {
+ list_for_each_entry_safe(li, tmp, &cfile->llist, llist) {
if (flock->fl_start > li->offset ||
(flock->fl_start + length) <
(li->offset + li->length))
continue;
if (current->tgid != li->pid)
continue;
- if (cfile->netfid != li->netfid)
- continue;
if (types[i] != li->type)
continue;
- if (!cinode->can_cache_brlcks) {
- cur->Pid = cpu_to_le16(li->pid);
- cur->LengthLow = cpu_to_le32((u32)li->length);
- cur->LengthHigh =
- cpu_to_le32((u32)(li->length>>32));
- cur->OffsetLow = cpu_to_le32((u32)li->offset);
- cur->OffsetHigh =
- cpu_to_le32((u32)(li->offset>>32));
- /*
- * We need to save a lock here to let us add
- * it again to the inode list if the unlock
- * range request fails on the server.
- */
- list_move(&li->llist, &tmp_llist);
- if (++num == max_num) {
- stored_rc = cifs_lockv(xid, tcon,
- cfile->netfid,
- li->type, num,
- 0, buf);
- if (stored_rc) {
- /*
- * We failed on the unlock range
- * request - add all locks from
- * the tmp list to the head of
- * the inode list.
- */
- cifs_move_llist(&tmp_llist,
- &cinode->llist);
- rc = stored_rc;
- } else
- /*
- * The unlock range request
- * succeed - free the tmp list.
- */
- cifs_free_llist(&tmp_llist);
- cur = buf;
- num = 0;
- } else
- cur++;
- } else {
+ if (cinode->can_cache_brlcks) {
/*
* We can cache brlock requests - simply remove
- * a lock from the inode list.
+ * a lock from the file's list.
*/
list_del(&li->llist);
cifs_del_lock_waiters(li);
kfree(li);
+ continue;
}
+ cur->Pid = cpu_to_le16(li->pid);
+ cur->LengthLow = cpu_to_le32((u32)li->length);
+ cur->LengthHigh = cpu_to_le32((u32)(li->length>>32));
+ cur->OffsetLow = cpu_to_le32((u32)li->offset);
+ cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32));
+ /*
+ * We need to save a lock here to let us add it again to
+ * the file's list if the unlock range request fails on
+ * the server.
+ */
+ list_move(&li->llist, &tmp_llist);
+ if (++num == max_num) {
+ stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
+ li->type, num, 0, buf);
+ if (stored_rc) {
+ /*
+ * We failed on the unlock range
+ * request - add all locks from the tmp
+ * list to the head of the file's list.
+ */
+ cifs_move_llist(&tmp_llist,
+ &cfile->llist);
+ rc = stored_rc;
+ } else
+ /*
+ * The unlock range request succeed -
+ * free the tmp list.
+ */
+ cifs_free_llist(&tmp_llist);
+ cur = buf;
+ num = 0;
+ } else
+ cur++;
}
if (num) {
stored_rc = cifs_lockv(xid, tcon, cfile->netfid,
types[i], num, 0, buf);
if (stored_rc) {
- cifs_move_llist(&tmp_llist, &cinode->llist);
+ cifs_move_llist(&tmp_llist, &cfile->llist);
rc = stored_rc;
} else
cifs_free_llist(&tmp_llist);
@@ -1289,14 +1327,14 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid)
}
static int
-cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
+cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
bool wait_flag, bool posix_lck, int lock, int unlock, int xid)
{
int rc = 0;
__u64 length = 1 + flock->fl_end - flock->fl_start;
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+ struct TCP_Server_Info *server = tcon->ses->server;
__u16 netfid = cfile->netfid;
if (posix_lck) {
@@ -1306,7 +1344,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
if (!rc || rc < 0)
return rc;
- if (type & LOCKING_ANDX_SHARED_LOCK)
+ if (type & server->vals->shared_lock_type)
posix_lock_type = CIFS_RDLCK;
else
posix_lock_type = CIFS_WRLCK;
@@ -1323,24 +1361,24 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
if (lock) {
struct cifsLockInfo *lock;
- lock = cifs_lock_init(flock->fl_start, length, type, netfid);
+ lock = cifs_lock_init(flock->fl_start, length, type);
if (!lock)
return -ENOMEM;
- rc = cifs_lock_add_if(cinode, lock, wait_flag);
+ rc = cifs_lock_add_if(cfile, lock, wait_flag);
if (rc < 0)
kfree(lock);
if (rc <= 0)
goto out;
- rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
- flock->fl_start, 0, 1, type, wait_flag, 0);
+ rc = cifs_mandatory_lock(xid, cfile, flock->fl_start, length,
+ type, 1, 0, wait_flag);
if (rc) {
kfree(lock);
goto out;
}
- cifs_lock_add(cinode, lock);
+ cifs_lock_add(cfile, lock);
} else if (unlock)
rc = cifs_unlock_range(cfile, flock, xid);
@@ -1361,7 +1399,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
struct cifsInodeInfo *cinode;
struct cifsFileInfo *cfile;
__u16 netfid;
- __u8 type;
+ __u32 type;
rc = -EACCES;
xid = GetXid();
@@ -1370,11 +1408,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
"end: %lld", cmd, flock->fl_flags, flock->fl_type,
flock->fl_start, flock->fl_end);
- cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag);
-
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
+
+ cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
+ tcon->ses->server);
+
+ cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
netfid = cfile->netfid;
cinode = CIFS_I(file->f_path.dentry->d_inode);
@@ -2348,24 +2388,224 @@ ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
return cifs_user_writev(iocb, iov, nr_segs, pos);
}
+static struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_vecs, work_func_t complete)
+{
+ struct cifs_readdata *rdata;
+
+ rdata = kzalloc(sizeof(*rdata) +
+ sizeof(struct kvec) * nr_vecs, GFP_KERNEL);
+ if (rdata != NULL) {
+ kref_init(&rdata->refcount);
+ INIT_LIST_HEAD(&rdata->list);
+ init_completion(&rdata->done);
+ INIT_WORK(&rdata->work, complete);
+ INIT_LIST_HEAD(&rdata->pages);
+ }
+ return rdata;
+}
+
+void
+cifs_readdata_release(struct kref *refcount)
+{
+ struct cifs_readdata *rdata = container_of(refcount,
+ struct cifs_readdata, refcount);
+
+ if (rdata->cfile)
+ cifsFileInfo_put(rdata->cfile);
+
+ kfree(rdata);
+}
+
+static int
+cifs_read_allocate_pages(struct list_head *list, unsigned int npages)
+{
+ int rc = 0;
+ struct page *page, *tpage;
+ unsigned int i;
+
+ for (i = 0; i < npages; i++) {
+ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ if (!page) {
+ rc = -ENOMEM;
+ break;
+ }
+ list_add(&page->lru, list);
+ }
+
+ if (rc) {
+ list_for_each_entry_safe(page, tpage, list, lru) {
+ list_del(&page->lru);
+ put_page(page);
+ }
+ }
+ return rc;
+}
+
+static void
+cifs_uncached_readdata_release(struct kref *refcount)
+{
+ struct page *page, *tpage;
+ struct cifs_readdata *rdata = container_of(refcount,
+ struct cifs_readdata, refcount);
+
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ list_del(&page->lru);
+ put_page(page);
+ }
+ cifs_readdata_release(refcount);
+}
+
+static int
+cifs_retry_async_readv(struct cifs_readdata *rdata)
+{
+ int rc;
+
+ do {
+ if (rdata->cfile->invalidHandle) {
+ rc = cifs_reopen_file(rdata->cfile, true);
+ if (rc != 0)
+ continue;
+ }
+ rc = cifs_async_readv(rdata);
+ } while (rc == -EAGAIN);
+
+ return rc;
+}
+
+/**
+ * cifs_readdata_to_iov - copy data from pages in response to an iovec
+ * @rdata: the readdata response with list of pages holding data
+ * @iov: vector in which we should copy the data
+ * @nr_segs: number of segments in vector
+ * @offset: offset into file of the first iovec
+ * @copied: used to return the amount of data copied to the iov
+ *
+ * This function copies data from a list of pages in a readdata response into
+ * an array of iovecs. It will first calculate where the data should go
+ * based on the info in the readdata and then copy the data into that spot.
+ */
+static ssize_t
+cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
+ unsigned long nr_segs, loff_t offset, ssize_t *copied)
+{
+ int rc = 0;
+ struct iov_iter ii;
+ size_t pos = rdata->offset - offset;
+ struct page *page, *tpage;
+ ssize_t remaining = rdata->bytes;
+ unsigned char *pdata;
+
+ /* set up iov_iter and advance to the correct offset */
+ iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
+ iov_iter_advance(&ii, pos);
+
+ *copied = 0;
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ ssize_t copy;
+
+ /* copy a whole page or whatever's left */
+ copy = min_t(ssize_t, remaining, PAGE_SIZE);
+
+ /* ...but limit it to whatever space is left in the iov */
+ copy = min_t(ssize_t, copy, iov_iter_count(&ii));
+
+ /* go while there's data to be copied and no errors */
+ if (copy && !rc) {
+ pdata = kmap(page);
+ rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
+ (int)copy);
+ kunmap(page);
+ if (!rc) {
+ *copied += copy;
+ remaining -= copy;
+ iov_iter_advance(&ii, copy);
+ }
+ }
+
+ list_del(&page->lru);
+ put_page(page);
+ }
+
+ return rc;
+}
+
+static void
+cifs_uncached_readv_complete(struct work_struct *work)
+{
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
+
+ /* if the result is non-zero then the pages weren't kmapped */
+ if (rdata->result == 0) {
+ struct page *page;
+
+ list_for_each_entry(page, &rdata->pages, lru)
+ kunmap(page);
+ }
+
+ complete(&rdata->done);
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+}
+
+static int
+cifs_uncached_read_marshal_iov(struct cifs_readdata *rdata,
+ unsigned int remaining)
+{
+ int len = 0;
+ struct page *page, *tpage;
+
+ rdata->nr_iov = 1;
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ if (remaining >= PAGE_SIZE) {
+ /* enough data to fill the page */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = PAGE_SIZE;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ ++rdata->nr_iov;
+ len += PAGE_SIZE;
+ remaining -= PAGE_SIZE;
+ } else if (remaining > 0) {
+ /* enough for partial page, fill and zero the rest */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = remaining;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+ '\0', PAGE_SIZE - remaining);
+ ++rdata->nr_iov;
+ len += remaining;
+ remaining = 0;
+ } else {
+ /* no need to hold page hostage */
+ list_del(&page->lru);
+ put_page(page);
+ }
+ }
+
+ return len;
+}
+
static ssize_t
cifs_iovec_read(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *poffset)
{
- int rc;
- int xid;
- ssize_t total_read;
- unsigned int bytes_read = 0;
+ ssize_t rc;
size_t len, cur_len;
- int iov_offset = 0;
+ ssize_t total_read = 0;
+ loff_t offset = *poffset;
+ unsigned int npages;
struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *pTcon;
+ struct cifs_tcon *tcon;
struct cifsFileInfo *open_file;
- struct smb_com_read_rsp *pSMBr;
- struct cifs_io_parms io_parms;
- char *read_data;
- unsigned int rsize;
- __u32 pid;
+ struct cifs_readdata *rdata, *tmp;
+ struct list_head rdata_list;
+ pid_t pid;
if (!nr_segs)
return 0;
@@ -2374,14 +2614,10 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
if (!len)
return 0;
- xid = GetXid();
+ INIT_LIST_HEAD(&rdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
- /* FIXME: set up handlers for larger reads and/or convert to async */
- rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
-
open_file = file->private_data;
- pTcon = tlink_tcon(open_file->tlink);
+ tcon = tlink_tcon(open_file->tlink);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -2391,56 +2627,78 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
cFYI(1, "attempting read on write only file instance");
- for (total_read = 0; total_read < len; total_read += bytes_read) {
- cur_len = min_t(const size_t, len - total_read, rsize);
- rc = -EAGAIN;
- read_data = NULL;
+ do {
+ cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+ npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
- while (rc == -EAGAIN) {
- int buf_type = CIFS_NO_BUFFER;
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc != 0)
- break;
- }
- io_parms.netfid = open_file->netfid;
- io_parms.pid = pid;
- io_parms.tcon = pTcon;
- io_parms.offset = *poffset;
- io_parms.length = cur_len;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
- &read_data, &buf_type);
- pSMBr = (struct smb_com_read_rsp *)read_data;
- if (read_data) {
- char *data_offset = read_data + 4 +
- le16_to_cpu(pSMBr->DataOffset);
- if (memcpy_toiovecend(iov, data_offset,
- iov_offset, bytes_read))
- rc = -EFAULT;
- if (buf_type == CIFS_SMALL_BUFFER)
- cifs_small_buf_release(read_data);
- else if (buf_type == CIFS_LARGE_BUFFER)
- cifs_buf_release(read_data);
- read_data = NULL;
- iov_offset += bytes_read;
- }
+ /* allocate a readdata struct */
+ rdata = cifs_readdata_alloc(npages,
+ cifs_uncached_readv_complete);
+ if (!rdata) {
+ rc = -ENOMEM;
+ goto error;
}
- if (rc || (bytes_read == 0)) {
- if (total_read) {
- break;
- } else {
- FreeXid(xid);
- return rc;
+ rc = cifs_read_allocate_pages(&rdata->pages, npages);
+ if (rc)
+ goto error;
+
+ rdata->cfile = cifsFileInfo_get(open_file);
+ rdata->offset = offset;
+ rdata->bytes = cur_len;
+ rdata->pid = pid;
+ rdata->marshal_iov = cifs_uncached_read_marshal_iov;
+
+ rc = cifs_retry_async_readv(rdata);
+error:
+ if (rc) {
+ kref_put(&rdata->refcount,
+ cifs_uncached_readdata_release);
+ break;
+ }
+
+ list_add_tail(&rdata->list, &rdata_list);
+ offset += cur_len;
+ len -= cur_len;
+ } while (len > 0);
+
+ /* if at least one read request send succeeded, then reset rc */
+ if (!list_empty(&rdata_list))
+ rc = 0;
+
+ /* the loop below should proceed in the order of increasing offsets */
+restart_loop:
+ list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ if (!rc) {
+ ssize_t copied;
+
+ /* FIXME: freezable sleep too? */
+ rc = wait_for_completion_killable(&rdata->done);
+ if (rc)
+ rc = -EINTR;
+ else if (rdata->result)
+ rc = rdata->result;
+ else {
+ rc = cifs_readdata_to_iov(rdata, iov,
+ nr_segs, *poffset,
+ &copied);
+ total_read += copied;
+ }
+
+ /* resend call if it's a retryable error */
+ if (rc == -EAGAIN) {
+ rc = cifs_retry_async_readv(rdata);
+ goto restart_loop;
}
- } else {
- cifs_stats_bytes_read(pTcon, bytes_read);
- *poffset += bytes_read;
}
+ list_del_init(&rdata->list);
+ kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- FreeXid(xid);
- return total_read;
+ cifs_stats_bytes_read(tcon, total_read);
+ *poffset += total_read;
+
+ return total_read ? total_read : rc;
}
ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
@@ -2615,6 +2873,100 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return rc;
}
+static void
+cifs_readv_complete(struct work_struct *work)
+{
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
+ struct page *page, *tpage;
+
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+
+ if (rdata->result == 0) {
+ kunmap(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
+ unlock_page(page);
+
+ if (rdata->result == 0)
+ cifs_readpage_to_fscache(rdata->mapping->host, page);
+
+ page_cache_release(page);
+ }
+ kref_put(&rdata->refcount, cifs_readdata_release);
+}
+
+static int
+cifs_readpages_marshal_iov(struct cifs_readdata *rdata, unsigned int remaining)
+{
+ int len = 0;
+ struct page *page, *tpage;
+ u64 eof;
+ pgoff_t eof_index;
+
+ /* determine the eof that the server (probably) has */
+ eof = CIFS_I(rdata->mapping->host)->server_eof;
+ eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+ cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+
+ rdata->nr_iov = 1;
+ list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+ if (remaining >= PAGE_CACHE_SIZE) {
+ /* enough data to fill the page */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ ++rdata->nr_iov;
+ len += PAGE_CACHE_SIZE;
+ remaining -= PAGE_CACHE_SIZE;
+ } else if (remaining > 0) {
+ /* enough for partial page, fill and zero the rest */
+ rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+ rdata->iov[rdata->nr_iov].iov_len = remaining;
+ cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+ rdata->nr_iov, page->index,
+ rdata->iov[rdata->nr_iov].iov_base,
+ rdata->iov[rdata->nr_iov].iov_len);
+ memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+ '\0', PAGE_CACHE_SIZE - remaining);
+ ++rdata->nr_iov;
+ len += remaining;
+ remaining = 0;
+ } else if (page->index > eof_index) {
+ /*
+ * The VFS will not try to do readahead past the
+ * i_size, but it's possible that we have outstanding
+ * writes with gaps in the middle and the i_size hasn't
+ * caught up yet. Populate those with zeroed out pages
+ * to prevent the VFS from repeatedly attempting to
+ * fill them until the writes are flushed.
+ */
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
+ } else {
+ /* no need to hold page hostage */
+ list_del(&page->lru);
+ lru_cache_add_file(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+
+ return len;
+}
+
static int cifs_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned num_pages)
{
@@ -2717,7 +3069,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
nr_pages++;
}
- rdata = cifs_readdata_alloc(nr_pages);
+ rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
if (!rdata) {
/* best to give up if we're out of mem */
list_for_each_entry_safe(page, tpage, &tmplist, lru) {
@@ -2731,24 +3083,16 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
}
spin_lock(&cifs_file_list_lock);
- cifsFileInfo_get(open_file);
spin_unlock(&cifs_file_list_lock);
- rdata->cfile = open_file;
+ rdata->cfile = cifsFileInfo_get(open_file);
rdata->mapping = mapping;
rdata->offset = offset;
rdata->bytes = bytes;
rdata->pid = pid;
+ rdata->marshal_iov = cifs_readpages_marshal_iov;
list_splice_init(&tmplist, &rdata->pages);
- do {
- if (open_file->invalidHandle) {
- rc = cifs_reopen_file(open_file, true);
- if (rc != 0)
- continue;
- }
- rc = cifs_async_readv(rdata);
- } while (rc == -EAGAIN);
-
+ rc = cifs_retry_async_readv(rdata);
if (rc != 0) {
list_for_each_entry_safe(page, tpage, &rdata->pages,
lru) {
@@ -2757,9 +3101,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
}
- cifs_readdata_free(rdata);
+ kref_put(&rdata->refcount, cifs_readdata_release);
break;
}
+
+ kref_put(&rdata->refcount, cifs_readdata_release);
}
return rc;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 4221b5e48a4..6d2667f0c98 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -51,7 +51,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
cifs_sb = CIFS_SB(inode->i_sb);
switch (command) {
+ static bool warned = false;
case CIFS_IOC_CHECKUMOUNT:
+ if (!warned) {
+ warned = true;
+ cERROR(1, "the CIFS_IOC_CHECKMOUNT ioctl will "
+ "be deprecated in 3.7. Please "
+ "migrate away from the use of "
+ "umount.cifs");
+ }
cFYI(1, "User unmount attempted");
if (cifs_sb->mnt_uid == current_uid())
rc = 0;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c29d1aa2c54..557506ae1e2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -212,93 +212,6 @@ cifs_small_buf_release(void *buf_to_free)
return;
}
-/*
- * Find a free multiplex id (SMB mid). Otherwise there could be
- * mid collisions which might cause problems, demultiplexing the
- * wrong response to this request. Multiplex ids could collide if
- * one of a series requests takes much longer than the others, or
- * if a very large number of long lived requests (byte range
- * locks or FindNotify requests) are pending. No more than
- * 64K-1 requests can be outstanding at one time. If no
- * mids are available, return zero. A future optimization
- * could make the combination of mids and uid the key we use
- * to demultiplex on (rather than mid alone).
- * In addition to the above check, the cifs demultiplex
- * code already used the command code as a secondary
- * check of the frame and if signing is negotiated the
- * response would be discarded if the mid were the same
- * but the signature was wrong. Since the mid is not put in the
- * pending queue until later (when it is about to be dispatched)
- * we do have to limit the number of outstanding requests
- * to somewhat less than 64K-1 although it is hard to imagine
- * so many threads being in the vfs at one time.
- */
-__u64 GetNextMid(struct TCP_Server_Info *server)
-{
- __u64 mid = 0;
- __u16 last_mid, cur_mid;
- bool collision;
-
- spin_lock(&GlobalMid_Lock);
-
- /* mid is 16 bit only for CIFS/SMB */
- cur_mid = (__u16)((server->CurrentMid) & 0xffff);
- /* we do not want to loop forever */
- last_mid = cur_mid;
- cur_mid++;
-
- /*
- * This nested loop looks more expensive than it is.
- * In practice the list of pending requests is short,
- * fewer than 50, and the mids are likely to be unique
- * on the first pass through the loop unless some request
- * takes longer than the 64 thousand requests before it
- * (and it would also have to have been a request that
- * did not time out).
- */
- while (cur_mid != last_mid) {
- struct mid_q_entry *mid_entry;
- unsigned int num_mids;
-
- collision = false;
- if (cur_mid == 0)
- cur_mid++;
-
- num_mids = 0;
- list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
- ++num_mids;
- if (mid_entry->mid == cur_mid &&
- mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
- /* This mid is in use, try a different one */
- collision = true;
- break;
- }
- }
-
- /*
- * if we have more than 32k mids in the list, then something
- * is very wrong. Possibly a local user is trying to DoS the
- * box by issuing long-running calls and SIGKILL'ing them. If
- * we get to 2^16 mids then we're in big trouble as this
- * function could loop forever.
- *
- * Go ahead and assign out the mid in this situation, but force
- * an eventual reconnect to clean out the pending_mid_q.
- */
- if (num_mids > 32768)
- server->tcpStatus = CifsNeedReconnect;
-
- if (!collision) {
- mid = (__u64)cur_mid;
- server->CurrentMid = mid;
- break;
- }
- cur_mid++;
- }
- spin_unlock(&GlobalMid_Lock);
- return mid;
-}
-
/* NB: MID can not be set if treeCon not passed in, in that
case it is responsbility of caller to set the mid */
void
@@ -306,8 +219,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
const struct cifs_tcon *treeCon, int word_count
/* length of fixed section (word count) in two byte units */)
{
- struct list_head *temp_item;
- struct cifs_ses *ses;
char *temp = (char *) buffer;
memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
@@ -336,52 +247,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
/* Uid is not converted */
buffer->Uid = treeCon->ses->Suid;
- buffer->Mid = GetNextMid(treeCon->ses->server);
- if (multiuser_mount != 0) {
- /* For the multiuser case, there are few obvious technically */
- /* possible mechanisms to match the local linux user (uid) */
- /* to a valid remote smb user (smb_uid): */
- /* 1) Query Winbind (or other local pam/nss daemon */
- /* for userid/password/logon_domain or credential */
- /* 2) Query Winbind for uid to sid to username mapping */
- /* and see if we have a matching password for existing*/
- /* session for that user perhas getting password by */
- /* adding a new pam_cifs module that stores passwords */
- /* so that the cifs vfs can get at that for all logged*/
- /* on users */
- /* 3) (Which is the mechanism we have chosen) */
- /* Search through sessions to the same server for a */
- /* a match on the uid that was passed in on mount */
- /* with the current processes uid (or euid?) and use */
- /* that smb uid. If no existing smb session for */
- /* that uid found, use the default smb session ie */
- /* the smb session for the volume mounted which is */
- /* the same as would be used if the multiuser mount */
- /* flag were disabled. */
-
- /* BB Add support for establishing new tCon and SMB Session */
- /* with userid/password pairs found on the smb session */
- /* for other target tcp/ip addresses BB */
- if (current_fsuid() != treeCon->ses->linux_uid) {
- cFYI(1, "Multiuser mode and UID "
- "did not match tcon uid");
- spin_lock(&cifs_tcp_ses_lock);
- list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
- ses = list_entry(temp_item, struct cifs_ses, smb_ses_list);
- if (ses->linux_uid == current_fsuid()) {
- if (ses->server == treeCon->ses->server) {
- cFYI(1, "found matching uid substitute right smb_uid");
- buffer->Uid = ses->Suid;
- break;
- } else {
- /* BB eventually call cifs_setup_session here */
- cFYI(1, "local UID found but no smb sess with this server exists");
- }
- }
- }
- spin_unlock(&cifs_tcp_ses_lock);
- }
- }
+ buffer->Mid = get_next_mid(treeCon->ses->server);
}
if (treeCon->Flags & SMB_SHARE_IS_IN_DFS)
buffer->Flags2 |= SMBFLG2_DFS;
@@ -700,22 +566,3 @@ backup_cred(struct cifs_sb_info *cifs_sb)
return false;
}
-
-void
-cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
-{
- spin_lock(&server->req_lock);
- server->credits += add;
- server->in_flight--;
- spin_unlock(&server->req_lock);
- wake_up(&server->request_q);
-}
-
-void
-cifs_set_credits(struct TCP_Server_Info *server, const int val)
-{
- spin_lock(&server->req_lock);
- server->credits = val;
- server->oplocks = val > 1 ? enable_oplocks : false;
- spin_unlock(&server->req_lock);
-}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 0a8224d1c4c..a4217f02fab 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -86,9 +86,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
dentry = d_lookup(parent, name);
if (dentry) {
- /* FIXME: check for inode number changes? */
- if (dentry->d_inode != NULL)
+ inode = dentry->d_inode;
+ /* update inode in place if i_ino didn't change */
+ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ cifs_fattr_to_inode(inode, fattr);
return dentry;
+ }
d_drop(dentry);
dput(dentry);
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
new file mode 100644
index 00000000000..6dec38f5522
--- /dev/null
+++ b/fs/cifs/smb1ops.c
@@ -0,0 +1,243 @@
+/*
+ * SMB1 (CIFS) version specific operations
+ *
+ * Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2 as published
+ * by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "cifspdu.h"
+
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, void *buf,
+ struct mid_q_entry *mid)
+{
+ int rc = 0;
+ struct smb_hdr *in_buf = (struct smb_hdr *)buf;
+
+ /* -4 for RFC1001 length and +2 for BCC field */
+ in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2);
+ in_buf->Command = SMB_COM_NT_CANCEL;
+ in_buf->WordCount = 0;
+ put_bcc(0, in_buf);
+
+ mutex_lock(&server->srv_mutex);
+ rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+ if (rc) {
+ mutex_unlock(&server->srv_mutex);
+ return rc;
+ }
+ rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+ mutex_unlock(&server->srv_mutex);
+
+ cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
+ in_buf->Mid, rc);
+
+ return rc;
+}
+
+static bool
+cifs_compare_fids(struct cifsFileInfo *ob1, struct cifsFileInfo *ob2)
+{
+ return ob1->netfid == ob2->netfid;
+}
+
+static unsigned int
+cifs_read_data_offset(char *buf)
+{
+ READ_RSP *rsp = (READ_RSP *)buf;
+ return le16_to_cpu(rsp->DataOffset);
+}
+
+static unsigned int
+cifs_read_data_length(char *buf)
+{
+ READ_RSP *rsp = (READ_RSP *)buf;
+ return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
+ le16_to_cpu(rsp->DataLength);
+}
+
+static struct mid_q_entry *
+cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
+{
+ struct smb_hdr *buf = (struct smb_hdr *)buffer;
+ struct mid_q_entry *mid;
+
+ spin_lock(&GlobalMid_Lock);
+ list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+ if (mid->mid == buf->Mid &&
+ mid->mid_state == MID_REQUEST_SUBMITTED &&
+ le16_to_cpu(mid->command) == buf->Command) {
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+ }
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return NULL;
+}
+
+static void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
+{
+ spin_lock(&server->req_lock);
+ server->credits += add;
+ server->in_flight--;
+ spin_unlock(&server->req_lock);
+ wake_up(&server->request_q);
+}
+
+static void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+ spin_lock(&server->req_lock);
+ server->credits = val;
+ server->oplocks = val > 1 ? enable_oplocks : false;
+ spin_unlock(&server->req_lock);
+}
+
+static int *
+cifs_get_credits_field(struct TCP_Server_Info *server)
+{
+ return &server->credits;
+}
+
+/*
+ * Find a free multiplex id (SMB mid). Otherwise there could be
+ * mid collisions which might cause problems, demultiplexing the
+ * wrong response to this request. Multiplex ids could collide if
+ * one of a series requests takes much longer than the others, or
+ * if a very large number of long lived requests (byte range
+ * locks or FindNotify requests) are pending. No more than
+ * 64K-1 requests can be outstanding at one time. If no
+ * mids are available, return zero. A future optimization
+ * could make the combination of mids and uid the key we use
+ * to demultiplex on (rather than mid alone).
+ * In addition to the above check, the cifs demultiplex
+ * code already used the command code as a secondary
+ * check of the frame and if signing is negotiated the
+ * response would be discarded if the mid were the same
+ * but the signature was wrong. Since the mid is not put in the
+ * pending queue until later (when it is about to be dispatched)
+ * we do have to limit the number of outstanding requests
+ * to somewhat less than 64K-1 although it is hard to imagine
+ * so many threads being in the vfs at one time.
+ */
+static __u64
+cifs_get_next_mid(struct TCP_Server_Info *server)
+{
+ __u64 mid = 0;
+ __u16 last_mid, cur_mid;
+ bool collision;
+
+ spin_lock(&GlobalMid_Lock);
+
+ /* mid is 16 bit only for CIFS/SMB */
+ cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+ /* we do not want to loop forever */
+ last_mid = cur_mid;
+ cur_mid++;
+
+ /*
+ * This nested loop looks more expensive than it is.
+ * In practice the list of pending requests is short,
+ * fewer than 50, and the mids are likely to be unique
+ * on the first pass through the loop unless some request
+ * takes longer than the 64 thousand requests before it
+ * (and it would also have to have been a request that
+ * did not time out).
+ */
+ while (cur_mid != last_mid) {
+ struct mid_q_entry *mid_entry;
+ unsigned int num_mids;
+
+ collision = false;
+ if (cur_mid == 0)
+ cur_mid++;
+
+ num_mids = 0;
+ list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+ ++num_mids;
+ if (mid_entry->mid == cur_mid &&
+ mid_entry->mid_state == MID_REQUEST_SUBMITTED) {
+ /* This mid is in use, try a different one */
+ collision = true;
+ break;
+ }
+ }
+
+ /*
+ * if we have more than 32k mids in the list, then something
+ * is very wrong. Possibly a local user is trying to DoS the
+ * box by issuing long-running calls and SIGKILL'ing them. If
+ * we get to 2^16 mids then we're in big trouble as this
+ * function could loop forever.
+ *
+ * Go ahead and assign out the mid in this situation, but force
+ * an eventual reconnect to clean out the pending_mid_q.
+ */
+ if (num_mids > 32768)
+ server->tcpStatus = CifsNeedReconnect;
+
+ if (!collision) {
+ mid = (__u64)cur_mid;
+ server->CurrentMid = mid;
+ break;
+ }
+ cur_mid++;
+ }
+ spin_unlock(&GlobalMid_Lock);
+ return mid;
+}
+
+struct smb_version_operations smb1_operations = {
+ .send_cancel = send_nt_cancel,
+ .compare_fids = cifs_compare_fids,
+ .setup_request = cifs_setup_request,
+ .check_receive = cifs_check_receive,
+ .add_credits = cifs_add_credits,
+ .set_credits = cifs_set_credits,
+ .get_credits_field = cifs_get_credits_field,
+ .get_next_mid = cifs_get_next_mid,
+ .read_data_offset = cifs_read_data_offset,
+ .read_data_length = cifs_read_data_length,
+ .map_error = map_smb_to_linux_error,
+ .find_mid = cifs_find_mid,
+ .check_message = checkSMB,
+ .dump_detail = cifs_dump_detail,
+ .is_oplock_break = is_valid_oplock_break,
+};
+
+struct smb_version_values smb1_values = {
+ .version_string = SMB1_VERSION_STRING,
+ .large_lock_type = LOCKING_ANDX_LARGE_FILES,
+ .exclusive_lock_type = 0,
+ .shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
+ .unlock_lock_type = 0,
+ .header_size = sizeof(struct smb_hdr),
+ .max_header_size = MAX_CIFS_HDR_SIZE,
+ .read_rsp_size = sizeof(READ_RSP),
+};
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
new file mode 100644
index 00000000000..f065e89756a
--- /dev/null
+++ b/fs/cifs/smb2ops.c
@@ -0,0 +1,27 @@
+/*
+ * SMB2 version specific operations
+ *
+ * Copyright (c) 2012, Jeff Layton <jlayton@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2 as published
+ * by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "cifsglob.h"
+
+struct smb_version_operations smb21_operations = {
+};
+
+struct smb_version_values smb21_values = {
+ .version_string = SMB21_VERSION_STRING,
+};
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0961336513d..f25d4ea14be 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -304,7 +304,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int optype,
static int
wait_for_free_request(struct TCP_Server_Info *server, const int optype)
{
- return wait_for_free_credits(server, optype, get_credits_field(server));
+ return wait_for_free_credits(server, optype,
+ server->ops->get_credits_field(server));
}
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
@@ -364,16 +365,14 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct kvec *iov,
if (mid == NULL)
return -ENOMEM;
- /* put it on the pending_mid_q */
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&mid->qhead, &server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
-
rc = cifs_sign_smb2(iov, nvec, server, &mid->sequence_number);
- if (rc)
- delete_mid(mid);
+ if (rc) {
+ DeleteMidQEntry(mid);
+ return rc;
+ }
+
*ret_mid = mid;
- return rc;
+ return 0;
}
/*
@@ -396,7 +395,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
rc = cifs_setup_async_request(server, iov, nvec, &mid);
if (rc) {
mutex_unlock(&server->srv_mutex);
- cifs_add_credits(server, 1);
+ add_credits(server, 1);
wake_up(&server->request_q);
return rc;
}
@@ -406,19 +405,23 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
mid->callback_data = cbdata;
mid->mid_state = MID_REQUEST_SUBMITTED;
+ /* put it on the pending_mid_q */
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&mid->qhead, &server->pending_mid_q);
+ spin_unlock(&GlobalMid_Lock);
+
+
cifs_in_send_inc(server);
rc = smb_sendv(server, iov, nvec);
cifs_in_send_dec(server);
cifs_save_when_sent(mid);
mutex_unlock(&server->srv_mutex);
- if (rc)
- goto out_err;
+ if (rc == 0)
+ return 0;
- return rc;
-out_err:
delete_mid(mid);
- cifs_add_credits(server, 1);
+ add_credits(server, 1);
wake_up(&server->request_q);
return rc;
}
@@ -483,41 +486,11 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
return rc;
}
-/*
- * An NT cancel request header looks just like the original request except:
- *
- * The Command is SMB_COM_NT_CANCEL
- * The WordCount is zeroed out
- * The ByteCount is zeroed out
- *
- * This function mangles an existing request buffer into a
- * SMB_COM_NT_CANCEL request and then sends it.
- */
-static int
-send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
- struct mid_q_entry *mid)
+static inline int
+send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid)
{
- int rc = 0;
-
- /* -4 for RFC1001 length and +2 for BCC field */
- in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2);
- in_buf->Command = SMB_COM_NT_CANCEL;
- in_buf->WordCount = 0;
- put_bcc(0, in_buf);
-
- mutex_lock(&server->srv_mutex);
- rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
- if (rc) {
- mutex_unlock(&server->srv_mutex);
- return rc;
- }
- rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
- mutex_unlock(&server->srv_mutex);
-
- cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
- in_buf->Mid, rc);
-
- return rc;
+ return server->ops->send_cancel ?
+ server->ops->send_cancel(server, buf, mid) : 0;
}
int
@@ -544,7 +517,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
return map_smb_to_linux_error(mid->resp_buf, log_error);
}
-static int
+int
cifs_setup_request(struct cifs_ses *ses, struct kvec *iov,
unsigned int nvec, struct mid_q_entry **ret_mid)
{
@@ -607,12 +580,12 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
mutex_lock(&ses->server->srv_mutex);
- rc = cifs_setup_request(ses, iov, n_vec, &midQ);
+ rc = ses->server->ops->setup_request(ses, iov, n_vec, &midQ);
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
cifs_small_buf_release(buf);
/* Update # of requests on wire to server */
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
@@ -636,13 +609,13 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
rc = wait_for_response(ses->server, midQ);
if (rc != 0) {
- send_nt_cancel(ses->server, (struct smb_hdr *)buf, midQ);
+ send_cancel(ses->server, buf, midQ);
spin_lock(&GlobalMid_Lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
cifs_small_buf_release(buf);
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -652,7 +625,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
@@ -670,14 +643,15 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
else
*pRespBufType = CIFS_SMALL_BUFFER;
- rc = cifs_check_receive(midQ, ses->server, flags & CIFS_LOG_ERROR);
+ rc = ses->server->ops->check_receive(midQ, ses->server,
+ flags & CIFS_LOG_ERROR);
/* mark it so buf will not be freed by delete_mid */
if ((flags & CIFS_NO_RESP) == 0)
midQ->resp_buf = NULL;
out:
delete_mid(midQ);
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
@@ -727,7 +701,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
mutex_unlock(&ses->server->srv_mutex);
/* Update # of requests on wire to server */
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
@@ -753,13 +727,13 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = wait_for_response(ses->server, midQ);
if (rc != 0) {
- send_nt_cancel(ses->server, in_buf, midQ);
+ send_cancel(ses->server, in_buf, midQ);
spin_lock(&GlobalMid_Lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
midQ->callback = DeleteMidQEntry;
spin_unlock(&GlobalMid_Lock);
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
spin_unlock(&GlobalMid_Lock);
@@ -767,7 +741,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sync_mid_result(midQ, ses->server);
if (rc != 0) {
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
@@ -783,7 +757,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_check_receive(midQ, ses->server, 0);
out:
delete_mid(midQ);
- cifs_add_credits(ses->server, 1);
+ add_credits(ses->server, 1);
return rc;
}
@@ -807,7 +781,7 @@ send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
pSMB->Timeout = 0;
- pSMB->hdr.Mid = GetNextMid(ses->server);
+ pSMB->hdr.Mid = get_next_mid(ses->server);
return SendReceive(xid, ses, in_buf, out_buf,
&bytes_returned, 0);
@@ -898,7 +872,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (in_buf->Command == SMB_COM_TRANSACTION2) {
/* POSIX lock. We send a NT_CANCEL SMB to cause the
blocking lock to return. */
- rc = send_nt_cancel(ses->server, in_buf, midQ);
+ rc = send_cancel(ses->server, in_buf, midQ);
if (rc) {
delete_mid(midQ);
return rc;
@@ -919,7 +893,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = wait_for_response(ses->server, midQ);
if (rc) {
- send_nt_cancel(ses->server, in_buf, midQ);
+ send_cancel(ses->server, in_buf, midQ);
spin_lock(&GlobalMid_Lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2870597b5c9..f1813120d75 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -244,7 +244,7 @@ static void coda_put_super(struct super_block *sb)
static void coda_evict_inode(struct inode *inode)
{
truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ clear_inode(inode);
coda_cache_clear_inode(inode);
}
diff --git a/fs/compat.c b/fs/compat.c
index f2944ace7a7..6161255fac4 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -144,8 +144,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
tmp.st_nlink = stat->nlink;
if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
- SET_UID(tmp.st_uid, stat->uid);
- SET_GID(tmp.st_gid, stat->gid);
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
tmp.st_rdev = old_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
@@ -532,7 +532,7 @@ out:
ssize_t compat_rw_copy_check_uvector(int type,
const struct compat_iovec __user *uvector, unsigned long nr_segs,
unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer, int check_access)
+ struct iovec **ret_pointer)
{
compat_ssize_t tot_len;
struct iovec *iov = *ret_pointer = fast_pointer;
@@ -579,7 +579,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
}
if (len < 0) /* size_t not fitting in compat_ssize_t .. */
goto out;
- if (check_access &&
+ if (type >= 0 &&
!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
ret = -EFAULT;
goto out;
@@ -871,12 +871,12 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
{
int error;
struct file *file;
+ int fput_needed;
struct compat_readdir_callback buf;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.result = 0;
buf.dirent = dirent;
@@ -885,8 +885,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
if (buf.result)
error = buf.result;
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
@@ -953,16 +952,15 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
struct file * file;
struct compat_linux_dirent __user * lastdirent;
struct compat_getdents_callback buf;
+ int fput_needed;
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
+ return -EFAULT;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.current_dir = dirent;
buf.previous = NULL;
@@ -979,8 +977,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
else
error = count - buf.count;
}
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
@@ -1041,16 +1038,15 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
struct file * file;
struct linux_dirent64 __user * lastdirent;
struct compat_getdents_callback64 buf;
+ int fput_needed;
int error;
- error = -EFAULT;
if (!access_ok(VERIFY_WRITE, dirent, count))
- goto out;
+ return -EFAULT;
- error = -EBADF;
- file = fget(fd);
+ file = fget_light(fd, &fput_needed);
if (!file)
- goto out;
+ return -EBADF;
buf.current_dir = dirent;
buf.previous = NULL;
@@ -1068,8 +1064,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
else
error = count - buf.count;
}
- fput(file);
-out:
+ fput_light(file, fput_needed);
return error;
}
#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
@@ -1094,7 +1089,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
goto out;
tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov, 1);
+ UIO_FASTIOV, iovstack, &iov);
if (tot_len == 0) {
ret = 0;
goto out;
@@ -1547,7 +1542,6 @@ asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
compat_ptr(a.exp), compat_ptr(a.tvp));
}
-#ifdef HAVE_SET_RESTORE_SIGMASK
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
@@ -1670,11 +1664,9 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
return ret;
}
-#endif /* HAVE_SET_RESTORE_SIGMASK */
#ifdef CONFIG_EPOLL
-#ifdef HAVE_SET_RESTORE_SIGMASK
asmlinkage long compat_sys_epoll_pwait(int epfd,
struct compat_epoll_event __user *events,
int maxevents, int timeout,
@@ -1718,7 +1710,6 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
return err;
}
-#endif /* HAVE_SET_RESTORE_SIGMASK */
#endif /* CONFIG_EPOLL */
diff --git a/fs/dcache.c b/fs/dcache.c
index 4d4852dbaa4..44de6629c8c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -153,16 +153,12 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
* In contrast, 'ct' and 'tcount' can be from a pathname, and do
* need the careful unaligned handling.
*/
-static inline int dentry_cmp(const unsigned char *cs, size_t scount,
- const unsigned char *ct, size_t tcount)
+static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
unsigned long a,b,mask;
- if (unlikely(scount != tcount))
- return 1;
-
for (;;) {
- a = load_unaligned_zeropad(cs);
+ a = *(unsigned long *)cs;
b = load_unaligned_zeropad(ct);
if (tcount < sizeof(unsigned long))
break;
@@ -180,12 +176,8 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
#else
-static inline int dentry_cmp(const unsigned char *cs, size_t scount,
- const unsigned char *ct, size_t tcount)
+static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
- if (scount != tcount)
- return 1;
-
do {
if (*cs != *ct)
return 1;
@@ -198,6 +190,30 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
#endif
+static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
+{
+ const unsigned char *cs;
+ /*
+ * Be careful about RCU walk racing with rename:
+ * use ACCESS_ONCE to fetch the name pointer.
+ *
+ * NOTE! Even if a rename will mean that the length
+ * was not loaded atomically, we don't care. The
+ * RCU walk will check the sequence count eventually,
+ * and catch it. And we won't overrun the buffer,
+ * because we're reading the name pointer atomically,
+ * and a dentry name is guaranteed to be properly
+ * terminated with a NUL byte.
+ *
+ * End result: even if 'len' is wrong, we'll exit
+ * early because the data cannot match (there can
+ * be no NUL in the ct/tcount data)
+ */
+ cs = ACCESS_ONCE(dentry->d_name.name);
+ smp_read_barrier_depends();
+ return dentry_string_cmp(cs, ct, tcount);
+}
+
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
@@ -1258,6 +1274,13 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
if (!dentry)
return NULL;
+ /*
+ * We guarantee that the inline name is always NUL-terminated.
+ * This way the memcpy() done by the name switching in rename
+ * will still always have a NUL at the end, even if we might
+ * be overwriting an internal NUL character
+ */
+ dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
dname = kmalloc(name->len + 1, GFP_KERNEL);
if (!dname) {
@@ -1267,13 +1290,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
} else {
dname = dentry->d_iname;
}
- dentry->d_name.name = dname;
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
+ /* Make sure we always see the terminating NUL character */
+ smp_wmb();
+ dentry->d_name.name = dname;
+
dentry->d_count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
@@ -1439,18 +1465,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
}
list_for_each_entry(alias, &inode->i_dentry, d_alias) {
- struct qstr *qstr = &alias->d_name;
-
/*
* Don't need alias->d_lock here, because aliases with
* d_parent == entry->d_parent are not subject to name or
* parent changes, because the parent inode i_mutex is held.
*/
- if (qstr->hash != hash)
+ if (alias->d_name.hash != hash)
continue;
if (alias->d_parent != entry->d_parent)
continue;
- if (dentry_cmp(qstr->name, qstr->len, name, len))
+ if (alias->d_name.len != len)
+ continue;
+ if (dentry_cmp(alias, name, len))
continue;
__dget(alias);
return alias;
@@ -1489,7 +1515,7 @@ struct dentry *d_make_root(struct inode *root_inode)
struct dentry *res = NULL;
if (root_inode) {
- static const struct qstr name = { .name = "/", .len = 1 };
+ static const struct qstr name = QSTR_INIT("/", 1);
res = __d_alloc(root_inode->i_sb, &name);
if (res)
@@ -1727,6 +1753,48 @@ err_out:
}
EXPORT_SYMBOL(d_add_ci);
+/*
+ * Do the slow-case of the dentry name compare.
+ *
+ * Unlike the dentry_cmp() function, we need to atomically
+ * load the name, length and inode information, so that the
+ * filesystem can rely on them, and can use the 'name' and
+ * 'len' information without worrying about walking off the
+ * end of memory etc.
+ *
+ * Thus the read_seqcount_retry() and the "duplicate" info
+ * in arguments (the low-level filesystem should not look
+ * at the dentry inode or name contents directly, since
+ * rename can change them while we're in RCU mode).
+ */
+enum slow_d_compare {
+ D_COMP_OK,
+ D_COMP_NOMATCH,
+ D_COMP_SEQRETRY,
+};
+
+static noinline enum slow_d_compare slow_dentry_cmp(
+ const struct dentry *parent,
+ struct inode *inode,
+ struct dentry *dentry,
+ unsigned int seq,
+ const struct qstr *name)
+{
+ int tlen = dentry->d_name.len;
+ const char *tname = dentry->d_name.name;
+ struct inode *i = dentry->d_inode;
+
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ cpu_relax();
+ return D_COMP_SEQRETRY;
+ }
+ if (parent->d_op->d_compare(parent, inode,
+ dentry, i,
+ tlen, tname, name))
+ return D_COMP_NOMATCH;
+ return D_COMP_OK;
+}
+
/**
* __d_lookup_rcu - search for a dentry (racy, store-free)
* @parent: parent dentry
@@ -1753,15 +1821,17 @@ EXPORT_SYMBOL(d_add_ci);
* the returned dentry, so long as its parent's seqlock is checked after the
* child is looked up. Thus, an interlocking stepping of sequence lock checks
* is formed, giving integrity down the path walk.
+ *
+ * NOTE! The caller *has* to check the resulting dentry against the sequence
+ * number we've returned before using any of the resulting dentry state!
*/
struct dentry *__d_lookup_rcu(const struct dentry *parent,
const struct qstr *name,
- unsigned *seqp, struct inode **inode)
+ unsigned *seqp, struct inode *inode)
{
- unsigned int len = name->len;
- unsigned int hash = name->hash;
+ u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(parent, hash);
+ struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -1787,49 +1857,47 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
*/
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
- struct inode *i;
- const char *tname;
- int tlen;
-
- if (dentry->d_name.hash != hash)
- continue;
seqretry:
- seq = read_seqcount_begin(&dentry->d_seq);
+ /*
+ * The dentry sequence count protects us from concurrent
+ * renames, and thus protects inode, parent and name fields.
+ *
+ * The caller must perform a seqcount check in order
+ * to do anything useful with the returned dentry,
+ * including using the 'd_inode' pointer.
+ *
+ * NOTE! We do a "raw" seqcount_begin here. That means that
+ * we don't wait for the sequence count to stabilize if it
+ * is in the middle of a sequence change. If we do the slow
+ * dentry compare, we will do seqretries until it is stable,
+ * and if we end up with a successful lookup, we actually
+ * want to exit RCU lookup anyway.
+ */
+ seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
continue;
- tlen = dentry->d_name.len;
- tname = dentry->d_name.name;
- i = dentry->d_inode;
- prefetch(tname);
- /*
- * This seqcount check is required to ensure name and
- * len are loaded atomically, so as not to walk off the
- * edge of memory when walking. If we could load this
- * atomically some other way, we could drop this check.
- */
- if (read_seqcount_retry(&dentry->d_seq, seq))
- goto seqretry;
+ *seqp = seq;
+
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
- if (parent->d_op->d_compare(parent, *inode,
- dentry, i,
- tlen, tname, name))
+ if (dentry->d_name.hash != hashlen_hash(hashlen))
continue;
- } else {
- if (dentry_cmp(tname, tlen, str, len))
+ switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
+ case D_COMP_OK:
+ return dentry;
+ case D_COMP_NOMATCH:
continue;
+ default:
+ goto seqretry;
+ }
}
- /*
- * No extra seqcount check is required after the name
- * compare. The caller must perform a seqcount check in
- * order to do anything useful with the returned dentry
- * anyway.
- */
- *seqp = seq;
- *inode = i;
- return dentry;
+
+ if (dentry->d_name.hash_len != hashlen)
+ continue;
+ if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
+ return dentry;
}
return NULL;
}
@@ -1908,8 +1976,6 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
rcu_read_lock();
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
- const char *tname;
- int tlen;
if (dentry->d_name.hash != hash)
continue;
@@ -1924,15 +1990,17 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
* It is safe to compare names since d_move() cannot
* change the qstr (protected by d_lock).
*/
- tlen = dentry->d_name.len;
- tname = dentry->d_name.name;
if (parent->d_flags & DCACHE_OP_COMPARE) {
+ int tlen = dentry->d_name.len;
+ const char *tname = dentry->d_name.name;
if (parent->d_op->d_compare(parent, parent->d_inode,
dentry, dentry->d_inode,
tlen, tname, name))
goto next;
} else {
- if (dentry_cmp(tname, tlen, str, len))
+ if (dentry->d_name.len != len)
+ goto next;
+ if (dentry_cmp(dentry, str, len))
goto next;
}
@@ -2507,7 +2575,7 @@ static int prepend_path(const struct path *path,
bool slash = false;
int error = 0;
- br_read_lock(vfsmount_lock);
+ br_read_lock(&vfsmount_lock);
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;
@@ -2538,7 +2606,7 @@ static int prepend_path(const struct path *path,
error = prepend(buffer, buflen, "/", 1);
out:
- br_read_unlock(vfsmount_lock);
+ br_read_unlock(&vfsmount_lock);
return error;
global_root:
@@ -3027,6 +3095,7 @@ static void __init dcache_init_early(void)
HASH_EARLY,
&d_hash_shift,
&d_hash_mask,
+ 0,
0);
for (loop = 0; loop < (1U << d_hash_shift); loop++)
@@ -3057,6 +3126,7 @@ static void __init dcache_init(void)
0,
&d_hash_shift,
&d_hash_mask,
+ 0,
0);
for (loop = 0; loop < (1U << d_hash_shift); loop++)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 5dfafdd1dbd..2340f6978d6 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -20,6 +20,7 @@
#include <linux/namei.h>
#include <linux/debugfs.h>
#include <linux/io.h>
+#include <linux/slab.h>
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -520,6 +521,133 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
+struct array_data {
+ void *array;
+ u32 elements;
+};
+
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return nonseekable_open(inode, file);
+}
+
+static size_t format_array(char *buf, size_t bufsize, const char *fmt,
+ u32 *array, u32 array_size)
+{
+ size_t ret = 0;
+ u32 i;
+
+ for (i = 0; i < array_size; i++) {
+ size_t len;
+
+ len = snprintf(buf, bufsize, fmt, array[i]);
+ len++; /* ' ' or '\n' */
+ ret += len;
+
+ if (buf) {
+ buf += len;
+ bufsize -= len;
+ buf[-1] = (i == array_size-1) ? '\n' : ' ';
+ }
+ }
+
+ ret++; /* \0 */
+ if (buf)
+ *buf = '\0';
+
+ return ret;
+}
+
+static char *format_array_alloc(const char *fmt, u32 *array,
+ u32 array_size)
+{
+ size_t len = format_array(NULL, 0, fmt, array, array_size);
+ char *ret;
+
+ ret = kmalloc(len, GFP_KERNEL);
+ if (ret == NULL)
+ return NULL;
+
+ format_array(ret, len, fmt, array, array_size);
+ return ret;
+}
+
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct array_data *data = inode->i_private;
+ size_t size;
+
+ if (*ppos == 0) {
+ if (file->private_data) {
+ kfree(file->private_data);
+ file->private_data = NULL;
+ }
+
+ file->private_data = format_array_alloc("%u", data->array,
+ data->elements);
+ }
+
+ size = 0;
+ if (file->private_data)
+ size = strlen(file->private_data);
+
+ return simple_read_from_buffer(buf, len, ppos,
+ file->private_data, size);
+}
+
+static int u32_array_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+
+ return 0;
+}
+
+static const struct file_operations u32_array_fops = {
+ .owner = THIS_MODULE,
+ .open = u32_array_open,
+ .release = u32_array_release,
+ .read = u32_array_read,
+ .llseek = no_llseek,
+};
+
+/**
+ * debugfs_create_u32_array - create a debugfs file that is used to read u32
+ * array.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @array: u32 array that provides data.
+ * @elements: total number of elements in the array.
+ *
+ * This function creates a file in debugfs with the given name that exports
+ * @array as data. If the @mode variable is so set it can be read from.
+ * Writing is not supported. Seek within the file is also not supported.
+ * Once array is created its size can not be changed.
+ *
+ * The function returns a pointer to dentry on success. If debugfs is not
+ * enabled in the kernel, the value -%ENODEV will be returned.
+ */
+struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+ struct dentry *parent,
+ u32 *array, u32 elements)
+{
+ struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+ if (data == NULL)
+ return NULL;
+
+ data->array = array;
+ data->elements = elements;
+
+ return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_u32_array);
+
#ifdef CONFIG_HAS_IOMEM
/*
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 10f5e0b484d..979c1e309c7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -98,8 +98,8 @@ static struct vfsmount *devpts_mnt;
struct pts_mount_opts {
int setuid;
int setgid;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
umode_t mode;
umode_t ptmxmode;
int newinstance;
@@ -158,11 +158,13 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
{
char *p;
+ kuid_t uid;
+ kgid_t gid;
opts->setuid = 0;
opts->setgid = 0;
- opts->uid = 0;
- opts->gid = 0;
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
opts->mode = DEVPTS_DEFAULT_MODE;
opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
opts->max = NR_UNIX98_PTY_MAX;
@@ -184,13 +186,19 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
case Opt_uid:
if (match_int(&args[0], &option))
return -EINVAL;
- opts->uid = option;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ opts->uid = uid;
opts->setuid = 1;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return -EINVAL;
- opts->gid = option;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ opts->gid = gid;
opts->setgid = 1;
break;
case Opt_mode:
@@ -315,9 +323,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
struct pts_mount_opts *opts = &fsi->mount_opts;
if (opts->setuid)
- seq_printf(seq, ",uid=%u", opts->uid);
+ seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid));
if (opts->setgid)
- seq_printf(seq, ",gid=%u", opts->gid);
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f4aadd15b61..0c85fae3766 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -145,50 +145,6 @@ struct dio {
static struct kmem_cache *dio_cache __read_mostly;
-static void __inode_dio_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
- DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
-
- do {
- prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(&inode->i_dio_count))
- schedule();
- } while (atomic_read(&inode->i_dio_count));
- finish_wait(wq, &q.wait);
-}
-
-/**
- * inode_dio_wait - wait for outstanding DIO requests to finish
- * @inode: inode to wait for
- *
- * Waits for all pending direct I/O requests to finish so that we can
- * proceed with a truncate or equivalent operation.
- *
- * Must be called under a lock that serializes taking new references
- * to i_dio_count, usually by inode->i_mutex.
- */
-void inode_dio_wait(struct inode *inode)
-{
- if (atomic_read(&inode->i_dio_count))
- __inode_dio_wait(inode);
-}
-EXPORT_SYMBOL(inode_dio_wait);
-
-/*
- * inode_dio_done - signal finish of a direct I/O requests
- * @inode: inode the direct I/O happens on
- *
- * This is called once we've finished processing a direct I/O request,
- * and is used to wake up callers waiting for direct I/O to be quiesced.
- */
-void inode_dio_done(struct inode *inode)
-{
- if (atomic_dec_and_test(&inode->i_dio_count))
- wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
-}
-EXPORT_SYMBOL(inode_dio_done);
-
/*
* How many pages are in the queue?
*/
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 90e5997262e..63dc19c54d5 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -310,6 +310,7 @@ void dlm_callback_resume(struct dlm_ls *ls)
}
mutex_unlock(&ls->ls_cb_mutex);
- log_debug(ls, "dlm_callback_resume %d", count);
+ if (count)
+ log_debug(ls, "dlm_callback_resume %d", count);
}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 3a564d197e9..bc342f7ac3a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -38,6 +38,7 @@
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/idr.h>
+#include <linux/ratelimit.h>
#include <asm/uaccess.h>
#include <linux/dlm.h>
@@ -74,6 +75,13 @@ do { \
(ls)->ls_name , ##args); \
} while (0)
+#define log_limit(ls, fmt, args...) \
+do { \
+ if (dlm_config.ci_log_debug) \
+ printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \<