From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- fs/xfs/Kconfig | 85 + fs/xfs/Makefile | 150 + fs/xfs/linux-2.6/kmem.c | 134 + fs/xfs/linux-2.6/kmem.h | 157 + fs/xfs/linux-2.6/mrlock.h | 106 + fs/xfs/linux-2.6/mutex.h | 53 + fs/xfs/linux-2.6/sema.h | 67 + fs/xfs/linux-2.6/spin.h | 56 + fs/xfs/linux-2.6/sv.h | 89 + fs/xfs/linux-2.6/time.h | 51 + fs/xfs/linux-2.6/xfs_aops.c | 1275 ++++++++ fs/xfs/linux-2.6/xfs_buf.c | 1980 +++++++++++++ fs/xfs/linux-2.6/xfs_buf.h | 591 ++++ fs/xfs/linux-2.6/xfs_cred.h | 50 + fs/xfs/linux-2.6/xfs_export.c | 205 ++ fs/xfs/linux-2.6/xfs_export.h | 122 + fs/xfs/linux-2.6/xfs_file.c | 573 ++++ fs/xfs/linux-2.6/xfs_fs_subr.c | 124 + fs/xfs/linux-2.6/xfs_fs_subr.h | 48 + fs/xfs/linux-2.6/xfs_globals.c | 74 + fs/xfs/linux-2.6/xfs_globals.h | 44 + fs/xfs/linux-2.6/xfs_ioctl.c | 1336 +++++++++ fs/xfs/linux-2.6/xfs_ioctl32.c | 163 ++ fs/xfs/linux-2.6/xfs_ioctl32.h | 34 + fs/xfs/linux-2.6/xfs_iops.c | 680 +++++ fs/xfs/linux-2.6/xfs_iops.h | 51 + fs/xfs/linux-2.6/xfs_linux.h | 374 +++ fs/xfs/linux-2.6/xfs_lrw.c | 1082 +++++++ fs/xfs/linux-2.6/xfs_lrw.h | 116 + fs/xfs/linux-2.6/xfs_stats.c | 132 + fs/xfs/linux-2.6/xfs_stats.h | 166 ++ fs/xfs/linux-2.6/xfs_super.c | 912 ++++++ fs/xfs/linux-2.6/xfs_super.h | 138 + fs/xfs/linux-2.6/xfs_sysctl.c | 174 ++ fs/xfs/linux-2.6/xfs_sysctl.h | 114 + fs/xfs/linux-2.6/xfs_version.h | 44 + fs/xfs/linux-2.6/xfs_vfs.c | 330 +++ fs/xfs/linux-2.6/xfs_vfs.h | 223 ++ fs/xfs/linux-2.6/xfs_vnode.c | 455 +++ fs/xfs/linux-2.6/xfs_vnode.h | 666 +++++ fs/xfs/quota/xfs_dquot.c | 1648 +++++++++++ fs/xfs/quota/xfs_dquot.h | 224 ++ fs/xfs/quota/xfs_dquot_item.c | 715 +++++ fs/xfs/quota/xfs_dquot_item.h | 66 + fs/xfs/quota/xfs_qm.c | 2848 ++++++++++++++++++ fs/xfs/quota/xfs_qm.h | 236 ++ fs/xfs/quota/xfs_qm_bhv.c | 410 +++ fs/xfs/quota/xfs_qm_stats.c | 149 + fs/xfs/quota/xfs_qm_stats.h | 68 + fs/xfs/quota/xfs_qm_syscalls.c | 1458 ++++++++++ fs/xfs/quota/xfs_quota_priv.h | 192 ++ fs/xfs/quota/xfs_trans_dquot.c | 941 ++++++ fs/xfs/support/debug.c | 127 + fs/xfs/support/debug.h | 72 + fs/xfs/support/ktrace.c | 346 +++ fs/xfs/support/ktrace.h | 101 + fs/xfs/support/move.c | 66 + fs/xfs/support/move.h | 84 + fs/xfs/support/qsort.c | 155 + fs/xfs/support/qsort.h | 41 + fs/xfs/support/uuid.c | 151 + fs/xfs/support/uuid.h | 48 + fs/xfs/xfs.h | 40 + fs/xfs/xfs_acl.c | 937 ++++++ fs/xfs/xfs_acl.h | 116 + fs/xfs/xfs_ag.h | 345 +++ fs/xfs/xfs_alloc.c | 2623 +++++++++++++++++ fs/xfs/xfs_alloc.h | 203 ++ fs/xfs/xfs_alloc_btree.c | 2204 ++++++++++++++ fs/xfs/xfs_alloc_btree.h | 257 ++ fs/xfs/xfs_arch.h | 213 ++ fs/xfs/xfs_attr.c | 2660 +++++++++++++++++ fs/xfs/xfs_attr.h | 193 ++ fs/xfs/xfs_attr_leaf.c | 3050 ++++++++++++++++++++ fs/xfs/xfs_attr_leaf.h | 308 ++ fs/xfs/xfs_attr_sf.h | 149 + fs/xfs/xfs_behavior.c | 218 ++ fs/xfs/xfs_behavior.h | 204 ++ fs/xfs/xfs_bit.c | 312 ++ fs/xfs/xfs_bit.h | 85 + fs/xfs/xfs_bmap.c | 6246 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap.h | 379 +++ fs/xfs/xfs_bmap_btree.c | 2807 ++++++++++++++++++ fs/xfs/xfs_bmap_btree.h | 701 +++++ fs/xfs/xfs_btree.c | 949 ++++++ fs/xfs/xfs_btree.h | 592 ++++ fs/xfs/xfs_buf_item.c | 1221 ++++++++ fs/xfs/xfs_buf_item.h | 171 ++ fs/xfs/xfs_cap.h | 84 + fs/xfs/xfs_clnt.h | 110 + fs/xfs/xfs_da_btree.c | 2648 +++++++++++++++++ fs/xfs/xfs_da_btree.h | 335 +++ fs/xfs/xfs_dfrag.c | 387 +++ fs/xfs/xfs_dfrag.h | 67 + fs/xfs/xfs_dinode.h | 418 +++ fs/xfs/xfs_dir.c | 1223 ++++++++ fs/xfs/xfs_dir.h | 154 + fs/xfs/xfs_dir2.c | 859 ++++++ fs/xfs/xfs_dir2.h | 109 + fs/xfs/xfs_dir2_block.c | 1248 ++++++++ fs/xfs/xfs_dir2_block.h | 126 + fs/xfs/xfs_dir2_data.c | 855 ++++++ fs/xfs/xfs_dir2_data.h | 231 ++ fs/xfs/xfs_dir2_leaf.c | 1896 ++++++++++++ fs/xfs/xfs_dir2_leaf.h | 360 +++ fs/xfs/xfs_dir2_node.c | 2020 +++++++++++++ fs/xfs/xfs_dir2_node.h | 159 + fs/xfs/xfs_dir2_sf.c | 1317 +++++++++ fs/xfs/xfs_dir2_sf.h | 243 ++ fs/xfs/xfs_dir2_trace.c | 235 ++ fs/xfs/xfs_dir2_trace.h | 86 + fs/xfs/xfs_dir_leaf.c | 2231 ++++++++++++++ fs/xfs/xfs_dir_leaf.h | 248 ++ fs/xfs/xfs_dir_sf.h | 172 ++ fs/xfs/xfs_dmapi.h | 212 ++ fs/xfs/xfs_dmops.c | 52 + fs/xfs/xfs_error.c | 327 +++ fs/xfs/xfs_error.h | 196 ++ fs/xfs/xfs_extfree_item.c | 668 +++++ fs/xfs/xfs_extfree_item.h | 123 + fs/xfs/xfs_fs.h | 527 ++++ fs/xfs/xfs_fsops.c | 616 ++++ fs/xfs/xfs_fsops.h | 67 + fs/xfs/xfs_ialloc.c | 1401 +++++++++ fs/xfs/xfs_ialloc.h | 184 ++ fs/xfs/xfs_ialloc_btree.c | 2094 ++++++++++++++ fs/xfs/xfs_ialloc_btree.h | 314 ++ fs/xfs/xfs_iget.c | 1022 +++++++ fs/xfs/xfs_imap.h | 54 + fs/xfs/xfs_inode.c | 3876 +++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 554 ++++ fs/xfs/xfs_inode_item.c | 1092 +++++++ fs/xfs/xfs_inode_item.h | 197 ++ fs/xfs/xfs_inum.h | 173 ++ fs/xfs/xfs_iocore.c | 133 + fs/xfs/xfs_iomap.c | 1000 +++++++ fs/xfs/xfs_iomap.h | 107 + fs/xfs/xfs_itable.c | 858 ++++++ fs/xfs/xfs_itable.h | 106 + fs/xfs/xfs_log.c | 3560 +++++++++++++++++++++++ fs/xfs/xfs_log.h | 182 ++ fs/xfs/xfs_log_priv.h | 561 ++++ fs/xfs/xfs_log_recover.c | 4098 ++++++++++++++++++++++++++ fs/xfs/xfs_log_recover.h | 81 + fs/xfs/xfs_mac.h | 120 + fs/xfs/xfs_macros.c | 2136 ++++++++++++++ fs/xfs/xfs_macros.h | 104 + fs/xfs/xfs_mount.c | 1586 ++++++++++ fs/xfs/xfs_mount.h | 573 ++++ fs/xfs/xfs_qmops.c | 71 + fs/xfs/xfs_quota.h | 356 +++ fs/xfs/xfs_refcache.h | 66 + fs/xfs/xfs_rename.c | 673 +++++ fs/xfs/xfs_rtalloc.c | 2469 ++++++++++++++++ fs/xfs/xfs_rtalloc.h | 187 ++ fs/xfs/xfs_rw.c | 356 +++ fs/xfs/xfs_rw.h | 154 + fs/xfs/xfs_sb.h | 583 ++++ fs/xfs/xfs_trans.c | 1315 +++++++++ fs/xfs/xfs_trans.h | 1042 +++++++ fs/xfs/xfs_trans_ail.c | 596 ++++ fs/xfs/xfs_trans_buf.c | 1093 +++++++ fs/xfs/xfs_trans_extfree.c | 156 + fs/xfs/xfs_trans_inode.c | 342 +++ fs/xfs/xfs_trans_item.c | 553 ++++ fs/xfs/xfs_trans_priv.h | 73 + fs/xfs/xfs_trans_space.h | 105 + fs/xfs/xfs_types.h | 182 ++ fs/xfs/xfs_utils.c | 488 ++++ fs/xfs/xfs_utils.h | 52 + fs/xfs/xfs_vfsops.c | 1941 +++++++++++++ fs/xfs/xfs_vnodeops.c | 4712 ++++++++++++++++++++++++++++++ 172 files changed, 114893 insertions(+) create mode 100644 fs/xfs/Kconfig create mode 100644 fs/xfs/Makefile create mode 100644 fs/xfs/linux-2.6/kmem.c create mode 100644 fs/xfs/linux-2.6/kmem.h create mode 100644 fs/xfs/linux-2.6/mrlock.h create mode 100644 fs/xfs/linux-2.6/mutex.h create mode 100644 fs/xfs/linux-2.6/sema.h create mode 100644 fs/xfs/linux-2.6/spin.h create mode 100644 fs/xfs/linux-2.6/sv.h create mode 100644 fs/xfs/linux-2.6/time.h create mode 100644 fs/xfs/linux-2.6/xfs_aops.c create mode 100644 fs/xfs/linux-2.6/xfs_buf.c create mode 100644 fs/xfs/linux-2.6/xfs_buf.h create mode 100644 fs/xfs/linux-2.6/xfs_cred.h create mode 100644 fs/xfs/linux-2.6/xfs_export.c create mode 100644 fs/xfs/linux-2.6/xfs_export.h create mode 100644 fs/xfs/linux-2.6/xfs_file.c create mode 100644 fs/xfs/linux-2.6/xfs_fs_subr.c create mode 100644 fs/xfs/linux-2.6/xfs_fs_subr.h create mode 100644 fs/xfs/linux-2.6/xfs_globals.c create mode 100644 fs/xfs/linux-2.6/xfs_globals.h create mode 100644 fs/xfs/linux-2.6/xfs_ioctl.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl32.c create mode 100644 fs/xfs/linux-2.6/xfs_ioctl32.h create mode 100644 fs/xfs/linux-2.6/xfs_iops.c create mode 100644 fs/xfs/linux-2.6/xfs_iops.h create mode 100644 fs/xfs/linux-2.6/xfs_linux.h create mode 100644 fs/xfs/linux-2.6/xfs_lrw.c create mode 100644 fs/xfs/linux-2.6/xfs_lrw.h create mode 100644 fs/xfs/linux-2.6/xfs_stats.c create mode 100644 fs/xfs/linux-2.6/xfs_stats.h create mode 100644 fs/xfs/linux-2.6/xfs_super.c create mode 100644 fs/xfs/linux-2.6/xfs_super.h create mode 100644 fs/xfs/linux-2.6/xfs_sysctl.c create mode 100644 fs/xfs/linux-2.6/xfs_sysctl.h create mode 100644 fs/xfs/linux-2.6/xfs_version.h create mode 100644 fs/xfs/linux-2.6/xfs_vfs.c create mode 100644 fs/xfs/linux-2.6/xfs_vfs.h create mode 100644 fs/xfs/linux-2.6/xfs_vnode.c create mode 100644 fs/xfs/linux-2.6/xfs_vnode.h create mode 100644 fs/xfs/quota/xfs_dquot.c create mode 100644 fs/xfs/quota/xfs_dquot.h create mode 100644 fs/xfs/quota/xfs_dquot_item.c create mode 100644 fs/xfs/quota/xfs_dquot_item.h create mode 100644 fs/xfs/quota/xfs_qm.c create mode 100644 fs/xfs/quota/xfs_qm.h create mode 100644 fs/xfs/quota/xfs_qm_bhv.c create mode 100644 fs/xfs/quota/xfs_qm_stats.c create mode 100644 fs/xfs/quota/xfs_qm_stats.h create mode 100644 fs/xfs/quota/xfs_qm_syscalls.c create mode 100644 fs/xfs/quota/xfs_quota_priv.h create mode 100644 fs/xfs/quota/xfs_trans_dquot.c create mode 100644 fs/xfs/support/debug.c create mode 100644 fs/xfs/support/debug.h create mode 100644 fs/xfs/support/ktrace.c create mode 100644 fs/xfs/support/ktrace.h create mode 100644 fs/xfs/support/move.c create mode 100644 fs/xfs/support/move.h create mode 100644 fs/xfs/support/qsort.c create mode 100644 fs/xfs/support/qsort.h create mode 100644 fs/xfs/support/uuid.c create mode 100644 fs/xfs/support/uuid.h create mode 100644 fs/xfs/xfs.h create mode 100644 fs/xfs/xfs_acl.c create mode 100644 fs/xfs/xfs_acl.h create mode 100644 fs/xfs/xfs_ag.h create mode 100644 fs/xfs/xfs_alloc.c create mode 100644 fs/xfs/xfs_alloc.h create mode 100644 fs/xfs/xfs_alloc_btree.c create mode 100644 fs/xfs/xfs_alloc_btree.h create mode 100644 fs/xfs/xfs_arch.h create mode 100644 fs/xfs/xfs_attr.c create mode 100644 fs/xfs/xfs_attr.h create mode 100644 fs/xfs/xfs_attr_leaf.c create mode 100644 fs/xfs/xfs_attr_leaf.h create mode 100644 fs/xfs/xfs_attr_sf.h create mode 100644 fs/xfs/xfs_behavior.c create mode 100644 fs/xfs/xfs_behavior.h create mode 100644 fs/xfs/xfs_bit.c create mode 100644 fs/xfs/xfs_bit.h create mode 100644 fs/xfs/xfs_bmap.c create mode 100644 fs/xfs/xfs_bmap.h create mode 100644 fs/xfs/xfs_bmap_btree.c create mode 100644 fs/xfs/xfs_bmap_btree.h create mode 100644 fs/xfs/xfs_btree.c create mode 100644 fs/xfs/xfs_btree.h create mode 100644 fs/xfs/xfs_buf_item.c create mode 100644 fs/xfs/xfs_buf_item.h create mode 100644 fs/xfs/xfs_cap.h create mode 100644 fs/xfs/xfs_clnt.h create mode 100644 fs/xfs/xfs_da_btree.c create mode 100644 fs/xfs/xfs_da_btree.h create mode 100644 fs/xfs/xfs_dfrag.c create mode 100644 fs/xfs/xfs_dfrag.h create mode 100644 fs/xfs/xfs_dinode.h create mode 100644 fs/xfs/xfs_dir.c create mode 100644 fs/xfs/xfs_dir.h create mode 100644 fs/xfs/xfs_dir2.c create mode 100644 fs/xfs/xfs_dir2.h create mode 100644 fs/xfs/xfs_dir2_block.c create mode 100644 fs/xfs/xfs_dir2_block.h create mode 100644 fs/xfs/xfs_dir2_data.c create mode 100644 fs/xfs/xfs_dir2_data.h create mode 100644 fs/xfs/xfs_dir2_leaf.c create mode 100644 fs/xfs/xfs_dir2_leaf.h create mode 100644 fs/xfs/xfs_dir2_node.c create mode 100644 fs/xfs/xfs_dir2_node.h create mode 100644 fs/xfs/xfs_dir2_sf.c create mode 100644 fs/xfs/xfs_dir2_sf.h create mode 100644 fs/xfs/xfs_dir2_trace.c create mode 100644 fs/xfs/xfs_dir2_trace.h create mode 100644 fs/xfs/xfs_dir_leaf.c create mode 100644 fs/xfs/xfs_dir_leaf.h create mode 100644 fs/xfs/xfs_dir_sf.h create mode 100644 fs/xfs/xfs_dmapi.h create mode 100644 fs/xfs/xfs_dmops.c create mode 100644 fs/xfs/xfs_error.c create mode 100644 fs/xfs/xfs_error.h create mode 100644 fs/xfs/xfs_extfree_item.c create mode 100644 fs/xfs/xfs_extfree_item.h create mode 100644 fs/xfs/xfs_fs.h create mode 100644 fs/xfs/xfs_fsops.c create mode 100644 fs/xfs/xfs_fsops.h create mode 100644 fs/xfs/xfs_ialloc.c create mode 100644 fs/xfs/xfs_ialloc.h create mode 100644 fs/xfs/xfs_ialloc_btree.c create mode 100644 fs/xfs/xfs_ialloc_btree.h create mode 100644 fs/xfs/xfs_iget.c create mode 100644 fs/xfs/xfs_imap.h create mode 100644 fs/xfs/xfs_inode.c create mode 100644 fs/xfs/xfs_inode.h create mode 100644 fs/xfs/xfs_inode_item.c create mode 100644 fs/xfs/xfs_inode_item.h create mode 100644 fs/xfs/xfs_inum.h create mode 100644 fs/xfs/xfs_iocore.c create mode 100644 fs/xfs/xfs_iomap.c create mode 100644 fs/xfs/xfs_iomap.h create mode 100644 fs/xfs/xfs_itable.c create mode 100644 fs/xfs/xfs_itable.h create mode 100644 fs/xfs/xfs_log.c create mode 100644 fs/xfs/xfs_log.h create mode 100644 fs/xfs/xfs_log_priv.h create mode 100644 fs/xfs/xfs_log_recover.c create mode 100644 fs/xfs/xfs_log_recover.h create mode 100644 fs/xfs/xfs_mac.h create mode 100644 fs/xfs/xfs_macros.c create mode 100644 fs/xfs/xfs_macros.h create mode 100644 fs/xfs/xfs_mount.c create mode 100644 fs/xfs/xfs_mount.h create mode 100644 fs/xfs/xfs_qmops.c create mode 100644 fs/xfs/xfs_quota.h create mode 100644 fs/xfs/xfs_refcache.h create mode 100644 fs/xfs/xfs_rename.c create mode 100644 fs/xfs/xfs_rtalloc.c create mode 100644 fs/xfs/xfs_rtalloc.h create mode 100644 fs/xfs/xfs_rw.c create mode 100644 fs/xfs/xfs_rw.h create mode 100644 fs/xfs/xfs_sb.h create mode 100644 fs/xfs/xfs_trans.c create mode 100644 fs/xfs/xfs_trans.h create mode 100644 fs/xfs/xfs_trans_ail.c create mode 100644 fs/xfs/xfs_trans_buf.c create mode 100644 fs/xfs/xfs_trans_extfree.c create mode 100644 fs/xfs/xfs_trans_inode.c create mode 100644 fs/xfs/xfs_trans_item.c create mode 100644 fs/xfs/xfs_trans_priv.h create mode 100644 fs/xfs/xfs_trans_space.h create mode 100644 fs/xfs/xfs_types.h create mode 100644 fs/xfs/xfs_utils.c create mode 100644 fs/xfs/xfs_utils.h create mode 100644 fs/xfs/xfs_vfsops.c create mode 100644 fs/xfs/xfs_vnodeops.c (limited to 'fs/xfs') diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig new file mode 100644 index 00000000000..c92306f0fdc --- /dev/null +++ b/fs/xfs/Kconfig @@ -0,0 +1,85 @@ +menu "XFS support" + +config XFS_FS + tristate "XFS filesystem support" + select EXPORTFS if NFSD!=n + help + XFS is a high performance journaling filesystem which originated + on the SGI IRIX platform. It is completely multi-threaded, can + support large files and large filesystems, extended attributes, + variable block sizes, is extent based, and makes extensive use of + Btrees (directories, extents, free space) to aid both performance + and scalability. + + Refer to the documentation at + for complete details. This implementation is on-disk compatible + with the IRIX version of XFS. + + To compile this file system support as a module, choose M here: the + module will be called xfs. Be aware, however, that if the file + system of your root partition is compiled as a module, you'll need + to use an initial ramdisk (initrd) to boot. + +config XFS_EXPORT + bool + default y if XFS_FS && EXPORTFS + +config XFS_RT + bool "Realtime support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + If you say Y here you will be able to mount and use XFS filesystems + which contain a realtime subvolume. The realtime subvolume is a + separate area of disk space where only file data is stored. The + realtime subvolume is designed to provide very deterministic + data rates suitable for media streaming applications. + + See the xfs man page in section 5 for a bit more information. + + This feature is unsupported at this time, is not yet fully + functional, and may cause serious problems. + + If unsure, say N. + +config XFS_QUOTA + bool "Quota support" + depends on XFS_FS + help + If you say Y here, you will be able to set limits for disk usage on + a per user and/or a per group basis under XFS. XFS considers quota + information as filesystem metadata and uses journaling to provide a + higher level guarantee of consistency. The on-disk data format for + quota is also compatible with the IRIX version of XFS, allowing a + filesystem to be migrated between Linux and IRIX without any need + for conversion. + + If unsure, say N. More comprehensive documentation can be found in + README.quota in the xfsprogs package. XFS quota can be used either + with or without the generic quota support enabled (CONFIG_QUOTA) - + they are completely independent subsystems. + +config XFS_SECURITY + bool "Security Label support" + depends on XFS_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute namespace for inode security + labels in the XFS filesystem. + + If you are not using a security module that requires using + extended attributes for inode security labels, say N. + +config XFS_POSIX_ACL + bool "POSIX ACL support" + depends on XFS_FS + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N. + +endmenu diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile new file mode 100644 index 00000000000..554e4a18c15 --- /dev/null +++ b/fs/xfs/Makefile @@ -0,0 +1,150 @@ +# +# Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write the Free Software Foundation, Inc., 59 +# Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, +# Mountain View, CA 94043, or: +# +# http://www.sgi.com +# +# For further information regarding this notice, see: +# +# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ +# + +EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char + +ifeq ($(CONFIG_XFS_DEBUG),y) + EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG + EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING +endif +ifeq ($(CONFIG_XFS_TRACE),y) + EXTRA_CFLAGS += -DXFS_ALLOC_TRACE + EXTRA_CFLAGS += -DXFS_ATTR_TRACE + EXTRA_CFLAGS += -DXFS_BLI_TRACE + EXTRA_CFLAGS += -DXFS_BMAP_TRACE + EXTRA_CFLAGS += -DXFS_BMBT_TRACE + EXTRA_CFLAGS += -DXFS_DIR_TRACE + EXTRA_CFLAGS += -DXFS_DIR2_TRACE + EXTRA_CFLAGS += -DXFS_DQUOT_TRACE + EXTRA_CFLAGS += -DXFS_ILOCK_TRACE + EXTRA_CFLAGS += -DXFS_LOG_TRACE + EXTRA_CFLAGS += -DXFS_RW_TRACE + EXTRA_CFLAGS += -DPAGEBUF_TRACE + # EXTRA_CFLAGS += -DXFS_VNODE_TRACE +endif + +obj-$(CONFIG_XFS_FS) += xfs.o + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ + xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o) +ifeq ($(CONFIG_XFS_QUOTA),y) +xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o +endif + +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += linux-2.6/xfs_stats.o +xfs-$(CONFIG_SYSCTL) += linux-2.6/xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += linux-2.6/xfs_ioctl32.o +xfs-$(CONFIG_XFS_EXPORT) += linux-2.6/xfs_export.o + + +xfs-y += xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_behavior.o \ + xfs_bit.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_buf_item.o \ + xfs_da_btree.o \ + xfs_dir.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_dir_leaf.o \ + xfs_error.o \ + xfs_extfree_item.o \ + xfs_fsops.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_iget.o \ + xfs_inode.o \ + xfs_inode_item.o \ + xfs_iocore.o \ + xfs_iomap.o \ + xfs_itable.o \ + xfs_dfrag.o \ + xfs_log.o \ + xfs_log_recover.o \ + xfs_macros.o \ + xfs_mount.o \ + xfs_rename.o \ + xfs_trans.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + xfs_trans_item.o \ + xfs_utils.o \ + xfs_vfsops.o \ + xfs_vnodeops.o \ + xfs_rw.o \ + xfs_dmops.o \ + xfs_qmops.o + +xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o + +# Objects in linux-2.6/ +xfs-y += $(addprefix linux-2.6/, \ + kmem.o \ + xfs_aops.o \ + xfs_buf.o \ + xfs_file.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_ioctl.o \ + xfs_iops.o \ + xfs_lrw.o \ + xfs_super.o \ + xfs_vfs.o \ + xfs_vnode.o) + +# Objects in support/ +xfs-y += $(addprefix support/, \ + debug.o \ + move.o \ + qsort.o \ + uuid.o) + +xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o + diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c new file mode 100644 index 00000000000..364ea8c386b --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include +#include +#include +#include + +#include "time.h" +#include "kmem.h" + +#define MAX_VMALLOCS 6 +#define MAX_SLAB_SIZE 0x20000 + + +void * +kmem_alloc(size_t size, int flags) +{ + int retries = 0; + int lflags = kmem_flags_convert(flags); + void *ptr; + + do { + if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) + ptr = kmalloc(size, lflags); + else + ptr = __vmalloc(size, lflags, PAGE_KERNEL); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + printk(KERN_ERR "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", + __FUNCTION__, lflags); + blk_congestion_wait(WRITE, HZ/50); + } while (1); +} + +void * +kmem_zalloc(size_t size, int flags) +{ + void *ptr; + + ptr = kmem_alloc(size, flags); + if (ptr) + memset((char *)ptr, 0, (int)size); + return ptr; +} + +void +kmem_free(void *ptr, size_t size) +{ + if (((unsigned long)ptr < VMALLOC_START) || + ((unsigned long)ptr >= VMALLOC_END)) { + kfree(ptr); + } else { + vfree(ptr); + } +} + +void * +kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + if (new) + memcpy(new, ptr, + ((oldsize < newsize) ? oldsize : newsize)); + kmem_free(ptr, oldsize); + } + return new; +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, int flags) +{ + int retries = 0; + int lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmem_cache_alloc(zone, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + printk(KERN_ERR "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", + __FUNCTION__, lflags); + blk_congestion_wait(WRITE, HZ/50); + } while (1); +} + +void * +kmem_zone_zalloc(kmem_zone_t *zone, int flags) +{ + void *ptr; + + ptr = kmem_zone_alloc(zone, flags); + if (ptr) + memset((char *)ptr, 0, kmem_cache_size(zone)); + return ptr; +} diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h new file mode 100644 index 00000000000..1397b669b05 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include +#include +#include + +/* + * memory management routines + */ +#define KM_SLEEP 0x0001 +#define KM_NOSLEEP 0x0002 +#define KM_NOFS 0x0004 +#define KM_MAYFAIL 0x0008 + +#define kmem_zone kmem_cache_s +#define kmem_zone_t kmem_cache_t + +typedef unsigned long xfs_pflags_t; + +#define PFLAGS_TEST_NOIO() (current->flags & PF_NOIO) +#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS) + +#define PFLAGS_SET_NOIO() do { \ + current->flags |= PF_NOIO; \ +} while (0) + +#define PFLAGS_CLEAR_NOIO() do { \ + current->flags &= ~PF_NOIO; \ +} while (0) + +/* these could be nested, so we save state */ +#define PFLAGS_SET_FSTRANS(STATEP) do { \ + *(STATEP) = current->flags; \ + current->flags |= PF_FSTRANS; \ +} while (0) + +#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \ + *(STATEP) = current->flags; \ + current->flags &= ~PF_FSTRANS; \ +} while (0) + +/* Restore the PF_FSTRANS state to what was saved in STATEP */ +#define PFLAGS_RESTORE_FSTRANS(STATEP) do { \ + current->flags = ((current->flags & ~PF_FSTRANS) | \ + (*(STATEP) & PF_FSTRANS)); \ +} while (0) + +#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \ + *(NSTATEP) = *(OSTATEP); \ +} while (0) + +static __inline unsigned int kmem_flags_convert(int flags) +{ + int lflags = __GFP_NOWARN; /* we'll report problems, if need be */ + +#ifdef DEBUG + if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) { + printk(KERN_WARNING + "XFS: memory allocation with wrong flags (%x)\n", flags); + BUG(); + } +#endif + + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC; + } else { + lflags |= GFP_KERNEL; + + /* avoid recusive callbacks to filesystem during transactions */ + if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + } + + return lflags; +} + +static __inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); +} + +static __inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +static __inline void +kmem_zone_destroy(kmem_zone_t *zone) +{ + if (zone && kmem_cache_destroy(zone)) + BUG(); +} + +extern void *kmem_zone_zalloc(kmem_zone_t *, int); +extern void *kmem_zone_alloc(kmem_zone_t *, int); + +extern void *kmem_alloc(size_t, int); +extern void *kmem_realloc(void *, size_t, size_t, int); +extern void *kmem_zalloc(size_t, int); +extern void kmem_free(void *, size_t); + +typedef struct shrinker *kmem_shaker_t; +typedef int (*kmem_shake_func_t)(int, unsigned int); + +static __inline kmem_shaker_t +kmem_shake_register(kmem_shake_func_t sfunc) +{ + return set_shrinker(DEFAULT_SEEKS, sfunc); +} + +static __inline void +kmem_shake_deregister(kmem_shaker_t shrinker) +{ + remove_shrinker(shrinker); +} + +static __inline int +kmem_shake_allow(unsigned int gfp_mask) +{ + return (gfp_mask & __GFP_WAIT); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h new file mode 100644 index 00000000000..d2c11a098ff --- /dev/null +++ b/fs/xfs/linux-2.6/mrlock.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include + +enum { MR_NONE, MR_ACCESS, MR_UPDATE }; + +typedef struct { + struct rw_semaphore mr_lock; + int mr_writer; +} mrlock_t; + +#define mrinit(mrp, name) \ + ( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) ) +#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) +#define mrfree(mrp) do { } while (0) +#define mraccess(mrp) mraccessf(mrp, 0) +#define mrupdate(mrp) mrupdatef(mrp, 0) + +static inline void mraccessf(mrlock_t *mrp, int flags) +{ + down_read(&mrp->mr_lock); +} + +static inline void mrupdatef(mrlock_t *mrp, int flags) +{ + down_write(&mrp->mr_lock); + mrp->mr_writer = 1; +} + +static inline int mrtryaccess(mrlock_t *mrp) +{ + return down_read_trylock(&mrp->mr_lock); +} + +static inline int mrtryupdate(mrlock_t *mrp) +{ + if (!down_write_trylock(&mrp->mr_lock)) + return 0; + mrp->mr_writer = 1; + return 1; +} + +static inline void mrunlock(mrlock_t *mrp) +{ + if (mrp->mr_writer) { + mrp->mr_writer = 0; + up_write(&mrp->mr_lock); + } else { + up_read(&mrp->mr_lock); + } +} + +static inline void mrdemote(mrlock_t *mrp) +{ + mrp->mr_writer = 0; + downgrade_write(&mrp->mr_lock); +} + +#ifdef DEBUG +/* + * Debug-only routine, without some platform-specific asm code, we can + * now only answer requests regarding whether we hold the lock for write + * (reader state is outside our visibility, we only track writer state). + * Note: means !ismrlocked would give false positivies, so don't do that. + */ +static inline int ismrlocked(mrlock_t *mrp, int type) +{ + if (mrp && type == MR_UPDATE) + return mrp->mr_writer; + return 1; +} +#endif + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h new file mode 100644 index 00000000000..0b296bb944c --- /dev/null +++ b/fs/xfs/linux-2.6/mutex.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MUTEX_H__ +#define __XFS_SUPPORT_MUTEX_H__ + +#include +#include + +/* + * Map the mutex'es from IRIX to Linux semaphores. + * + * Destroy just simply initializes to -99 which should block all other + * callers. + */ +#define MUTEX_DEFAULT 0x0 +typedef struct semaphore mutex_t; + +#define mutex_init(lock, type, name) sema_init(lock, 1) +#define mutex_destroy(lock) sema_init(lock, -99) +#define mutex_lock(lock, num) down(lock) +#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1) +#define mutex_unlock(lock) up(lock) + +#endif /* __XFS_SUPPORT_MUTEX_H__ */ diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h new file mode 100644 index 00000000000..30b67b4e1cb --- /dev/null +++ b/fs/xfs/linux-2.6/sema.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SEMA_H__ +#define __XFS_SUPPORT_SEMA_H__ + +#include +#include +#include +#include + +/* + * sema_t structure just maps to struct semaphore in Linux kernel. + */ + +typedef struct semaphore sema_t; + +#define init_sema(sp, val, c, d) sema_init(sp, val) +#define initsema(sp, val) sema_init(sp, val) +#define initnsema(sp, val, name) sema_init(sp, val) +#define psema(sp, b) down(sp) +#define vsema(sp) up(sp) +#define valusema(sp) (atomic_read(&(sp)->count)) +#define freesema(sema) + +/* + * Map cpsema (try to get the sema) to down_trylock. We need to switch + * the return values since cpsema returns 1 (acquired) 0 (failed) and + * down_trylock returns the reverse 0 (acquired) 1 (failed). + */ + +#define cpsema(sp) (down_trylock(sp) ? 0 : 1) + +/* + * Didn't do cvsema(sp). Not sure how to map this to up/down/... + * It does a vsema if the values is < 0 other wise nothing. + */ + +#endif /* __XFS_SUPPORT_SEMA_H__ */ diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h new file mode 100644 index 00000000000..bcf60a0b8df --- /dev/null +++ b/fs/xfs/linux-2.6/spin.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SPIN_H__ +#define __XFS_SUPPORT_SPIN_H__ + +#include /* preempt needs this */ +#include + +/* + * Map lock_t from IRIX to Linux spinlocks. + * + * We do not make use of lock_t from interrupt context, so we do not + * have to worry about disabling interrupts at all (unlike IRIX). + */ + +typedef spinlock_t lock_t; + +#define SPLDECL(s) unsigned long s + +#define spinlock_init(lock, name) spin_lock_init(lock) +#define spinlock_destroy(lock) +#define mutex_spinlock(lock) ({ spin_lock(lock); 0; }) +#define mutex_spinunlock(lock, s) do { spin_unlock(lock); (void)s; } while (0) +#define nested_spinlock(lock) spin_lock(lock) +#define nested_spinunlock(lock) spin_unlock(lock) + +#endif /* __XFS_SUPPORT_SPIN_H__ */ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h new file mode 100644 index 00000000000..821d3167e05 --- /dev/null +++ b/fs/xfs/linux-2.6/sv.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SV_H__ +#define __XFS_SUPPORT_SV_H__ + +#include +#include +#include + +/* + * Synchronisation variables. + * + * (Parameters "pri", "svf" and "rts" are not implemented) + */ + +typedef struct sv_s { + wait_queue_head_t waiters; +} sv_t; + +#define SV_FIFO 0x0 /* sv_t is FIFO type */ +#define SV_LIFO 0x2 /* sv_t is LIFO type */ +#define SV_PRIO 0x4 /* sv_t is PRIO type */ +#define SV_KEYED 0x6 /* sv_t is KEYED type */ +#define SV_DEFAULT SV_FIFO + + +static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, + unsigned long timeout) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&sv->waiters, &wait); + __set_current_state(state); + spin_unlock(lock); + + schedule_timeout(timeout); + + remove_wait_queue(&sv->waiters, &wait); +} + +#define init_sv(sv,type,name,flag) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_init(sv,flag,name) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_destroy(sv) \ + /*NOTHING*/ +#define sv_wait(sv, pri, lock, s) \ + _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_wait_sig(sv, pri, lock, s) \ + _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ + _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ + _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_signal(sv) \ + wake_up(&(sv)->waiters) +#define sv_broadcast(sv) \ + wake_up_all(&(sv)->waiters) + +#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h new file mode 100644 index 00000000000..6c6fd0faa8e --- /dev/null +++ b/fs/xfs/linux-2.6/time.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_TIME_H__ +#define __XFS_SUPPORT_TIME_H__ + +#include +#include + +typedef struct timespec timespec_t; + +static inline void delay(long ticks) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(ticks); +} + +static inline void nanotime(struct timespec *tvp) +{ + *tvp = CURRENT_TIME; +} + +#endif /* __XFS_SUPPORT_TIME_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c new file mode 100644 index 00000000000..76a84758073 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -0,0 +1,1275 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_trans.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_iomap.h" +#include +#include + +STATIC void xfs_count_page_state(struct page *, int *, int *, int *); +STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *, + struct writeback_control *wbc, void *, int, int); + +#if defined(XFS_RW_TRACE) +void +xfs_page_trace( + int tag, + struct inode *inode, + struct page *page, + int mask) +{ + xfs_inode_t *ip; + bhv_desc_t *bdp; + vnode_t *vp = LINVFS_GET_VP(inode); + loff_t isize = i_size_read(inode); + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + int delalloc = -1, unmapped = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); + + bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); + ip = XFS_BHVTOI(bdp); + if (!ip->i_rwtrace) + return; + + ktrace_enter(ip->i_rwtrace, + (void *)((unsigned long)tag), + (void *)ip, + (void *)inode, + (void *)page, + (void *)((unsigned long)mask), + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)((unsigned long)((isize >> 32) & 0xffffffff)), + (void *)((unsigned long)(isize & 0xffffffff)), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)delalloc), + (void *)((unsigned long)unmapped), + (void *)((unsigned long)unwritten), + (void *)NULL, + (void *)NULL); +} +#else +#define xfs_page_trace(tag, inode, page, mask) +#endif + +void +linvfs_unwritten_done( + struct buffer_head *bh, + int uptodate) +{ + xfs_buf_t *pb = (xfs_buf_t *)bh->b_private; + + ASSERT(buffer_unwritten(bh)); + bh->b_end_io = NULL; + clear_buffer_unwritten(bh); + if (!uptodate) + pagebuf_ioerror(pb, EIO); + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pagebuf_iodone(pb, 1, 1); + } + end_buffer_async_write(bh, uptodate); +} + +/* + * Issue transactions to convert a buffer range from unwritten + * to written extents (buffered IO). + */ +STATIC void +linvfs_unwritten_convert( + xfs_buf_t *bp) +{ + vnode_t *vp = XFS_BUF_FSPRIVATE(bp, vnode_t *); + int error; + + BUG_ON(atomic_read(&bp->pb_hold) < 1); + VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp), + BMAPI_UNWRITTEN, NULL, NULL, error); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + XFS_BUF_UNDATAIO(bp); + iput(LINVFS_GET_IP(vp)); + pagebuf_iodone(bp, 0, 0); +} + +/* + * Issue transactions to convert a buffer range from unwritten + * to written extents (direct IO). + */ +STATIC void +linvfs_unwritten_convert_direct( + struct inode *inode, + loff_t offset, + ssize_t size, + void *private) +{ + ASSERT(!private || inode == (struct inode *)private); + + /* private indicates an unwritten extent lay beneath this IO */ + if (private && size > 0) { + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); + } +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + ssize_t count, + xfs_iomap_t *mapp, + int flags) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error, nmaps = 1; + + VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error); + if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE))) + VMODIFY(vp); + return -error; +} + +/* + * Finds the corresponding mapping in block @map array of the + * given @offset within a @page. + */ +STATIC xfs_iomap_t * +xfs_offset_to_map( + struct page *page, + xfs_iomap_t *iomapp, + unsigned long offset) +{ + loff_t full_offset; /* offset from start of file */ + + ASSERT(offset < PAGE_CACHE_SIZE); + + full_offset = page->index; /* NB: using 64bit number */ + full_offset <<= PAGE_CACHE_SHIFT; /* offset from file start */ + full_offset += offset; /* offset from page start */ + + if (full_offset < iomapp->iomap_offset) + return NULL; + if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset) + return iomapp; + return NULL; +} + +STATIC void +xfs_map_at_offset( + struct page *page, + struct buffer_head *bh, + unsigned long offset, + int block_bits, + xfs_iomap_t *iomapp) +{ + xfs_daddr_t bn; + loff_t delta; + int sector_shift; + + ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL); + + delta = page->index; + delta <<= PAGE_CACHE_SHIFT; + delta += offset; + delta -= iomapp->iomap_offset; + delta >>= block_bits; + + sector_shift = block_bits - BBSHIFT; + bn = iomapp->iomap_bn >> sector_shift; + bn += delta; + BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME)); + ASSERT((bn << sector_shift) >= iomapp->iomap_bn); + + lock_buffer(bh); + bh->b_blocknr = bn; + bh->b_bdev = iomapp->iomap_target->pbr_bdev; + set_buffer_mapped(bh); + clear_buffer_delay(bh); +} + +/* + * Look for a page at index which is unlocked and contains our + * unwritten extent flagged buffers at its head. Returns page + * locked and with an extra reference count, and length of the + * unwritten extent component on this page that we can write, + * in units of filesystem blocks. + */ +STATIC struct page * +xfs_probe_unwritten_page( + struct address_space *mapping, + pgoff_t index, + xfs_iomap_t *iomapp, + xfs_buf_t *pb, + unsigned long max_offset, + unsigned long *fsbs, + unsigned int bbits) +{ + struct page *page; + + page = find_trylock_page(mapping, index); + if (!page) + return NULL; + if (PageWriteback(page)) + goto out; + + if (page->mapping && page_has_buffers(page)) { + struct buffer_head *bh, *head; + unsigned long p_offset = 0; + + *fsbs = 0; + bh = head = page_buffers(page); + do { + if (!buffer_unwritten(bh) || !buffer_uptodate(bh)) + break; + if (!xfs_offset_to_map(page, iomapp, p_offset)) + break; + if (p_offset >= max_offset) + break; + xfs_map_at_offset(page, bh, p_offset, bbits, iomapp); + set_buffer_unwritten_io(bh); + bh->b_private = pb; + p_offset += bh->b_size; + (*fsbs)++; + } while ((bh = bh->b_this_page) != head); + + if (p_offset) + return page; + } + +out: + unlock_page(page); + return NULL; +} + +/* + * Look for a page at index which is unlocked and not mapped + * yet - clustering for mmap write case. + */ +STATIC unsigned int +xfs_probe_unmapped_page( + struct address_space *mapping, + pgoff_t index, + unsigned int pg_offset) +{ + struct page *page; + int ret = 0; + + page = find_trylock_page(mapping, index); + if (!page) + return 0; + if (PageWriteback(page)) + goto out; + + if (page->mapping && PageDirty(page)) { + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_mapped(bh) || !buffer_uptodate(bh)) + break; + ret += bh->b_size; + if (ret >= pg_offset) + break; + } while ((bh = bh->b_this_page) != head); + } else + ret = PAGE_CACHE_SIZE; + } + +out: + unlock_page(page); + return ret; +} + +STATIC unsigned int +xfs_probe_unmapped_cluster( + struct inode *inode, + struct page *startpage, + struct buffer_head *bh, + struct buffer_head *head) +{ + pgoff_t tindex, tlast, tloff; + unsigned int pg_offset, len, total = 0; + struct address_space *mapping = inode->i_mapping; + + /* First sum forwards in this page */ + do { + if (buffer_mapped(bh)) + break; + total += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + /* If we reached the end of the page, sum forwards in + * following pages. + */ + if (bh == head) { + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + /* Prune this back to avoid pathological behavior */ + tloff = min(tlast, startpage->index + 64); + for (tindex = startpage->index + 1; tindex < tloff; tindex++) { + len = xfs_probe_unmapped_page(mapping, tindex, + PAGE_CACHE_SIZE); + if (!len) + return total; + total += len; + } + if (tindex == tlast && + (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + total += xfs_probe_unmapped_page(mapping, + tindex, pg_offset); + } + } + return total; +} + +/* + * Probe for a given page (index) in the inode and test if it is delayed + * and without unwritten buffers. Returns page locked and with an extra + * reference count. + */ +STATIC struct page * +xfs_probe_delalloc_page( + struct inode *inode, + pgoff_t index) +{ + struct page *page; + + page = find_trylock_page(inode->i_mapping, index); + if (!page) + return NULL; + if (PageWriteback(page)) + goto out; + + if (page->mapping && page_has_buffers(page)) { + struct buffer_head *bh, *head; + int acceptable = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + acceptable = 0; + break; + } else if (buffer_delay(bh)) { + acceptable = 1; + } + } while ((bh = bh->b_this_page) != head); + + if (acceptable) + return page; + } + +out: + unlock_page(page); + return NULL; +} + +STATIC int +xfs_map_unwritten( + struct inode *inode, + struct page *start_page, + struct buffer_head *head, + struct buffer_head *curr, + unsigned long p_offset, + int block_bits, + xfs_iomap_t *iomapp, + struct writeback_control *wbc, + int startio, + int all_bh) +{ + struct buffer_head *bh = curr; + xfs_iomap_t *tmp; + xfs_buf_t *pb; + loff_t offset, size; + unsigned long nblocks = 0; + + offset = start_page->index; + offset <<= PAGE_CACHE_SHIFT; + offset += p_offset; + + /* get an "empty" pagebuf to manage IO completion + * Proper values will be set before returning */ + pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0); + if (!pb) + return -EAGAIN; + + /* Take a reference to the inode to prevent it from + * being reclaimed while we have outstanding unwritten + * extent IO on it. + */ + if ((igrab(inode)) != inode) { + pagebuf_free(pb); + return -EAGAIN; + } + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling pagebuf_iodone too early. + */ + atomic_set(&pb->pb_io_remaining, 1); + + /* First map forwards in the page consecutive buffers + * covering this unwritten extent + */ + do { + if (!buffer_unwritten(bh)) + break; + tmp = xfs_offset_to_map(start_page, iomapp, p_offset); + if (!tmp) + break; + xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp); + set_buffer_unwritten_io(bh); + bh->b_private = pb; + p_offset += bh->b_size; + nblocks++; + } while ((bh = bh->b_this_page) != head); + + atomic_add(nblocks, &pb->pb_io_remaining); + + /* If we reached the end of the page, map forwards in any + * following pages which are also covered by this extent. + */ + if (bh == head) { + struct address_space *mapping = inode->i_mapping; + pgoff_t tindex, tloff, tlast; + unsigned long bs; + unsigned int pg_offset, bbits = inode->i_blkbits; + struct page *page; + + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT; + tloff = min(tlast, tloff); + for (tindex = start_page->index + 1; tindex < tloff; tindex++) { + page = xfs_probe_unwritten_page(mapping, + tindex, iomapp, pb, + PAGE_CACHE_SIZE, &bs, bbits); + if (!page) + break; + nblocks += bs; + atomic_add(bs, &pb->pb_io_remaining); + xfs_convert_page(inode, page, iomapp, wbc, pb, + startio, all_bh); + /* stop if converting the next page might add + * enough blocks that the corresponding byte + * count won't fit in our ulong page buf length */ + if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) + goto enough; + } + + if (tindex == tlast && + (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) { + page = xfs_probe_unwritten_page(mapping, + tindex, iomapp, pb, + pg_offset, &bs, bbits); + if (page) { + nblocks += bs; + atomic_add(bs, &pb->pb_io_remaining); + xfs_convert_page(inode, page, iomapp, wbc, pb, + startio, all_bh); + if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) + goto enough; + } + } + } + +enough: + size = nblocks; /* NB: using 64bit number here */ + size <<= block_bits; /* convert fsb's to byte range */ + + XFS_BUF_DATAIO(pb); + XFS_BUF_ASYNC(pb); + XFS_BUF_SET_SIZE(pb, size); + XFS_BUF_SET_COUNT(pb, size); + XFS_BUF_SET_OFFSET(pb, offset); + XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode)); + XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert); + + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pagebuf_iodone(pb, 1, 1); + } + + return 0; +} + +STATIC void +xfs_submit_page( + struct page *page, + struct writeback_control *wbc, + struct buffer_head *bh_arr[], + int bh_count, + int probed_page, + int clear_dirty) +{ + struct buffer_head *bh; + int i; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + if (clear_dirty) + clear_page_dirty(page); + unlock_page(page); + + if (bh_count) { + for (i = 0; i < bh_count; i++) { + bh = bh_arr[i]; + mark_buffer_async_write(bh); + if (buffer_unwritten(bh)) + set_buffer_unwritten_io(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + } + + for (i = 0; i < bh_count; i++) + submit_bh(WRITE, bh_arr[i]); + + if (probed_page && clear_dirty) + wbc->nr_to_write--; /* Wrote an "extra" page */ + } else { + end_page_writeback(page); + wbc->pages_skipped++; /* We didn't write this page */ + } +} + +/* + * Allocate & map buffers for page given the extent map. Write it out. + * except for the original page of a writepage, this is called on + * delalloc/unwritten pages only, for the original page it is possible + * that the page has no mapping at all. + */ +STATIC void +xfs_convert_page( + struct inode *inode, + struct page *page, + xfs_iomap_t *iomapp, + struct writeback_control *wbc, + void *private, + int startio, + int all_bh) +{ + struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; + xfs_iomap_t *mp = iomapp, *tmp; + unsigned long end, offset; + pgoff_t end_index; + int i = 0, index = 0; + int bbits = inode->i_blkbits; + + end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + if (page->index < end_index) { + end = PAGE_CACHE_SIZE; + } else { + end = i_size_read(inode) & (PAGE_CACHE_SIZE-1); + } + bh = head = page_buffers(page); + do { + offset = i << bbits; + if (offset >= end) + break; + if (!(PageUptodate(page) || buffer_uptodate(bh))) + continue; + if (buffer_mapped(bh) && all_bh && + !(buffer_unwritten(bh) || buffer_delay(bh))) { + if (startio) { + lock_buffer(bh); + bh_arr[index++] = bh; + } + continue; + } + tmp = xfs_offset_to_map(page, mp, offset); + if (!tmp) + continue; + ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); + + /* If this is a new unwritten extent buffer (i.e. one + * that we haven't passed in private data for, we must + * now map this buffer too. + */ + if (buffer_unwritten(bh) && !bh->b_end_io) { + ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN); + xfs_map_unwritten(inode, page, head, bh, offset, + bbits, tmp, wbc, startio, all_bh); + } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) { + xfs_map_at_offset(page, bh, offset, bbits, tmp); + if (buffer_unwritten(bh)) { + set_buffer_unwritten_io(bh); + bh->b_private = private; + ASSERT(private); + } + } + if (startio) { + bh_arr[index++] = bh; + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + } while (i++, (bh = bh->b_this_page) != head); + + if (startio) { + xfs_submit_page(page, wbc, bh_arr, index, 1, index == i); + } else { + unlock_page(page); + } +} + +/* + * Convert & write out a cluster of pages in the same extent as defined + * by mp and following the start page. + */ +STATIC void +xfs_cluster_write( + struct inode *inode, + pgoff_t tindex, + xfs_iomap_t *iomapp, + struct writeback_control *wbc, + int startio, + int all_bh, + pgoff_t tlast) +{ + struct page *page; + + for (; tindex <= tlast; tindex++) { + page = xfs_probe_delalloc_page(inode, tindex); + if (!page) + break; + xfs_convert_page(inode, page, iomapp, wbc, NULL, + startio, all_bh); + } +} + +/* + * Calling this without startio set means we are being asked to make a dirty + * page ready for freeing it's buffers. When called with startio set then + * we are coming from writepage. + * + * When called with startio set it is important that we write the WHOLE + * page if possible. + * The bh->b_state's cannot know if any of the blocks or which block for + * that matter are dirty due to mmap writes, and therefore bh uptodate is + * only vaild if the page itself isn't completely uptodate. Some layers + * may clear the page dirty flag prior to calling write page, under the + * assumption the entire page will be written out; by not writing out the + * whole page the page can be reused before all valid dirty data is + * written out. Note: in the case of a page that has been dirty'd by + * mapwrite and but partially setup by block_prepare_write the + * bh->b_states's will not agree and only ones setup by BPW/BCW will have + * valid state, thus the whole page must be written out thing. + */ + +STATIC int +xfs_page_state_convert( + struct inode *inode, + struct page *page, + struct writeback_control *wbc, + int startio, + int unmapped) /* also implies page uptodate */ +{ + struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; + xfs_iomap_t *iomp, iomap; + loff_t offset; + unsigned long p_offset = 0; + __uint64_t end_offset; + pgoff_t end_index, last_index, tlast; + int len, err, i, cnt = 0, uptodate = 1; + int flags = startio ? 0 : BMAPI_TRYLOCK; + int page_dirty, delalloc = 0; + + /* Is this page beyond the end of the file? */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; + last_index = (offset - 1) >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + if ((page->index >= end_index + 1) || + !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + err = -EIO; + goto error; + } + } + + offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + end_offset = min_t(unsigned long long, + offset + PAGE_CACHE_SIZE, i_size_read(inode)); + + bh = head = page_buffers(page); + iomp = NULL; + + /* + * page_dirty is initially a count of buffers on the page and + * is decrememted as we move each into a cleanable state. + */ + len = bh->b_size; + page_dirty = PAGE_CACHE_SIZE / len; + + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) + continue; + + if (iomp) { + iomp = xfs_offset_to_map(page, &iomap, p_offset); + } + + /* + * First case, map an unwritten extent and prepare for + * extent state conversion transaction on completion. + */ + if (buffer_unwritten(bh)) { + if (!startio) + continue; + if (!iomp) { + err = xfs_map_blocks(inode, offset, len, &iomap, + BMAPI_READ|BMAPI_IGNSTATE); + if (err) { + goto error; + } + iomp = xfs_offset_to_map(page, &iomap, + p_offset); + } + if (iomp) { + if (!bh->b_end_io) { + err = xfs_map_unwritten(inode, page, + head, bh, p_offset, + inode->i_blkbits, iomp, + wbc, startio, unmapped); + if (err) { + goto error; + } + } else { + set_bit(BH_Lock, &bh->b_state); + } + BUG_ON(!buffer_locked(bh)); + bh_arr[cnt++] = bh; + page_dirty--; + } + /* + * Second case, allocate space for a delalloc buffer. + * We can return EAGAIN here in the release page case. + */ + } else if (buffer_delay(bh)) { + if (!iomp) { + delalloc = 1; + err = xfs_map_blocks(inode, offset, len, &iomap, + BMAPI_ALLOCATE | flags); + if (err) { + goto error; + } + iomp = xfs_offset_to_map(page, &iomap, + p_offset); + } + if (iomp) { + xfs_map_at_offset(page, bh, p_offset, + inode->i_blkbits, iomp); + if (startio) { + bh_arr[cnt++] = bh; + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + page_dirty--; + } + } else if ((buffer_uptodate(bh) || PageUptodate(page)) && + (unmapped || startio)) { + + if (!buffer_mapped(bh)) { + int size; + + /* + * Getting here implies an unmapped buffer + * was found, and we are in a path where we + * need to write the whole page out. + */ + if (!iomp) { + size = xfs_probe_unmapped_cluster( + inode, page, bh, head); + err = xfs_map_blocks(inode, offset, + size, &iomap, + BMAPI_WRITE|BMAPI_MMAP); + if (err) { + goto error; + } + iomp = xfs_offset_to_map(page, &iomap, + p_offset); + } + if (iomp) { + xfs_map_at_offset(page, + bh, p_offset, + inode->i_blkbits, iomp); + if (startio) { + bh_arr[cnt++] = bh; + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + page_dirty--; + } + } else if (startio) { + if (buffer_uptodate(bh) && + !test_and_set_bit(BH_Lock, &bh->b_state)) { + bh_arr[cnt++] = bh; + page_dirty--; + } + } + } + } while (offset += len, p_offset += len, + ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + if (startio) + xfs_submit_page(page, wbc, bh_arr, cnt, 0, 1); + + if (iomp) { + tlast = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> + PAGE_CACHE_SHIFT; + if (delalloc && (tlast > last_index)) + tlast = last_index; + xfs_cluster_write(inode, page->index + 1, iomp, wbc, + startio, unmapped, tlast); + } + + return page_dirty; + +error: + for (i = 0; i < cnt; i++) { + unlock_buffer(bh_arr[i]); + } + + /* + * If it's delalloc and we have nowhere to put it, + * throw it away, unless the lower layers told + * us to try again. + */ + if (err != -EAGAIN) { + if (!unmapped) { + block_invalidatepage(page, 0); + } + ClearPageUptodate(page); + } + return err; +} + +STATIC int +__linvfs_get_block( + struct inode *inode, + sector_t iblock, + unsigned long blocks, + struct buffer_head *bh_result, + int create, + int direct, + bmapi_flags_t flags) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + xfs_iomap_t iomap; + int retpbbm = 1; + int error; + ssize_t size; + loff_t offset = (loff_t)iblock << inode->i_blkbits; + + if (blocks) + size = blocks << inode->i_blkbits; + else + size = 1 << inode->i_blkbits; + + VOP_BMAP(vp, offset, size, + create ? flags : BMAPI_READ, &iomap, &retpbbm, error); + if (error) + return -error; + + if (retpbbm == 0) + return 0; + + if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + xfs_daddr_t bn; + loff_t delta; + + /* For unwritten extents do not report a disk address on + * the read case (treat as if we're reading into a hole). + */ + if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { + delta = offset - iomap.iomap_offset; + delta >>= inode->i_blkbits; + + bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT); + bn += delta; + BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME)); + bh_result->b_blocknr = bn; + set_buffer_mapped(bh_result); + } + if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (direct) + bh_result->b_private = inode; + set_buffer_unwritten(bh_result); + set_buffer_delay(bh_result); + } + } + + /* If this is a realtime file, data might be on a new device */ + bh_result->b_bdev = iomap.iomap_target->pbr_bdev; + + /* If we previously allocated a block out beyond eof and + * we are now coming back to use it then we will need to + * flag it as new even if it has a disk address. + */ + if (create && + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || + (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) { + set_buffer_new(bh_result); + } + + if (iomap.iomap_flags & IOMAP_DELAY) { + BUG_ON(direct); + if (create) { + set_buffer_uptodate(bh_result); + set_buffer_mapped(bh_result); + set_buffer_delay(bh_result); + } + } + + if (blocks) { + bh_result->b_size = (ssize_t)min( + (loff_t)(iomap.iomap_bsize - iomap.iomap_delta), + (loff_t)(blocks << inode->i_blkbits)); + } + + return 0; +} + +int +linvfs_get_block( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __linvfs_get_block(inode, iblock, 0, bh_result, + create, 0, BMAPI_WRITE); +} + +STATIC int +linvfs_get_blocks_direct( + struct inode *inode, + sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, + int create) +{ + return __linvfs_get_block(inode, iblock, max_blocks, bh_result, + create, 1, BMAPI_WRITE|BMAPI_DIRECT); +} + +STATIC ssize_t +linvfs_direct_IO( + int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + xfs_iomap_t iomap; + int maps = 1; + int error; + + VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error); + if (error) + return -error; + + return blockdev_direct_IO_own_locking(rw, iocb, inode, + iomap.iomap_target->pbr_bdev, + iov, offset, nr_segs, + linvfs_get_blocks_direct, + linvfs_unwritten_convert_direct); +} + + +STATIC sector_t +linvfs_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address); + + VOP_RWLOCK(vp, VRWLOCK_READ); + VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error); + VOP_RWUNLOCK(vp, VRWLOCK_READ); + return generic_block_bmap(mapping, block, linvfs_get_block); +} + +STATIC int +linvfs_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, linvfs_get_block); +} + +STATIC int +linvfs_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block); +} + +STATIC void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unmapped, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unmapped = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) && !buffer_mapped(bh)) + (*unmapped) = 1; + else if (buffer_unwritten(bh) && !buffer_delay(bh)) + clear_buffer_unwritten(bh); + else if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + + +/* + * writepage: Called from one of two places: + * + * 1. we are flushing a delalloc buffer head. + * + * 2. we are writing out a dirty page. Typically the page dirty + * state is cleared before we get here. In this case is it + * conceivable we have no buffer heads. + * + * For delalloc space on the page we need to allocate space and + * flush it. For unmapped buffer heads on the page we should + * allocate space if the page is uptodate. For any other dirty + * buffer heads on the page we should flush them. + * + * If we detect that a transaction would be required to flush + * the page, we have to check the process flags first, if we + * are already in a transaction or disk I/O during allocations + * is off, we need to fail the writepage and redirty the page. + */ + +STATIC int +linvfs_writepage( + struct page *page, + struct writeback_control *wbc) +{ + int error; + int need_trans; + int delalloc, unmapped, unwritten; + struct inode *inode = page->mapping->host; + + xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); + + /* + * We need a transaction if: + * 1. There are delalloc buffers on the page + * 2. The page is uptodate and we have unmapped buffers + * 3. The page is uptodate and we have no buffers + * 4. There are unwritten buffers on the page + */ + + if (!page_has_buffers(page)) { + unmapped = 1; + need_trans = 1; + } else { + xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); + if (!PageUptodate(page)) + unmapped = 0; + need_trans = delalloc + unmapped + unwritten; + } + + /* + * If we need a transaction and the process flags say + * we are already in a transaction, or no IO is allowed + * then mark the page dirty again and leave the page + * as is. + */ + if (PFLAGS_TEST_FSTRANS() && need_trans) + goto out_fail; + + /* + * Delay hooking up buffer heads until we have + * made our go/no-go decision. + */ + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * Convert delayed allocate, unwritten or unmapped space + * to real space and flush out to disk. + */ + error = xfs_page_state_convert(inode, page, wbc, 1, unmapped); + if (error == -EAGAIN) + goto out_fail; + if (unlikely(error < 0)) + goto out_unlock; + + return 0; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +out_unlock: + unlock_page(page); + return error; +} + +/* + * Called to move a page into cleanable state - and from there + * to be released. Possibly the page is already clean. We always + * have buffer heads in this call. + * + * Returns 0 if the page is ok to release, 1 otherwise. + * + * Possible scenarios are: + * + * 1. We are being called to release a page which has been written + * to via regular I/O. buffer heads will be dirty and possibly + * delalloc. If no delalloc buffer heads in this case then we + * can just return zero. + * + * 2. We are called to release a page which has been written via + * mmap, all we need to do is ensure there is no delalloc + * state in the buffer heads, if not we can let the caller + * free them and we should come back later via writepage. + */ +STATIC int +linvfs_release_page( + struct page *page, + int gfp_mask) +{ + struct inode *inode = page->mapping->host; + int dirty, delalloc, unmapped, unwritten; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask); + + xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); + if (!delalloc && !unwritten) + goto free_buffers; + + if (!(gfp_mask & __GFP_FS)) + return 0; + + /* If we are already inside a transaction or the thread cannot + * do I/O, we cannot release this page. + */ + if (PFLAGS_TEST_FSTRANS()) + return 0; + + /* + * Convert delalloc space to real space, do not flush the + * data out to disk, that will be done by the caller. + * Never need to allocate space here - we will always + * come back to writepage in that case. + */ + dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0); + if (dirty == 0 && !unwritten) + goto free_buffers; + return 0; + +free_buffers: + return try_to_free_buffers(page); +} + +STATIC int +linvfs_prepare_write( + struct file *file, + struct page *page, + unsigned int from, + unsigned int to) +{ + return block_prepare_write(page, from, to, linvfs_get_block); +} + +struct address_space_operations linvfs_aops = { + .readpage = linvfs_readpage, + .readpages = linvfs_readpages, + .writepage = linvfs_writepage, + .sync_page = block_sync_page, + .releasepage = linvfs_release_page, + .prepare_write = linvfs_prepare_write, + .commit_write = generic_commit_write, + .bmap = linvfs_bmap, + .direct_IO = linvfs_direct_IO, +}; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c new file mode 100644 index 00000000000..23e0eb67fc2 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -0,0 +1,1980 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * The xfs_buf.c code provides an abstract buffer cache model on top + * of the Linux page cache. Cached metadata blocks for a file system + * are hashed to the inode for the block device. xfs_buf.c assembles + * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O. + * + * Written by Steve Lord, Jim Mostek, Russell Cattelan + * and Rajagopal Ananthanarayanan ("ananth") at SGI. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfs_linux.h" + +/* + * File wide globals + */ + +STATIC kmem_cache_t *pagebuf_cache; +STATIC kmem_shaker_t pagebuf_shake; +STATIC int pagebuf_daemon_wakeup(int, unsigned int); +STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); +STATIC struct workqueue_struct *pagebuf_logio_workqueue; +STATIC struct workqueue_struct *pagebuf_dataio_workqueue; + +/* + * Pagebuf debugging + */ + +#ifdef PAGEBUF_TRACE +void +pagebuf_trace( + xfs_buf_t *pb, + char *id, + void *data, + void *ra) +{ + ktrace_enter(pagebuf_trace_buf, + pb, id, + (void *)(unsigned long)pb->pb_flags, + (void *)(unsigned long)pb->pb_hold.counter, + (void *)(unsigned long)pb->pb_sema.count.counter, + (void *)current, + data, ra, + (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), + (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), + (void *)(unsigned long)pb->pb_buffer_length, + NULL, NULL, NULL, NULL, NULL); +} +ktrace_t *pagebuf_trace_buf; +#define PAGEBUF_TRACE_SIZE 4096 +#define PB_TRACE(pb, id, data) \ + pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) +#else +#define PB_TRACE(pb, id, data) do { } while (0) +#endif + +#ifdef PAGEBUF_LOCK_TRACKING +# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) +# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) +# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) +#else +# define PB_SET_OWNER(pb) do { } while (0) +# define PB_CLEAR_OWNER(pb) do { } while (0) +# define PB_GET_OWNER(pb) do { } while (0) +#endif + +/* + * Pagebuf allocation / freeing. + */ + +#define pb_to_gfp(flags) \ + ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) + +#define pb_to_km(flags) \ + (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + + +#define pagebuf_allocate(flags) \ + kmem_zone_alloc(pagebuf_cache, pb_to_km(flags)) +#define pagebuf_deallocate(pb) \ + kmem_zone_free(pagebuf_cache, (pb)); + +/* + * Page Region interfaces. + * + * For pages in filesystems where the blocksize is smaller than the + * pagesize, we use the page->private field (long) to hold a bitmap + * of uptodate regions within the page. + * + * Each such region is "bytes per page / bits per long" bytes long. + * + * NBPPR == number-of-bytes-per-page-region + * BTOPR == bytes-to-page-region (rounded up) + * BTOPRT == bytes-to-page-region-truncated (rounded down) + */ +#if (BITS_PER_LONG == 32) +#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ +#elif (BITS_PER_LONG == 64) +#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ +#else +#error BITS_PER_LONG must be 32 or 64 +#endif +#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) +#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) +#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) + +STATIC unsigned long +page_region_mask( + size_t offset, + size_t length) +{ + unsigned long mask; + int first, final; + + first = BTOPR(offset); + final = BTOPRT(offset + length - 1); + first = min(first, final); + + mask = ~0UL; + mask <<= BITS_PER_LONG - (final - first); + mask >>= BITS_PER_LONG - (final); + + ASSERT(offset + length <= PAGE_CACHE_SIZE); + ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); + + return mask; +} + +STATIC inline void +set_page_region( + struct page *page, + size_t offset, + size_t length) +{ + page->private |= page_region_mask(offset, length); + if (page->private == ~0UL) + SetPageUptodate(page); +} + +STATIC inline int +test_page_region( + struct page *page, + size_t offset, + size_t length) +{ + unsigned long mask = page_region_mask(offset, length); + + return (mask && (page->private & mask) == mask); +} + +/* + * Mapping of multi-page buffers into contiguous virtual space + */ + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +STATIC a_list_t *as_free_head; +STATIC int as_list_len; +STATIC DEFINE_SPINLOCK(as_lock); + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + + aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + if (likely(aentry)) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + +/* + * Internal pagebuf object manipulation + */ + +STATIC void +_pagebuf_initialize( + xfs_buf_t *pb, + xfs_buftarg_t *target, + loff_t range_base, + size_t range_length, + page_buf_flags_t flags) +{ + /* + * We don't want certain flags to appear in pb->pb_flags. + */ + flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); + + memset(pb, 0, sizeof(xfs_buf_t)); + atomic_set(&pb->pb_hold, 1); + init_MUTEX_LOCKED(&pb->pb_iodonesema); + INIT_LIST_HEAD(&pb->pb_list); + INIT_LIST_HEAD(&pb->pb_hash_list); + init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ + PB_SET_OWNER(pb); + pb->pb_target = target; + pb->pb_file_offset = range_base; + /* + * Set buffer_length and count_desired to the same value initially. + * I/O routines should use count_desired, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + pb->pb_buffer_length = pb->pb_count_desired = range_length; + pb->pb_flags = flags | PBF_NONE; + pb->pb_bn = XFS_BUF_DADDR_NULL; + atomic_set(&pb->pb_pin_count, 0); + init_waitqueue_head(&pb->pb_waiters); + + XFS_STATS_INC(pb_create); + PB_TRACE(pb, "initialize", target); +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_pagebuf_get_pages( + xfs_buf_t *pb, + int page_count, + page_buf_flags_t flags) +{ + /* Make sure that we have a page list */ + if (pb->pb_pages == NULL) { + pb->pb_offset = page_buf_poff(pb->pb_file_offset); + pb->pb_page_count = page_count; + if (page_count <= PB_PAGES) { + pb->pb_pages = pb->pb_page_array; + } else { + pb->pb_pages = kmem_alloc(sizeof(struct page *) * + page_count, pb_to_km(flags)); + if (pb->pb_pages == NULL) + return -ENOMEM; + } + memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees pb_pages if it was malloced. + */ +STATIC void +_pagebuf_free_pages( + xfs_buf_t *bp) +{ + if (bp->pb_pages != bp->pb_page_array) { + kmem_free(bp->pb_pages, + bp->pb_page_count * sizeof(struct page *)); + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. + * The buffer most not be on any hash - use pagebuf_rele instead for + * hashed and refcounted buffers + */ +void +pagebuf_free( + xfs_buf_t *bp) +{ + PB_TRACE(bp, "free", 0); + + ASSERT(list_empty(&bp->pb_hash_list)); + + if (bp->pb_flags & _PBF_PAGE_CACHE) { + uint i; + + if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1)) + free_address(bp->pb_addr - bp->pb_offset); + + for (i = 0; i < bp->pb_page_count; i++) + page_cache_release(bp->pb_pages[i]); + _pagebuf_free_pages(bp); + } else if (bp->pb_flags & _PBF_KMEM_ALLOC) { + /* + * XXX(hch): bp->pb_count_desired might be incorrect (see + * pagebuf_associate_memory for details), but fortunately + * the Linux version of kmem_free ignores the len argument.. + */ + kmem_free(bp->pb_addr, bp->pb_count_desired); + _pagebuf_free_pages(bp); + } + + pagebuf_deallocate(bp); +} + +/* + * Finds all pages for buffer in question and builds it's page list. + */ +STATIC int +_pagebuf_lookup_pages( + xfs_buf_t *bp, + uint flags) +{ + struct address_space *mapping = bp->pb_target->pbr_mapping; + size_t blocksize = bp->pb_target->pbr_bsize; + size_t size = bp->pb_count_desired; + size_t nbytes, offset; + int gfp_mask = pb_to_gfp(flags); + unsigned short page_count, i; + pgoff_t first; + loff_t end; + int error; + + end = bp->pb_file_offset + bp->pb_buffer_length; + page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset); + + error = _pagebuf_get_pages(bp, page_count, flags); + if (unlikely(error)) + return error; + bp->pb_flags |= _PBF_PAGE_CACHE; + + offset = bp->pb_offset; + first = bp->pb_file_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bp->pb_page_count; i++) { + struct page *page; + uint retries = 0; + + retry: + page = find_or_create_page(mapping, first + i, gfp_mask); + if (unlikely(page == NULL)) { + if (flags & PBF_READ_AHEAD) { + bp->pb_page_count = i; + for (i = 0; i < bp->pb_page_count; i++) + unlock_page(bp->pb_pages[i]); + return -ENOMEM; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + printk(KERN_ERR + "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", + __FUNCTION__, gfp_mask); + + XFS_STATS_INC(pb_page_retries); + pagebuf_daemon_wakeup(0, gfp_mask); + blk_congestion_wait(WRITE, HZ/50); + goto retry; + } + + XFS_STATS_INC(pb_page_found); + + nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); + size -= nbytes; + + if (!PageUptodate(page)) { + page_count--; + if (blocksize >= PAGE_CACHE_SIZE) { + if (flags & PBF_READ) + bp->pb_locked = 1; + } else if (!PagePrivate(page)) { + if (test_page_region(page, offset, nbytes)) + page_count++; + } + } + + bp->pb_pages[i] = page; + offset = 0; + } + + if (!bp->pb_locked) { + for (i = 0; i < bp->pb_page_count; i++) + unlock_page(bp->pb_pages[i]); + } + + if (page_count) { + /* if we have any uptodate pages, mark that in the buffer */ + bp->pb_flags &= ~PBF_NONE; + + /* if some pages aren't uptodate, mark that in the buffer */ + if (page_count != bp->pb_page_count) + bp->pb_flags |= PBF_PARTIAL; + } + + PB_TRACE(bp, "lookup_pages", (long)page_count); + return error; +} + +/* + * Map buffer into kernel address-space if nessecary. + */ +STATIC int +_pagebuf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + /* A single page buffer is always mappable */ + if (bp->pb_page_count == 1) { + bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset; + bp->pb_flags |= PBF_MAPPED; + } else if (flags & PBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); + bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count, + VM_MAP, PAGE_KERNEL); + if (unlikely(bp->pb_addr == NULL)) + return -ENOMEM; + bp->pb_addr += bp->pb_offset; + bp->pb_flags |= PBF_MAPPED; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * _pagebuf_find + * + * Looks up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. If other overlapping buffers exist, they are + * released before the new buffer is created and locked, + * which may imply that this call will block until those buffers + * are unlocked. No I/O is implied by this call. + */ +xfs_buf_t * +_pagebuf_find( + xfs_buftarg_t *btp, /* block device target */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags, /* PBF_TRYLOCK */ + xfs_buf_t *new_pb)/* newly allocated buffer */ +{ + loff_t range_base; + size_t range_length; + xfs_bufhash_t *hash; + xfs_buf_t *pb, *n; + + range_base = (ioff << BBSHIFT); + range_length = (isize << BBSHIFT); + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(range_length < (1 << btp->pbr_sshift))); + ASSERT(!(range_base & (loff_t)btp->pbr_smask)); + + hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; + + spin_lock(&hash->bh_lock); + + list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) { + ASSERT(btp == pb->pb_target); + if (pb->pb_file_offset == range_base && + pb->pb_buffer_length == range_length) { + /* + * If we look at something bring it to the + * front of the list for next time. + */ + atomic_inc(&pb->pb_hold); + list_move(&pb->pb_hash_list, &hash->bh_list); + goto found; + } + } + + /* No match found */ + if (new_pb) { + _pagebuf_initialize(new_pb, btp, range_base, + range_length, flags); + new_pb->pb_hash = hash; + list_add(&new_pb->pb_hash_list, &hash->bh_list); + } else { + XFS_STATS_INC(pb_miss_locked); + } + + spin_unlock(&hash->bh_lock); + return new_pb; + +found: + spin_unlock(&hash->bh_lock); + + /* Attempt to get the semaphore without sleeping, + * if this does not work then we need to drop the + * spinlock and do a hard attempt on the semaphore. + */ + if (down_trylock(&pb->pb_sema)) { + if (!(flags & PBF_TRYLOCK)) { + /* wait for buffer ownership */ + PB_TRACE(pb, "get_lock", 0); + pagebuf_lock(pb); + XFS_STATS_INC(pb_get_locked_waited); + } else { + /* We asked for a trylock and failed, no need + * to look at file offset and length here, we + * know that this pagebuf at least overlaps our + * pagebuf and is locked, therefore our buffer + * either does not exist, or is this buffer + */ + + pagebuf_rele(pb); + XFS_STATS_INC(pb_busy_locked); + return (NULL); + } + } else { + /* trylock worked */ + PB_SET_OWNER(pb); + } + + if (pb->pb_flags & PBF_STALE) + pb->pb_flags &= PBF_MAPPED; + PB_TRACE(pb, "got_lock", 0); + XFS_STATS_INC(pb_get_locked); + return (pb); +} + +/* + * xfs_buf_get_flags assembles a buffer covering the specified range. + * + * Storage in memory for all portions of the buffer will be allocated, + * although backing storage may not be. + */ +xfs_buf_t * +xfs_buf_get_flags( /* allocate a buffer */ + xfs_buftarg_t *target,/* target for buffer */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags) /* PBF_TRYLOCK */ +{ + xfs_buf_t *pb, *new_pb; + int error = 0, i; + + new_pb = pagebuf_allocate(flags); + if (unlikely(!new_pb)) + return NULL; + + pb = _pagebuf_find(target, ioff, isize, flags, new_pb); + if (pb == new_pb) { + error = _pagebuf_lookup_pages(pb, flags); + if (error) + goto no_buffer; + } else { + pagebuf_deallocate(new_pb); + if (unlikely(pb == NULL)) + return NULL; + } + + for (i = 0; i < pb->pb_page_count; i++) + mark_page_accessed(pb->pb_pages[i]); + + if (!(pb->pb_flags & PBF_MAPPED)) { + error = _pagebuf_map_pages(pb, flags); + if (unlikely(error)) { + printk(KERN_WARNING "%s: failed to map pages\n", + __FUNCTION__); + goto no_buffer; + } + } + + XFS_STATS_INC(pb_get); + + /* + * Always fill in the block number now, the mapped cases can do + * their own overlay of this later. + */ + pb->pb_bn = ioff; + pb->pb_count_desired = pb->pb_buffer_length; + + PB_TRACE(pb, "get", (unsigned long)flags); + return pb; + + no_buffer: + if (flags & (PBF_LOCK | PBF_TRYLOCK)) + pagebuf_unlock(pb); + pagebuf_rele(pb); + return NULL; +} + +xfs_buf_t * +xfs_buf_read_flags( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + xfs_buf_t *pb; + + flags |= PBF_READ; + + pb = xfs_buf_get_flags(target, ioff, isize, flags); + if (pb) { + if (PBF_NOT_DONE(pb)) { + PB_TRACE(pb, "read", (unsigned long)flags); + XFS_STATS_INC(pb_get_read); + pagebuf_iostart(pb, flags); + } else if (flags & PBF_ASYNC) { + PB_TRACE(pb, "read_async", (unsigned long)flags); + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + goto no_buffer; + } else { + PB_TRACE(pb, "read_done", (unsigned long)flags); + /* We do not want read in the flags */ + pb->pb_flags &= ~PBF_READ; + } + } + + return pb; + + no_buffer: + if (flags & (PBF_LOCK | PBF_TRYLOCK)) + pagebuf_unlock(pb); + pagebuf_rele(pb); + return NULL; +} + +/* + * Create a skeletal pagebuf (no pages associated with it). + */ +xfs_buf_t * +pagebuf_lookup( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + xfs_buf_t *pb; + + pb = pagebuf_allocate(flags); + if (pb) { + _pagebuf_initialize(pb, target, ioff, isize, flags); + } + return pb; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +pagebuf_readahead( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + struct backing_dev_info *bdi; + + bdi = target->pbr_mapping->backing_dev_info; + if (bdi_read_congested(bdi)) + return; + + flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD); + xfs_buf_read_flags(target, ioff, isize, flags); +} + +xfs_buf_t * +pagebuf_get_empty( + size_t len, + xfs_buftarg_t *target) +{ + xfs_buf_t *pb; + + pb = pagebuf_allocate(0); + if (pb) + _pagebuf_initialize(pb, target, 0, len, 0); + return pb; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if (((unsigned long)addr < VMALLOC_START) || + ((unsigned long)addr >= VMALLOC_END)) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +pagebuf_associate_memory( + xfs_buf_t *pb, + void *mem, + size_t len) +{ + int rval; + int i = 0; + size_t ptr; + size_t end, end_cur; + off_t offset; + int page_count; + + page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); + if (offset && (len > PAGE_CACHE_SIZE)) + page_count++; + + /* Free any previous set of page pointers */ + if (pb->pb_pages) + _pagebuf_free_pages(pb); + + pb->pb_pages = NULL; + pb->pb_addr = mem; + + rval = _pagebuf_get_pages(pb, page_count, 0); + if (rval) + return rval; + + pb->pb_offset = offset; + ptr = (size_t) mem & PAGE_CACHE_MASK; + end = PAGE_CACHE_ALIGN((size_t) mem + len); + end_cur = end; + /* set up first page */ + pb->pb_pages[0] = mem_to_page(mem); + + ptr += PAGE_CACHE_SIZE; + pb->pb_page_count = ++i; + while (ptr < end) { + pb->pb_pages[i] = mem_to_page((void *)ptr); + pb->pb_page_count = ++i; + ptr += PAGE_CACHE_SIZE; + } + pb->pb_locked = 0; + + pb->pb_count_desired = pb->pb_buffer_length = len; + pb->pb_flags |= PBF_MAPPED; + + return 0; +} + +xfs_buf_t * +pagebuf_get_no_daddr( + size_t len, + xfs_buftarg_t *target) +{ + size_t malloc_len = len; + xfs_buf_t *bp; + void *data; + int error; + + bp = pagebuf_allocate(0); + if (unlikely(bp == NULL)) + goto fail; + _pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO); + + try_again: + data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); + if (unlikely(data == NULL)) + goto fail_free_buf; + + /* check whether alignment matches.. */ + if ((__psunsigned_t)data != + ((__psunsigned_t)data & ~target->pbr_smask)) { + /* .. else double the size and try again */ + kmem_free(data, malloc_len); + malloc_len <<= 1; + goto try_again; + } + + error = pagebuf_associate_memory(bp, data, len); + if (error) + goto fail_free_mem; + bp->pb_flags |= _PBF_KMEM_ALLOC; + + pagebuf_unlock(bp); + + PB_TRACE(bp, "no_daddr", data); + return bp; + fail_free_mem: + kmem_free(data, malloc_len); + fail_free_buf: + pagebuf_free(bp); + fail: + return NULL; +} + +/* + * pagebuf_hold + * + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * + * Must hold the buffer already to call this function. + */ +void +pagebuf_hold( + xfs_buf_t *pb) +{ + atomic_inc(&pb->pb_hold); + PB_TRACE(pb, "hold", 0); +} + +/* + * pagebuf_rele + * + * pagebuf_rele releases a hold on the specified buffer. If the + * the hold count is 1, pagebuf_rele calls pagebuf_free. + */ +void +pagebuf_rele( + xfs_buf_t *pb) +{ + xfs_bufhash_t *hash = pb->pb_hash; + + PB_TRACE(pb, "rele", pb->pb_relse); + + /* + * pagebuf_lookup buffers are not hashed, not delayed write, + * and don't have their own release routines. Special case. + */ + if (unlikely(!hash)) { + ASSERT(!pb->pb_relse); + if (atomic_dec_and_test(&pb->pb_hold)) + xfs_buf_free(pb); + return; + } + + if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { + int do_free = 1; + + if (pb->pb_relse) { + atomic_inc(&pb->pb_hold); + spin_unlock(&hash->bh_lock); + (*(pb->pb_relse)) (pb); + spin_lock(&hash->bh_lock); + do_free = 0; + } + + if (pb->pb_flags & PBF_DELWRI) { + pb->pb_flags |= PBF_ASYNC; + atomic_inc(&pb->pb_hold); + pagebuf_delwri_queue(pb, 0); + do_free = 0; + } else if (pb->pb_flags & PBF_FS_MANAGED) { + do_free = 0; + } + + if (do_free) { + list_del_init(&pb->pb_hash_list); + spin_unlock(&hash->bh_lock); + pagebuf_free(pb); + } else { + spin_unlock(&hash->bh_lock); + } + } +} + + +/* + * Mutual exclusion on buffers. Locking model: + * + * Buffers associated with inodes for which buffer locking + * is not enabled are not protected by semaphores, and are + * assumed to be exclusively owned by the caller. There is a + * spinlock in the buffer, used by the caller when concurrent + * access is possible. + */ + +/* + * pagebuf_cond_lock + * + * pagebuf_cond_lock locks a buffer object, if it is not already locked. + * Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +int +pagebuf_cond_lock( /* lock buffer, if not locked */ + /* returns -EBUSY if locked) */ + xfs_buf_t *pb) +{ + int locked; + + locked = down_trylock(&pb->pb_sema) == 0; + if (locked) { + PB_SET_OWNER(pb); + } + PB_TRACE(pb, "cond_lock", (long)locked); + return(locked ? 0 : -EBUSY); +} + +#if defined(DEBUG) || defined(XFS_BLI_TRACE) +/* + * pagebuf_lock_value + * + * Return lock value for a pagebuf + */ +int +pagebuf_lock_value( + xfs_buf_t *pb) +{ + return(atomic_read(&pb->pb_sema.count)); +} +#endif + +/* + * pagebuf_lock + * + * pagebuf_lock locks a buffer object. Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +int +pagebuf_lock( + xfs_buf_t *pb) +{ + PB_TRACE(pb, "lock", 0); + if (atomic_read(&pb->pb_io_remaining)) + blk_run_address_space(pb->pb_target->pbr_mapping); + down(&pb->pb_sema); + PB_SET_OWNER(pb); + PB_TRACE(pb, "locked", 0); + return 0; +} + +/* + * pagebuf_unlock + * + * pagebuf_unlock releases the lock on the buffer object created by + * pagebuf_lock or pagebuf_cond_lock (not any + * pinning of underlying pages created by pagebuf_pin). + */ +void +pagebuf_unlock( /* unlock buffer */ + xfs_buf_t *pb) /* buffer to unlock */ +{ + PB_CLEAR_OWNER(pb); + up(&pb->pb_sema); + PB_TRACE(pb, "unlock", 0); +} + + +/* + * Pinning Buffer Storage in Memory + */ + +/* + * pagebuf_pin + * + * pagebuf_pin locks all of the memory represented by a buffer in + * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for + * the same or different buffers affecting a given page, will + * properly count the number of outstanding "pin" requests. The + * buffer may be released after the pagebuf_pin and a different + * buffer used when calling pagebuf_unpin, if desired. + * pagebuf_pin should be used by the file system when it wants be + * assured that no attempt will be made to force the affected + * memory to disk. It does not assure that a given logical page + * will not be moved to a different physical page. + */ +void +pagebuf_pin( + xfs_buf_t *pb) +{ + atomic_inc(&pb->pb_pin_count); + PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter); +} + +/* + * pagebuf_unpin + * + * pagebuf_unpin reverses the locking of memory performed by + * pagebuf_pin. Note that both functions affected the logical + * pages associated with the buffer, not the buffer itself. + */ +void +pagebuf_unpin( + xfs_buf_t *pb) +{ + if (atomic_dec_and_test(&pb->pb_pin_count)) { + wake_up_all(&pb->pb_waiters); + } + PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter); +} + +int +pagebuf_ispin( + xfs_buf_t *pb) +{ + return atomic_read(&pb->pb_pin_count); +} + +/* + * pagebuf_wait_unpin + * + * pagebuf_wait_unpin waits until all of the memory associated + * with the buffer is not longer locked in memory. It returns + * immediately if none of the affected pages are locked. + */ +static inline void +_pagebuf_wait_unpin( + xfs_buf_t *pb) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&pb->pb_pin_count) == 0) + return; + + add_wait_queue(&pb->pb_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&pb->pb_pin_count) == 0) + break; + if (atomic_read(&pb->pb_io_remaining)) + blk_run_address_space(pb->pb_target->pbr_mapping); + schedule(); + } + remove_wait_queue(&pb->pb_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +/* + * pagebuf_iodone + * + * pagebuf_iodone marks a buffer for which I/O is in progress + * done with respect to that I/O. The pb_iodone routine, if + * present, will be called as a side-effect. + */ +STATIC void +pagebuf_iodone_work( + void *v) +{ + xfs_buf_t *bp = (xfs_buf_t *)v; + + if (bp->pb_iodone) + (*(bp->pb_iodone))(bp); + else if (bp->pb_flags & PBF_ASYNC) + xfs_buf_relse(bp); +} + +void +pagebuf_iodone( + xfs_buf_t *pb, + int dataio, + int schedule) +{ + pb->pb_flags &= ~(PBF_READ | PBF_WRITE); + if (pb->pb_error == 0) { + pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE); + } + + PB_TRACE(pb, "iodone", pb->pb_iodone); + + if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { + if (schedule) { + INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); + queue_work(dataio ? pagebuf_dataio_workqueue : + pagebuf_logio_workqueue, &pb->pb_iodone_work); + } else { + pagebuf_iodone_work(pb); + } + } else { + up(&pb->pb_iodonesema); + } +} + +/* + * pagebuf_ioerror + * + * pagebuf_ioerror sets the error code for a buffer. + */ +void +pagebuf_ioerror( /* mark/clear buffer error flag */ + xfs_buf_t *pb, /* buffer to mark */ + int error) /* error to store (0 if none) */ +{ + ASSERT(error >= 0 && error <= 0xffff); + pb->pb_error = (unsigned short)error; + PB_TRACE(pb, "ioerror", (unsigned long)error); +} + +/* + * pagebuf_iostart + * + * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied. + * If necessary, it will arrange for any disk space allocation required, + * and it will break up the request if the block mappings require it. + * The pb_iodone routine in the buffer supplied will only be called + * when all of the subsidiary I/O requests, if any, have been completed. + * pagebuf_iostart calls the pagebuf_ioinitiate routine or + * pagebuf_iorequest, if the former routine is not defined, to start + * the I/O on a given low-level request. + */ +int +pagebuf_iostart( /* start I/O on a buffer */ + xfs_buf_t *pb, /* buffer to start */ + page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ + /* PBF_WRITE, PBF_DELWRI, */ + /* PBF_DONT_BLOCK */ +{ + int status = 0; + + PB_TRACE(pb, "iostart", (unsigned long)flags); + + if (flags & PBF_DELWRI) { + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); + pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC); + pagebuf_delwri_queue(pb, 1); + return status; + } + + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \ + PBF_READ_AHEAD | _PBF_RUN_QUEUES); + pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ + PBF_READ_AHEAD | _PBF_RUN_QUEUES); + + BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL); + + /* For writes allow an alternate strategy routine to precede + * the actual I/O request (which may not be issued at all in + * a shutdown situation, for example). + */ + status = (flags & PBF_WRITE) ? + pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); + + /* Wait for I/O if we are not an async request. + * Note: async I/O request completion will release the buffer, + * and that can already be done by this point. So using the + * buffer pointer from here on, after async I/O, is invalid. + */ + if (!status && !(flags & PBF_ASYNC)) + status = pagebuf_iowait(pb); + + return status; +} + +/* + * Helper routine for pagebuf_iorequest + */ + +STATIC __inline__ int +_pagebuf_iolocked( + xfs_buf_t *pb) +{ + ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); + if (pb->pb_flags & PBF_READ) + return pb->pb_locked; + return 0; +} + +STATIC __inline__ void +_pagebuf_iodone( + xfs_buf_t *pb, + int schedule) +{ + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pb->pb_locked = 0; + pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule); + } +} + +STATIC int +bio_end_io_pagebuf( + struct bio *bio, + unsigned int bytes_done, + int error) +{ + xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private; + unsigned int i, blocksize = pb->pb_target->pbr_bsize; + struct bio_vec *bvec = bio->bi_io_vec; + + if (bio->bi_size) + return 1; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + pb->pb_error = EIO; + + for (i = 0; i < bio->bi_vcnt; i++, bvec++) { + struct page *page = bvec->bv_page; + + if (pb->pb_error) { + SetPageError(page); + } else if (blocksize == PAGE_CACHE_SIZE) { + SetPageUptodate(page); + } else if (!PagePrivate(page) && + (pb->pb_flags & _PBF_PAGE_CACHE)) { + set_page_region(page, bvec->bv_offset, bvec->bv_len); + } + + if (_pagebuf_iolocked(pb)) { + unlock_page(page); + } + } + + _pagebuf_iodone(pb, 1); + bio_put(bio); + return 0; +} + +STATIC void +_pagebuf_ioapply( + xfs_buf_t *pb) +{ + int i, rw, map_i, total_nr_pages, nr_pages; + struct bio *bio; + int offset = pb->pb_offset; + int size = pb->pb_count_desired; + sector_t sector = pb->pb_bn; + unsigned int blocksize = pb->pb_target->pbr_bsize; + int locking = _pagebuf_iolocked(pb); + + total_nr_pages = pb->pb_page_count; + map_i = 0; + + if (pb->pb_flags & _PBF_RUN_QUEUES) { + pb->pb_flags &= ~_PBF_RUN_QUEUES; + rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC; + } else { + rw = (pb->pb_flags & PBF_READ) ? READ : WRITE; + } + + /* Special code path for reading a sub page size pagebuf in -- + * we populate up the whole page, and hence the other metadata + * in the same page. This optimization is only valid when the + * filesystem block size and the page size are equal. + */ + if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && + (pb->pb_flags & PBF_READ) && locking && + (blocksize == PAGE_CACHE_SIZE)) { + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_sector = sector - (offset >> BBSHIFT); + bio->bi_end_io = bio_end_io_pagebuf; + bio->bi_private = pb; + + bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); + size = 0; + + atomic_inc(&pb->pb_io_remaining); + + goto submit_io; + } + + /* Lock down the pages which we need to for the request */ + if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { + for (i = 0; size; i++) { + int nbytes = PAGE_CACHE_SIZE - offset; + struct page *page = pb->pb_pages[i]; + + if (nbytes > size) + nbytes = size; + + lock_page(page); + + size -= nbytes; + offset = 0; + } + offset = pb->pb_offset; + size = pb->pb_count_desired; + } + +next_chunk: + atomic_inc(&pb->pb_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_sector = sector; + bio->bi_end_io = bio_end_io_pagebuf; + bio->bi_private = pb; + + for (; size && nr_pages; nr_pages--, map_i++) { + int nbytes = PAGE_CACHE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + if (bio_add_page(bio, pb->pb_pages[map_i], + nbytes, offset) < nbytes) + break; + + offset = 0; + sector += nbytes >> BBSHIFT; + size -= nbytes; + total_nr_pages--; + } + +submit_io: + if (likely(bio->bi_size)) { + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + bio_put(bio); + pagebuf_ioerror(pb, EIO); + } +} + +/* + * pagebuf_iorequest -- the core I/O request routine. + */ +int +pagebuf_iorequest( /* start real I/O */ + xfs_buf_t *pb) /* buffer to convey to device */ +{ + PB_TRACE(pb, "iorequest", 0); + + if (pb->pb_flags & PBF_DELWRI) { + pagebuf_delwri_queue(pb, 1); + return 0; + } + + if (pb->pb_flags & PBF_WRITE) { + _pagebuf_wait_unpin(pb); + } + + pagebuf_hold(pb); + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling pagebuf_iodone too early. + */ + atomic_set(&pb->pb_io_remaining, 1); + _pagebuf_ioapply(pb); + _pagebuf_iodone(pb, 0); + + pagebuf_rele(pb); + return 0; +} + +/* + * pagebuf_iowait + * + * pagebuf_iowait waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. In any case, it returns + * the error code, if any, or 0 if there is no error. + */ +int +pagebuf_iowait( + xfs_buf_t *pb) +{ + PB_TRACE(pb, "iowait", 0); + if (atomic_read(&pb->pb_io_remaining)) + blk_run_address_space(pb->pb_target->pbr_mapping); + down(&pb->pb_iodonesema); + PB_TRACE(pb, "iowaited", (long)pb->pb_error); + return pb->pb_error; +} + +caddr_t +pagebuf_offset( + xfs_buf_t *pb, + size_t offset) +{ + struct page *page; + + offset += pb->pb_offset; + + page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; + return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); +} + +/* + * pagebuf_iomove + * + * Move data into or out of a buffer. + */ +void +pagebuf_iomove( + xfs_buf_t *pb, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + caddr_t data, /* data address */ + page_buf_rw_t mode) /* read/write flag */ +{ + size_t bend, cpoff, csize; + struct page *page; + + bend = boff + bsize; + while (boff < bend) { + page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; + cpoff = page_buf_poff(boff + pb->pb_offset); + csize = min_t(size_t, + PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); + + ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); + + switch (mode) { + case PBRW_ZERO: + memset(page_address(page) + cpoff, 0, csize); + break; + case PBRW_READ: + memcpy(data, page_address(page) + cpoff, csize); + break; + case PBRW_WRITE: + memcpy(page_address(page) + cpoff, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buftargs. + */ + +/* + * Wait for any bufs with callbacks that have been submitted but + * have not yet returned... walk the hash list for the target. + */ +void +xfs_wait_buftarg( + xfs_buftarg_t *btp) +{ + xfs_buf_t *bp, *n; + xfs_bufhash_t *hash; + uint i; + + for (i = 0; i < (1 << btp->bt_hashshift); i++) { + hash = &btp->bt_hash[i]; +again: + spin_lock(&hash->bh_lock); + list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { + ASSERT(btp == bp->pb_target); + if (!(bp->pb_flags & PBF_FS_MANAGED)) { + spin_unlock(&hash->bh_lock); + delay(100); + goto again; + } + } + spin_unlock(&hash->bh_lock); + } +} + +/* + * Allocate buffer hash table for a given target. + * For devices containing metadata (i.e. not the log/realtime devices) + * we need to allocate a much larger hash table. + */ +STATIC void +xfs_alloc_bufhash( + xfs_buftarg_t *btp, + int external) +{ + unsigned int i; + + btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ + btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * + sizeof(xfs_bufhash_t), KM_SLEEP); + for (i = 0; i < (1 << btp->bt_hashshift); i++) { + spin_lock_init(&btp->bt_hash[i].bh_lock); + INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); + } +} + +STATIC void +xfs_free_bufhash( + xfs_buftarg_t *btp) +{ + kmem_free(btp->bt_hash, + (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + btp->bt_hash = NULL; +} + +void +xfs_free_buftarg( + xfs_buftarg_t *btp, + int external) +{ + xfs_flush_buftarg(btp, 1); + if (external) + xfs_blkdev_put(btp->pbr_bdev); + xfs_free_bufhash(btp); + iput(btp->pbr_mapping->host); + kmem_free(btp, sizeof(*btp)); +} + +void +xfs_incore_relse( + xfs_buftarg_t *btp, + int delwri_only, + int wait) +{ + invalidate_bdev(btp->pbr_bdev, 1); + truncate_inode_pages(btp->pbr_mapping, 0LL); +} + +STATIC int +xfs_setsize_buftarg_flags( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize, + int verbose) +{ + btp->pbr_bsize = blocksize; + btp->pbr_sshift = ffs(sectorsize) - 1; + btp->pbr_smask = sectorsize - 1; + + if (set_blocksize(btp->pbr_bdev, sectorsize)) { + printk(KERN_WARNING + "XFS: Cannot set_blocksize to %u on device %s\n", + sectorsize, XFS_BUFTARG_NAME(btp)); + return EINVAL; + } + + if (verbose && + (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { + printk(KERN_WARNING + "XFS: %u byte sectors in use on device %s. " + "This is suboptimal; %u or greater is ideal.\n", + sectorsize, XFS_BUFTARG_NAME(btp), + (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); + } + + return 0; +} + +/* +* When allocating the initial buffer target we have not yet +* read in the superblock, so don't know what sized sectors +* are being used is at this early stage. Play safe. +*/ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg_flags(btp, + PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize) +{ + return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); +} + +STATIC int +xfs_mapping_buftarg( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + struct backing_dev_info *bdi; + struct inode *inode; + struct address_space *mapping; + static struct address_space_operations mapping_aops = { + .sync_page = block_sync_page, + }; + + inode = new_inode(bdev->bd_inode->i_sb); + if (!inode) { + printk(KERN_WARNING + "XFS: Cannot allocate mapping inode for device %s\n", + XFS_BUFTARG_NAME(btp)); + return ENOMEM; + } + inode->i_mode = S_IFBLK; + inode->i_bdev = bdev; + inode->i_rdev = bdev->bd_dev; + bdi = blk_get_backing_dev_info(bdev); + if (!bdi) + bdi = &default_backing_dev_info; + mapping = &inode->i_data; + mapping->a_ops = &mapping_aops; + mapping->backing_dev_info = bdi; + mapping_set_gfp_mask(mapping, GFP_NOFS); + btp->pbr_mapping = mapping; + return 0; +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct block_device *bdev, + int external) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + + btp->pbr_dev = bdev->bd_dev; + btp->pbr_bdev = bdev; + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + if (xfs_mapping_buftarg(btp, bdev)) + goto error; + xfs_alloc_bufhash(btp, external); + return btp; + +error: + kmem_free(btp, sizeof(*btp)); + return NULL; +} + + +/* + * Pagebuf delayed write buffer handling + */ + +STATIC LIST_HEAD(pbd_delwrite_queue); +STATIC DEFINE_SPINLOCK(pbd_delwrite_lock); + +STATIC void +pagebuf_delwri_queue( + xfs_buf_t *pb, + int unlock) +{ + PB_TRACE(pb, "delwri_q", (long)unlock); + ASSERT(pb->pb_flags & PBF_DELWRI); + + spin_lock(&pbd_delwrite_lock); + /* If already in the queue, dequeue and place at tail */ + if (!list_empty(&pb->pb_list)) { + if (unlock) { + atomic_dec(&pb->pb_hold); + } + list_del(&pb->pb_list); + } + + list_add_tail(&pb->pb_list, &pbd_delwrite_queue); + pb->pb_queuetime = jiffies; + spin_unlock(&pbd_delwrite_lock); + + if (unlock) + pagebuf_unlock(pb); +} + +void +pagebuf_delwri_dequeue( + xfs_buf_t *pb) +{ + int dequeued = 0; + + spin_lock(&pbd_delwrite_lock); + if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { + list_del_init(&pb->pb_list); + dequeued = 1; + } + pb->pb_flags &= ~PBF_DELWRI; + spin_unlock(&pbd_delwrite_lock); + + if (dequeued) + pagebuf_rele(pb); + + PB_TRACE(pb, "delwri_dq", (long)dequeued); +} + +STATIC void +pagebuf_runall_queues( + struct workqueue_struct *queue) +{ + flush_workqueue(queue); +} + +/* Defines for pagebuf daemon */ +STATIC DECLARE_COMPLETION(pagebuf_daemon_done); +STATIC struct task_struct *pagebuf_daemon_task; +STATIC int pagebuf_daemon_active; +STATIC int force_flush; + + +STATIC int +pagebuf_daemon_wakeup( + int priority, + unsigned int mask) +{ + force_flush = 1; + barrier(); + wake_up_process(pagebuf_daemon_task); + return 0; +} + +STATIC int +pagebuf_daemon( + void *data) +{ + struct list_head tmp; + unsigned long age; + xfs_buftarg_t *target; + xfs_buf_t *pb, *n; + + /* Set up the thread */ + daemonize("xfsbufd"); + current->flags |= PF_MEMALLOC; + + pagebuf_daemon_task = current; + pagebuf_daemon_active = 1; + barrier(); + + INIT_LIST_HEAD(&tmp); + do { + try_to_freeze(PF_FREEZE); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100); + + age = (xfs_buf_age_centisecs * HZ) / 100; + spin_lock(&pbd_delwrite_lock); + list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { + PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); + ASSERT(pb->pb_flags & PBF_DELWRI); + + if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { + if (!force_flush && + time_before(jiffies, + pb->pb_queuetime + age)) { + pagebuf_unlock(pb); + break; + } + + pb->pb_flags &= ~PBF_DELWRI; + pb->pb_flags |= PBF_WRITE; + list_move(&pb->pb_list, &tmp); + } + } + spin_unlock(&pbd_delwrite_lock); + + while (!list_empty(&tmp)) { + pb = list_entry(tmp.next, xfs_buf_t, pb_list); + target = pb->pb_target; + + list_del_init(&pb->pb_list); + pagebuf_iostrategy(pb); + + blk_run_address_space(target->pbr_mapping); + } + + if (as_list_len > 0) + purge_addresses(); + + force_flush = 0; + } while (pagebuf_daemon_active); + + complete_and_exit(&pagebuf_daemon_done, 0); +} + +/* + * Go through all incore buffers, and release buffers if they belong to + * the given device. This is used in filesystem error handling to + * preserve the consistency of its metadata. + */ +int +xfs_flush_buftarg( + xfs_buftarg_t *target, + int wait) +{ + struct list_head tmp; + xfs_buf_t *pb, *n; + int pincount = 0; + + pagebuf_runall_queues(pagebuf_dataio_workqueue); + pagebuf_runall_queues(pagebuf_logio_workqueue); + + INIT_LIST_HEAD(&tmp); + spin_lock(&pbd_delwrite_lock); + list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { + + if (pb->pb_target != target) + continue; + + ASSERT(pb->pb_flags & PBF_DELWRI); + PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); + if (pagebuf_ispin(pb)) { + pincount++; + continue; + } + + pb->pb_flags &= ~PBF_DELWRI; + pb->pb_flags |= PBF_WRITE; + list_move(&pb->pb_list, &tmp); + } + spin_unlock(&pbd_delwrite_lock); + + /* + * Dropped the delayed write list lock, now walk the temporary list + */ + list_for_each_entry_safe(pb, n, &tmp, pb_list) { + if (wait) + pb->pb_flags &= ~PBF_ASYNC; + else + list_del_init(&pb->pb_list); + + pagebuf_lock(pb); + pagebuf_iostrategy(pb); + } + + /* + * Remaining list items must be flushed before returning + */ + while (!list_empty(&tmp)) { + pb = list_entry(tmp.next, xfs_buf_t, pb_list); + + list_del_init(&pb->pb_list); + xfs_iowait(pb); + xfs_buf_relse(pb); + } + + if (wait) + blk_run_address_space(target->pbr_mapping); + + return pincount; +} + +STATIC int +pagebuf_daemon_start(void) +{ + int rval; + + pagebuf_logio_workqueue = create_workqueue("xfslogd"); + if (!pagebuf_logio_workqueue) + return -ENOMEM; + + pagebuf_dataio_workqueue = create_workqueue("xfsdatad"); + if (!pagebuf_dataio_workqueue) { + destroy_workqueue(pagebuf_logio_workqueue); + return -ENOMEM; + } + + rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES); + if (rval < 0) { + destroy_workqueue(pagebuf_logio_workqueue); + destroy_workqueue(pagebuf_dataio_workqueue); + } + + return rval; +} + +/* + * pagebuf_daemon_stop + * + * Note: do not mark as __exit, it is called from pagebuf_terminate. + */ +STATIC void +pagebuf_daemon_stop(void) +{ + pagebuf_daemon_active = 0; + barrier(); + wait_for_completion(&pagebuf_daemon_done); + + destroy_workqueue(pagebuf_logio_workqueue); + destroy_workqueue(pagebuf_dataio_workqueue); +} + +/* + * Initialization and Termination + */ + +int __init +pagebuf_init(void) +{ + pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (pagebuf_cache == NULL) { + printk("XFS: couldn't init xfs_buf_t cache\n"); + pagebuf_terminate(); + return -ENOMEM; + } + +#ifdef PAGEBUF_TRACE + pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); +#endif + + pagebuf_daemon_start(); + + pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup); + if (pagebuf_shake == NULL) { + pagebuf_terminate(); + return -ENOMEM; + } + + return 0; +} + + +/* + * pagebuf_terminate. + * + * Note: do not mark as __exit, this is also called from the __init code. + */ +void +pagebuf_terminate(void) +{ + pagebuf_daemon_stop(); + +#ifdef PAGEBUF_TRACE + ktrace_free(pagebuf_trace_buf); +#endif + + kmem_zone_destroy(pagebuf_cache); + kmem_shake_deregister(pagebuf_shake); +} diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h new file mode 100644 index 00000000000..74deed8e6d9 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI + */ + +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) +#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum page_buf_rw_e { + PBRW_READ = 1, /* transfer into target memory */ + PBRW_WRITE = 2, /* transfer from target memory */ + PBRW_ZERO = 3 /* Zero target memory */ +} page_buf_rw_t; + + +typedef enum page_buf_flags_e { /* pb_flags values */ + PBF_READ = (1 << 0), /* buffer intended for reading from device */ + PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ + PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ + PBF_PARTIAL = (1 << 3), /* buffer partially read */ + PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ + PBF_NONE = (1 << 5), /* buffer not read at all */ + PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ + PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ + PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ + PBF_FS_DATAIOD = (1 << 9), /* schedule IO completion on fs datad */ + PBF_FORCEIO = (1 << 10), /* ignore any cache state */ + PBF_FLUSH = (1 << 11), /* flush disk write cache */ + PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ + + /* flags used only as arguments to access routines */ + PBF_LOCK = (1 << 14), /* lock requested */ + PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ + PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ + + /* flags used only internally */ + _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ + _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ + _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ +} page_buf_flags_t; + +#define PBF_UPDATE (PBF_READ | PBF_WRITE) +#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0) +#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0) + +typedef struct xfs_bufhash { + struct list_head bh_list; + spinlock_t bh_lock; +} xfs_bufhash_t; + +typedef struct xfs_buftarg { + dev_t pbr_dev; + struct block_device *pbr_bdev; + struct address_space *pbr_mapping; + unsigned int pbr_bsize; + unsigned int pbr_sshift; + size_t pbr_smask; + + /* per-device buffer hash table */ + uint bt_hashmask; + uint bt_hashshift; + xfs_bufhash_t *bt_hash; +} xfs_buftarg_t; + +/* + * xfs_buf_t: Buffer structure for page cache-based buffers + * + * This buffer structure is used by the page cache buffer management routines + * to refer to an assembly of pages forming a logical buffer. The actual I/O + * is performed with buffer_head structures, as required by drivers. + * + * The buffer structure is used on temporary basis only, and discarded when + * released. The real data storage is recorded in the page cache. Metadata is + * hashed to the block device on which the file system resides. + */ + +struct xfs_buf; + +/* call-back function on I/O completion */ +typedef void (*page_buf_iodone_t)(struct xfs_buf *); +/* call-back function on I/O completion */ +typedef void (*page_buf_relse_t)(struct xfs_buf *); +/* pre-write function */ +typedef int (*page_buf_bdstrat_t)(struct xfs_buf *); + +#define PB_PAGES 2 + +typedef struct xfs_buf { + struct semaphore pb_sema; /* semaphore for lockables */ + unsigned long pb_queuetime; /* time buffer was queued */ + atomic_t pb_pin_count; /* pin count */ + wait_queue_head_t pb_waiters; /* unpin waiters */ + struct list_head pb_list; + page_buf_flags_t pb_flags; /* status flags */ + struct list_head pb_hash_list; /* hash table list */ + xfs_bufhash_t *pb_hash; /* hash table list start */ + xfs_buftarg_t *pb_target; /* buffer target (device) */ + atomic_t pb_hold; /* reference count */ + xfs_daddr_t pb_bn; /* block number for I/O */ + loff_t pb_file_offset; /* offset in file */ + size_t pb_buffer_length; /* size of buffer in bytes */ + size_t pb_count_desired; /* desired transfer size */ + void *pb_addr; /* virtual address of buffer */ + struct work_struct pb_iodone_work; + atomic_t pb_io_remaining;/* #outstanding I/O requests */ + page_buf_iodone_t pb_iodone; /* I/O completion function */ + page_buf_relse_t pb_relse; /* releasing function */ + page_buf_bdstrat_t pb_strat; /* pre-write function */ + struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ + void *pb_fspriv; + void *pb_fspriv2; + void *pb_fspriv3; + unsigned short pb_error; /* error code on I/O */ + unsigned short pb_locked; /* page array is locked */ + unsigned int pb_page_count; /* size of page array */ + unsigned int pb_offset; /* page offset in first page */ + struct page **pb_pages; /* array of page pointers */ + struct page *pb_page_array[PB_PAGES]; /* inline pages */ +#ifdef PAGEBUF_LOCK_TRACKING + int pb_last_holder; +#endif +} xfs_buf_t; + + +/* Finding and Reading Buffers */ + +extern xfs_buf_t *_pagebuf_find( /* find buffer for block if */ + /* the block is in memory */ + xfs_buftarg_t *, /* inode for block */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t, /* PBF_LOCK */ + xfs_buf_t *); /* newly allocated buffer */ + +#define xfs_incore(buftarg,blkno,len,lockit) \ + _pagebuf_find(buftarg, blkno ,len, lockit, NULL) + +extern xfs_buf_t *xfs_buf_get_flags( /* allocate a buffer */ + xfs_buftarg_t *, /* inode for buffer */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK, PBF_READ, */ + /* PBF_ASYNC */ + +#define xfs_buf_get(target, blkno, len, flags) \ + xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) + +extern xfs_buf_t *xfs_buf_read_flags( /* allocate and read a buffer */ + xfs_buftarg_t *, /* inode for buffer */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC */ + +#define xfs_buf_read(target, blkno, len, flags) \ + xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) + +extern xfs_buf_t *pagebuf_lookup( + xfs_buftarg_t *, + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_READ, PBF_WRITE, */ + /* PBF_FORCEIO, */ + +extern xfs_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ + /* no memory or disk address */ + size_t len, + xfs_buftarg_t *); /* mount point "fake" inode */ + +extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ + /* without disk address */ + size_t len, + xfs_buftarg_t *); /* mount point "fake" inode */ + +extern int pagebuf_associate_memory( + xfs_buf_t *, + void *, + size_t); + +extern void pagebuf_hold( /* increment reference count */ + xfs_buf_t *); /* buffer to hold */ + +extern void pagebuf_readahead( /* read ahead into cache */ + xfs_buftarg_t *, /* target for buffer (or NULL) */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* additional read flags */ + +/* Releasing Buffers */ + +extern void pagebuf_free( /* deallocate a buffer */ + xfs_buf_t *); /* buffer to deallocate */ + +extern void pagebuf_rele( /* release hold on a buffer */ + xfs_buf_t *); /* buffer to release */ + +/* Locking and Unlocking Buffers */ + +extern int pagebuf_cond_lock( /* lock buffer, if not locked */ + /* (returns -EBUSY if locked) */ + xfs_buf_t *); /* buffer to lock */ + +extern int pagebuf_lock_value( /* return count on lock */ + xfs_buf_t *); /* buffer to check */ + +extern int pagebuf_lock( /* lock buffer */ + xfs_buf_t *); /* buffer to lock */ + +extern void pagebuf_unlock( /* unlock buffer */ + xfs_buf_t *); /* buffer to unlock */ + +/* Buffer Read and Write Routines */ + +extern void pagebuf_iodone( /* mark buffer I/O complete */ + xfs_buf_t *, /* buffer to mark */ + int, /* use data/log helper thread. */ + int); /* run completion locally, or in + * a helper thread. */ + +extern void pagebuf_ioerror( /* mark buffer in error (or not) */ + xfs_buf_t *, /* buffer to mark */ + int); /* error to store (0 if none) */ + +extern int pagebuf_iostart( /* start I/O on a buffer */ + xfs_buf_t *, /* buffer to start */ + page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */ + /* PBF_READ, PBF_WRITE, */ + /* PBF_DELWRI */ + +extern int pagebuf_iorequest( /* start real I/O */ + xfs_buf_t *); /* buffer to convey to device */ + +extern int pagebuf_iowait( /* wait for buffer I/O done */ + xfs_buf_t *); /* buffer to wait on */ + +extern void pagebuf_iomove( /* move data in/out of pagebuf */ + xfs_buf_t *, /* buffer to manipulate */ + size_t, /* starting buffer offset */ + size_t, /* length in buffer */ + caddr_t, /* data pointer */ + page_buf_rw_t); /* direction */ + +static inline int pagebuf_iostrategy(xfs_buf_t *pb) +{ + return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb); +} + +static inline int pagebuf_geterror(xfs_buf_t *pb) +{ + return pb ? pb->pb_error : ENOMEM; +} + +/* Buffer Utility Routines */ + +extern caddr_t pagebuf_offset( /* pointer at offset in buffer */ + xfs_buf_t *, /* buffer to offset into */ + size_t); /* offset */ + +/* Pinning Buffer Storage in Memory */ + +extern void pagebuf_pin( /* pin buffer in memory */ + xfs_buf_t *); /* buffer to pin */ + +extern void pagebuf_unpin( /* unpin buffered data */ + xfs_buf_t *); /* buffer to unpin */ + +extern int pagebuf_ispin( /* check if buffer is pinned */ + xfs_buf_t *); /* buffer to check */ + +/* Delayed Write Buffer Routines */ + +extern void pagebuf_delwri_dequeue(xfs_buf_t *); + +/* Buffer Daemon Setup Routines */ + +extern int pagebuf_init(void); +extern void pagebuf_terminate(void); + + +#ifdef PAGEBUF_TRACE +extern ktrace_t *pagebuf_trace_buf; +extern void pagebuf_trace( + xfs_buf_t *, /* buffer being traced */ + char *, /* description of operation */ + void *, /* arbitrary diagnostic value */ + void *); /* return address */ +#else +# define pagebuf_trace(pb, id, ptr, ra) do { } while (0) +#endif + +#define pagebuf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; }) + + + + + +/* These are just for xfs_syncsub... it sets an internal variable + * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t + */ +#define XFS_B_ASYNC PBF_ASYNC +#define XFS_B_DELWRI PBF_DELWRI +#define XFS_B_READ PBF_READ +#define XFS_B_WRITE PBF_WRITE +#define XFS_B_STALE PBF_STALE + +#define XFS_BUF_TRYLOCK PBF_TRYLOCK +#define XFS_INCORE_TRYLOCK PBF_TRYLOCK +#define XFS_BUF_LOCK PBF_LOCK +#define XFS_BUF_MAPPED PBF_MAPPED + +#define BUF_BUSY PBF_DONT_BLOCK + +#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) +#define XFS_BUF_ZEROFLAGS(x) \ + ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI)) + +#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) +#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) +#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) +#define XFS_BUF_SUPER_STALE(x) do { \ + XFS_BUF_STALE(x); \ + pagebuf_delwri_dequeue(x); \ + XFS_BUF_DONE(x); \ + } while (0) + +#define XFS_BUF_MANAGE PBF_FS_MANAGED +#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED) + +#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(x) pagebuf_delwri_dequeue(x) +#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) + +#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) +#define XFS_BUF_GETERROR(x) pagebuf_geterror(x) +#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) + +#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE)) +#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE) +#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x))) + +#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO) +#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO) +#define XFS_BUF_ISBUSY(x) (1) + +#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) +#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) +#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) + +#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH) +#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH) +#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH) + +#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") +#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") +#define XFS_BUF_ISSHUT(x) (0) + +#define XFS_BUF_HOLD(x) pagebuf_hold(x) +#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) +#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ) +#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) + +#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) +#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) +#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) + +#define XFS_BUF_ISUNINITIAL(x) (0) +#define XFS_BUF_UNUNINITIAL(x) (0) + +#define XFS_BUF_BP_ISMAPPED(bp) 1 + +#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD) +#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD) + +#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone +#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ + (buf)->pb_iodone = (func) +#define XFS_BUF_CLR_IODONE_FUNC(buf) \ + (buf)->pb_iodone = NULL +#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ + (buf)->pb_strat = (func) +#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ + (buf)->pb_strat = NULL + +#define XFS_BUF_FSPRIVATE(buf, type) \ + ((type)(buf)->pb_fspriv) +#define XFS_BUF_SET_FSPRIVATE(buf, value) \ + (buf)->pb_fspriv = (void *)(value) +#define XFS_BUF_FSPRIVATE2(buf, type) \ + ((type)(buf)->pb_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ + (buf)->pb_fspriv2 = (void *)(value) +#define XFS_BUF_FSPRIVATE3(buf, type) \ + ((type)(buf)->pb_fspriv3) +#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ + (buf)->pb_fspriv3 = (void *)(value) +#define XFS_BUF_SET_START(buf) + +#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ + (buf)->pb_relse = (value) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) + +extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset) +{ + if (bp->pb_flags & PBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; + return (xfs_caddr_t) pagebuf_offset(bp, offset); +} + +#define XFS_BUF_SET_PTR(bp, val, count) \ + pagebuf_associate_memory(bp, val, count) +#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) +#define XFS_BUF_SET_ADDR(bp, blk) \ + ((bp)->pb_bn = (xfs_daddr_t)(blk)) +#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) \ + ((bp)->pb_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) \ + ((bp)->pb_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) \ + ((bp)->pb_buffer_length = (cnt)) +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) + +#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) +#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); + +/* setup the buffer target from a buftarg structure */ +#define XFS_BUF_SET_TARGET(bp, target) \ + (bp)->pb_target = (target) +#define XFS_BUF_TARGET(bp) ((bp)->pb_target) +#define XFS_BUFTARG_NAME(target) \ + pagebuf_target_name(target) + +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) +{ + bp->pb_fspriv3 = mp; + bp->pb_strat = xfs_bdstrat_cb; + pagebuf_delwri_dequeue(bp); + return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | _PBF_RUN_QUEUES); +} + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + if (!bp->pb_relse) + pagebuf_unlock(bp); + pagebuf_rele(bp); +} + +#define xfs_bpin(bp) pagebuf_pin(bp) +#define xfs_bunpin(bp) pagebuf_unpin(bp) + +#define xfs_buftrace(id, bp) \ + pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) + +#define xfs_biodone(pb) \ + pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0) + +#define xfs_biomove(pb, off, len, data, rw) \ + pagebuf_iomove((pb), (off), (len), (data), \ + ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ) + +#define xfs_biozero(pb, off, len) \ + pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) + + +static inline int XFS_bwrite(xfs_buf_t *pb) +{ + int iowait = (pb->pb_flags & PBF_ASYNC) == 0; + int error = 0; + + if (!iowait) + pb->pb_flags |= _PBF_RUN_QUEUES; + + pagebuf_delwri_dequeue(pb); + pagebuf_iostrategy(pb); + if (iowait) { + error = pagebuf_iowait(pb); + xfs_buf_relse(pb); + } + return error; +} + +#define XFS_bdwrite(pb) \ + pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) + +static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) +{ + bp->pb_strat = xfs_bdstrat_cb; + bp->pb_fspriv3 = mp; + + return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); +} + +#define XFS_bdstrat(bp) pagebuf_iorequest(bp) + +#define xfs_iowait(pb) pagebuf_iowait(pb) + +#define xfs_baread(target, rablkno, ralen) \ + pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK) + +#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target)) +#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target)) +#define xfs_buf_free(bp) pagebuf_free(bp) + + +/* + * Handling of buftargs. + */ + +extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); +extern void xfs_free_buftarg(xfs_buftarg_t *, int); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern void xfs_incore_relse(xfs_buftarg_t *, int, int); +extern int xfs_flush_buftarg(xfs_buftarg_t *, int); + +#define xfs_getsize_buftarg(buftarg) \ + block_size((buftarg)->pbr_bdev) +#define xfs_readonly_buftarg(buftarg) \ + bdev_read_only((buftarg)->pbr_bdev) +#define xfs_binval(buftarg) \ + xfs_flush_buftarg(buftarg, 1) +#define XFS_bflush(buftarg) \ + xfs_flush_buftarg(buftarg, 1) + +#endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h new file mode 100644 index 00000000000..00c45849d41 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_CRED_H__ +#define __XFS_CRED_H__ + +/* + * Credentials + */ +typedef struct cred { + /* EMPTY */ +} cred_t; + +extern struct cred *sys_cred; + +/* this is a hack.. (assums sys_cred is the only cred_t in the system) */ +static __inline int capable_cred(cred_t *cr, int cid) +{ + return (cr == sys_cred) ? 1 : capable(cid); +} + +#endif /* __XFS_CRED_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c new file mode 100644 index 00000000000..f372a1a5e16 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_dmapi.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_mount.h" +#include "xfs_export.h" + +/* + * XFS encode and decodes the fileid portion of NFS filehandles + * itself instead of letting the generic NFS code do it. This + * allows filesystems with 64 bit inode numbers to be exported. + * + * Note that a side effect is that xfs_vget() won't be passed a + * zero inode/generation pair under normal circumstances. As + * however a malicious client could send us such data, the check + * remains in that code. + */ + + +STATIC struct dentry * +linvfs_decode_fh( + struct super_block *sb, + __u32 *fh, + int fh_len, + int fileid_type, + int (*acceptable)( + void *context, + struct dentry *de), + void *context) +{ + xfs_fid2_t ifid; + xfs_fid2_t pfid; + void *parent = NULL; + int is64 = 0; + __u32 *p = fh; + +#if XFS_BIG_INUMS + is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG); + fileid_type &= ~XFS_FILEID_TYPE_64FLAG; +#endif + + /* + * Note that we only accept fileids which are long enough + * rather than allow the parent generation number to default + * to zero. XFS considers zero a valid generation number not + * an invalid/wildcard value. There's little point printk'ing + * a warning here as we don't have the client information + * which would make such a warning useful. + */ + if (fileid_type > 2 || + fh_len < xfs_fileid_length((fileid_type == 2), is64)) + return NULL; + + p = xfs_fileid_decode_fid2(p, &ifid, is64); + + if (fileid_type == 2) { + p = xfs_fileid_decode_fid2(p, &pfid, is64); + parent = &pfid; + } + + fh = (__u32 *)&ifid; + return find_exported_dentry(sb, fh, parent, acceptable, context); +} + + +STATIC int +linvfs_encode_fh( + struct dentry *dentry, + __u32 *fh, + int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + int type = 1; + __u32 *p = fh; + int len; + int is64 = 0; +#if XFS_BIG_INUMS + vfs_t *vfs = LINVFS_GET_VFS(inode->i_sb); + xfs_mount_t *mp = XFS_VFSTOM(vfs); + + if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT)) { + /* filesystem may contain 64bit inode numbers */ + is64 = XFS_FILEID_TYPE_64FLAG; + } +#endif + + /* Directories don't need their parent encoded, they have ".." */ + if (S_ISDIR(inode->i_mode)) + connectable = 0; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(connectable, is64); + if (*max_len < len) + return 255; + *max_len = len; + + p = xfs_fileid_encode_inode(p, inode, is64); + if (connectable) { + spin_lock(&dentry->d_lock); + p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64); + spin_unlock(&dentry->d_lock); + type = 2; + } + BUG_ON((p - fh) != len); + return type | is64; +} + +STATIC struct dentry * +linvfs_get_dentry( + struct super_block *sb, + void *data) +{ + vnode_t *vp; + struct inode *inode; + struct dentry *result; + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_VGET(vfsp, &vp, (fid_t *)data, error); + if (error || vp == NULL) + return ERR_PTR(-ESTALE) ; + + inode = LINVFS_GET_IP(vp); + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; +} + +STATIC struct dentry * +linvfs_get_parent( + struct dentry *child) +{ + int error; + vnode_t *vp, *cvp; + struct dentry *parent; + struct dentry dotdot; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + dotdot.d_inode = NULL; + + cvp = NULL; + vp = LINVFS_GET_VP(child->d_inode); + VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error); + if (unlikely(error)) + return ERR_PTR(-error); + + parent = d_alloc_anon(LINVFS_GET_IP(cvp)); + if (unlikely(!parent)) { + VN_RELE(cvp); + return ERR_PTR(-ENOMEM); + } + return parent; +} + +struct export_operations linvfs_export_ops = { + .decode_fh = linvfs_decode_fh, + .encode_fh = linvfs_encode_fh, + .get_parent = linvfs_get_parent, + .get_dentry = linvfs_get_dentry, +}; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h new file mode 100644 index 00000000000..60b2abac1c1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_EXPORT_H__ +#define __XFS_EXPORT_H__ + +/* + * Common defines for code related to exporting XFS filesystems over NFS. + * + * The NFS fileid goes out on the wire as an array of + * 32bit unsigned ints in host order. There are 5 possible + * formats. + * + * (1) fileid_type=0x00 + * (no fileid data; handled by the generic code) + * + * (2) fileid_type=0x01 + * inode-num + * generation + * + * (3) fileid_type=0x02 + * inode-num + * generation + * parent-inode-num + * parent-generation + * + * (4) fileid_type=0x81 + * inode-num-lo32 + * inode-num-hi32 + * generation + * + * (5) fileid_type=0x82 + * inode-num-lo32 + * inode-num-hi32 + * generation + * parent-inode-num-lo32 + * parent-inode-num-hi32 + * parent-generation + * + * Note, the NFS filehandle also includes an fsid portion which + * may have an inode number in it. That number is hardcoded to + * 32bits and there is no way for XFS to intercept it. In + * practice this means when exporting an XFS filesytem with 64bit + * inodes you should either export the mountpoint (rather than + * a subdirectory) or use the "fsid" export option. + */ + +/* This flag goes on the wire. Don't play with it. */ +#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ + +/* Calculate the length in u32 units of the fileid data */ +static inline int +xfs_fileid_length(int hasparent, int is64) +{ + return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2); +} + +/* + * Decode encoded inode information (either for the inode itself + * or the parent) into an xfs_fid2_t structure. Advances and + * returns the new data pointer + */ +static inline __u32 * +xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64) +{ + fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len); + fid->fid_pad = 0; + fid->fid_ino = *p++; +#if XFS_BIG_INUMS + if (is64) + fid->fid_ino |= (((__u64)(*p++)) << 32); +#endif + fid->fid_gen = *p++; + return p; +} + +/* + * Encode inode information (either for the inode itself or the + * parent) into a fileid buffer. Advances and returns the new + * data pointer. + */ +static inline __u32 * +xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64) +{ + *p++ = (__u32)inode->i_ino; +#if XFS_BIG_INUMS + if (is64) + *p++ = (__u32)(inode->i_ino >> 32); +#endif + *p++ = inode->i_generation; + return p; +} + +#endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c new file mode 100644 index 00000000000..9f057a4a5b0 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -0,0 +1,573 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_trans.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_ioctl32.h" + +#include +#include + +static struct vm_operations_struct linvfs_file_vm_ops; + + +STATIC inline ssize_t +__linvfs_read( + struct kiocb *iocb, + char __user *buf, + int ioflags, + size_t count, + loff_t pos) +{ + struct iovec iov = {buf, count}; + struct file *file = iocb->ki_filp; + vnode_t *vp = LINVFS_GET_VP(file->f_dentry->d_inode); + ssize_t rval; + + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval); + return rval; +} + + +STATIC ssize_t +linvfs_aio_read( + struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_read(iocb, buf, IO_ISAIO, count, pos); +} + +STATIC ssize_t +linvfs_aio_read_invis( + struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); +} + + +STATIC inline ssize_t +__linvfs_write( + struct kiocb *iocb, + const char __user *buf, + int ioflags, + size_t count, + loff_t pos) +{ + struct iovec iov = {(void __user *)buf, count}; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + ssize_t rval; + + BUG_ON(iocb->ki_pos != pos); + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + + VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval); + return rval; +} + + +STATIC ssize_t +linvfs_aio_write( + struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_write(iocb, buf, IO_ISAIO, count, pos); +} + +STATIC ssize_t +linvfs_aio_write_invis( + struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); +} + + +STATIC inline ssize_t +__linvfs_readv( + struct file *file, + const struct iovec *iov, + int ioflags, + unsigned long nr_segs, + loff_t *ppos) +{ + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + struct kiocb kiocb; + ssize_t rval; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval); + + *ppos = kiocb.ki_pos; + return rval; +} + +STATIC ssize_t +linvfs_readv( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_readv(file, iov, 0, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_readv_invis( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos); +} + + +STATIC inline ssize_t +__linvfs_writev( + struct file *file, + const struct iovec *iov, + int ioflags, + unsigned long nr_segs, + loff_t *ppos) +{ + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + struct kiocb kiocb; + ssize_t rval; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + + VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval); + + *ppos = kiocb.ki_pos; + return rval; +} + + +STATIC ssize_t +linvfs_writev( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_writev(file, iov, 0, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_writev_invis( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_sendfile( + struct file *filp, + loff_t *ppos, + size_t count, + read_actor_t actor, + void *target) +{ + vnode_t *vp = LINVFS_GET_VP(filp->f_dentry->d_inode); + ssize_t rval; + + VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval); + return rval; +} + + +STATIC int +linvfs_open( + struct inode *inode, + struct file *filp) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EFBIG; + + ASSERT(vp); + VOP_OPEN(vp, NULL, error); + return -error; +} + + +STATIC int +linvfs_release( + struct inode *inode, + struct file *filp) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0; + + if (vp) + VOP_RELEASE(vp, error); + return -error; +} + + +STATIC int +linvfs_fsync( + struct file *filp, + struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + int flags = FSYNC_WAIT; + + if (datasync) + flags |= FSYNC_DATA; + + ASSERT(vp); + VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error); + return -error; +} + +/* + * linvfs_readdir maps to VOP_READDIR(). + * We need to build a uio, cred, ... + */ + +#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen)) + +STATIC int +linvfs_readdir( + struct file *filp, + void *dirent, + filldir_t filldir) +{ + int error = 0; + vnode_t *vp; + uio_t uio; + iovec_t iov; + int eof = 0; + caddr_t read_buf; + int namelen, size = 0; + size_t rlen = PAGE_CACHE_SIZE; + xfs_off_t start_offset, curr_offset; + xfs_dirent_t *dbp = NULL; + + vp = LINVFS_GET_VP(filp->f_dentry->d_inode); + ASSERT(vp); + + /* Try fairly hard to get memory */ + do { + if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) + break; + rlen >>= 1; + } while (rlen >= 1024); + + if (read_buf == NULL) + return -ENOMEM; + + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + curr_offset = filp->f_pos; + if (filp->f_pos != 0x7fffffff) + uio.uio_offset = filp->f_pos; + else + uio.uio_offset = 0xffffffff; + + while (!eof) { + uio.uio_resid = iov.iov_len = rlen; + iov.iov_base = read_buf; + uio.uio_iovcnt = 1; + + start_offset = uio.uio_offset; + + VOP_READDIR(vp, &uio, NULL, &eof, error); + if ((uio.uio_offset == start_offset) || error) { + size = 0; + break; + } + + size = rlen - uio.uio_resid; + dbp = (xfs_dirent_t *)read_buf; + while (size > 0) { + namelen = strlen(dbp->d_name); + + if (filldir(dirent, dbp->d_name, namelen, + (loff_t) curr_offset & 0x7fffffff, + (ino_t) dbp->d_ino, + DT_UNKNOWN)) { + goto done; + } + size -= dbp->d_reclen; + curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; + dbp = nextdp(dbp); + } + } +done: + if (!error) { + if (size == 0) + filp->f_pos = uio.uio_offset & 0x7fffffff; + else if (dbp) + filp->f_pos = curr_offset; + } + + kfree(read_buf); + return -error; +} + + +STATIC int +linvfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + struct inode *ip = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(ip); + vattr_t va = { .va_mask = XFS_AT_UPDATIME }; + int error; + + if (vp->v_vfsp->vfs_flag & VFS_DMI) { + xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); + + error = -XFS_SEND_MMAP(mp, vma, 0); + if (error) + return error; + } + + vma->vm_ops = &linvfs_file_vm_ops; + + VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error); + if (!error) + vn_revalidate(vp); /* update Linux inode flags */ + return 0; +} + + +STATIC long +linvfs_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + struct inode *inode = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error); + VMODIFY(vp); + + /* NOTE: some of the ioctl's return positive #'s as a + * byte count indicating success, such as + * readlink_by_handle. So we don't "sign flip" + * like most other routines. This means true + * errors need to be returned as a negative value. + */ + return error; +} + +STATIC long +linvfs_ioctl_invis( + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + struct inode *inode = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + ASSERT(vp); + VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error); + VMODIFY(vp); + + /* NOTE: some of the ioctl's return positive #'s as a + * byte count indicating success, such as + * readlink_by_handle. So we don't "sign flip" + * like most other routines. This means true + * errors need to be returned as a negative value. + */ + return error; +} + +#ifdef HAVE_VMOP_MPROTECT +STATIC int +linvfs_mprotect( + struct vm_area_struct *vma, + unsigned int newflags) +{ + vnode_t *vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode); + int error = 0; + + if (vp->v_vfsp->vfs_flag & VFS_DMI) { + if ((vma->vm_flags & VM_MAYSHARE) && + (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) { + xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); + + error = XFS_SEND_MMAP(mp, vma, VM_WRITE); + } + } + return error; +} +#endif /* HAVE_VMOP_MPROTECT */ + +#ifdef HAVE_FOP_OPEN_EXEC +/* If the user is attempting to execute a file that is offline then + * we have to trigger a DMAPI READ event before the file is marked as busy + * otherwise the invisible I/O will not be able to write to the file to bring + * it back online. + */ +STATIC int +linvfs_open_exec( + struct inode *inode) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); + int error = 0; + bhv_desc_t *bdp; + xfs_inode_t *ip; + + if (vp->v_vfsp->vfs_flag & VFS_DMI) { + bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); + if (!bdp) { + error = -EINVAL; + goto open_exec_out; + } + ip = XFS_BHVTOI(bdp); + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) { + error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, + 0, 0, 0, NULL); + } + } +open_exec_out: + return error; +} +#endif /* HAVE_FOP_OPEN_EXEC */ + +struct file_operations linvfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .readv = linvfs_readv, + .writev = linvfs_writev, + .aio_read = linvfs_aio_read, + .aio_write = linvfs_aio_write, + .sendfile = linvfs_sendfile, + .unlocked_ioctl = linvfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_compat_ioctl, +#endif + .mmap = linvfs_file_mmap, + .open = linvfs_open, + .release = linvfs_release, + .fsync = linvfs_fsync, +#ifdef HAVE_FOP_OPEN_EXEC + .open_exec = linvfs_open_exec, +#endif +}; + +struct file_operations linvfs_invis_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .readv = linvfs_readv_invis, + .writev = linvfs_writev_invis, + .aio_read = linvfs_aio_read_invis, + .aio_write = linvfs_aio_write_invis, + .sendfile = linvfs_sendfile, + .unlocked_ioctl = linvfs_ioctl_invis, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_compat_invis_ioctl, +#endif + .mmap = linvfs_file_mmap, + .open = linvfs_open, + .release = linvfs_release, + .fsync = linvfs_fsync, +}; + + +struct file_operations linvfs_dir_operations = { + .read = generic_read_dir, + .readdir = linvfs_readdir, + .unlocked_ioctl = linvfs_ioctl, + .fsync = linvfs_fsync, +}; + +static struct vm_operations_struct linvfs_file_vm_ops = { + .nopage = filemap_nopage, + .populate = filemap_populate, +#ifdef HAVE_VMOP_MPROTECT + .mprotect = linvfs_mprotect, +#endif +}; diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c new file mode 100644 index 00000000000..05ebd30ec96 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +/* + * Stub for no-op vnode operations that return error status. + */ +int +fs_noerr(void) +{ + return 0; +} + +/* + * Operation unsupported under this file system. + */ +int +fs_nosys(void) +{ + return ENOSYS; +} + +/* + * Stub for inactive, strategy, and read/write lock/unlock. Does nothing. + */ +/* ARGSUSED */ +void +fs_noval(void) +{ +} + +/* + * vnode pcache layer for vnode_tosspages. + * 'last' parameter unused but left in for IRIX compatibility + */ +void +fs_tosspages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) + truncate_inode_pages(ip->i_mapping, first); +} + + +/* + * vnode pcache layer for vnode_flushinval_pages. + * 'last' parameter unused but left in for IRIX compatibility + */ +void +fs_flushinval_pages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) { + filemap_fdatawrite(ip->i_mapping); + filemap_fdatawait(ip->i_mapping); + + truncate_inode_pages(ip->i_mapping, first); + } +} + +/* + * vnode pcache layer for vnode_flush_pages. + * 'last' parameter unused but left in for IRIX compatibility + */ +int +fs_flush_pages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + uint64_t flags, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) { + filemap_fdatawrite(ip->i_mapping); + filemap_fdatawait(ip->i_mapping); + } + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h new file mode 100644 index 00000000000..2db9ddbd456 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_fs_subr.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUBR_H__ +#define __XFS_SUBR_H__ + +/* + * Utilities shared among file system implementations. + */ + +struct cred; + +extern int fs_noerr(void); +extern int fs_nosys(void); +extern void fs_noval(void); +extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int); + +#endif /* __XFS_FS_SUBR_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c new file mode 100644 index 00000000000..a6da5b4fd24 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains globals needed by XFS that were normally defined + * somewhere else in IRIX. + */ + +#include "xfs.h" +#include "xfs_cred.h" +#include "xfs_sysctl.h" + +/* + * System memory size - used to scale certain data structures in XFS. + */ +unsigned long xfs_physmem; + +/* + * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, + * other XFS code uses these values. Times are measured in centisecs (i.e. + * 100ths of a second). + */ +xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .restrict_chown = { 0, 1, 1 }, + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, + .panic_mask = { 0, 0, 127 }, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, + .inherit_sync = { 0, 1, 1 }, + .inherit_nodump = { 0, 1, 1 }, + .inherit_noatim = { 0, 1, 1 }, + .xfs_buf_timer = { 100/2, 1*100, 30*100 }, + .xfs_buf_age = { 1*100, 15*100, 7200*100}, + .inherit_nosym = { 0, 0, 1 }, + .rotorstep = { 1, 1, 255 }, +}; + +/* + * Global system credential structure. + */ +cred_t sys_cred_val, *sys_cred = &sys_cred_val; + diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h new file mode 100644 index 00000000000..e81e2f38a85 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_globals.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_GLOBALS_H__ +#define __XFS_GLOBALS_H__ + +/* + * This file declares globals needed by XFS that were normally defined + * somewhere else in IRIX. + */ + +extern uint64_t xfs_panic_mask; /* set to cause more panics */ +extern unsigned long xfs_physmem; +extern struct cred *sys_cred; + +#endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c new file mode 100644 index 00000000000..69809eef8a5 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -0,0 +1,1336 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_dfrag.h" +#include "xfs_fsops.h" + +#include +#include +#include +#include + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +STATIC int +xfs_find_handle( + unsigned int cmd, + void __user *arg) +{ + int hsize; + xfs_handle_t handle; + xfs_fsop_handlereq_t hreq; + struct inode *inode; + struct vnode *vp; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -XFS_ERROR(EFAULT); + + memset((char *)&handle, 0, sizeof(handle)); + + switch (cmd) { + case XFS_IOC_PATH_TO_FSHANDLE: + case XFS_IOC_PATH_TO_HANDLE: { + struct nameidata nd; + int error; + + error = user_path_walk_link((const char __user *)hreq.path, &nd); + if (error) + return error; + + ASSERT(nd.dentry); + ASSERT(nd.dentry->d_inode); + inode = igrab(nd.dentry->d_inode); + path_release(&nd); + break; + } + + case XFS_IOC_FD_TO_HANDLE: { + struct file *file; + + file = fget(hreq.fd); + if (!file) + return -EBADF; + + ASSERT(file->f_dentry); + ASSERT(file->f_dentry->d_inode); + inode = igrab(file->f_dentry->d_inode); + fput(file); + break; + } + + default: + ASSERT(0); + return -XFS_ERROR(EINVAL); + } + + if (inode->i_sb->s_magic != XFS_SB_MAGIC) { + /* we're not in XFS anymore, Toto */ + iput(inode); + return -XFS_ERROR(EINVAL); + } + + /* we need the vnode */ + vp = LINVFS_GET_VP(inode); + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { + iput(inode); + return -XFS_ERROR(EBADF); + } + + /* now we can grab the fsid */ + memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t)); + hsize = sizeof(xfs_fsid_t); + + if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { + xfs_inode_t *ip; + bhv_desc_t *bhv; + int lock_mode; + + /* need to get access to the xfs_inode to read the generation */ + bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); + ASSERT(bhv); + ip = XFS_BHVTOI(bhv); + ASSERT(ip); + lock_mode = xfs_ilock_map_shared(ip); + + /* fill in fid section of handle from inode */ + handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.xfs_fid_len); + handle.ha_fid.xfs_fid_pad = 0; + handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen; + handle.ha_fid.xfs_fid_ino = ip->i_ino; + + xfs_iunlock_map_shared(ip, lock_mode); + + hsize = XFS_HSIZE(handle); + } + + /* now copy our handle into the user buffer & write out the size */ + if (copy_to_user(hreq.ohandle, &handle, hsize) || + copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { + iput(inode); + return -XFS_ERROR(EFAULT); + } + + iput(inode); + return 0; +} + + +/* + * Convert userspace handle data into vnode (and inode). + * We [ab]use the fact that all the fsop_handlereq ioctl calls + * have a data structure argument whose first component is always + * a xfs_fsop_handlereq_t, so we can cast to and from this type. + * This allows us to optimise the copy_from_user calls and gives + * a handy, shared routine. + * + * If no error, caller must always VN_RELE the returned vp. + */ +STATIC int +xfs_vget_fsop_handlereq( + xfs_mount_t *mp, + struct inode *parinode, /* parent inode pointer */ + xfs_fsop_handlereq_t *hreq, + vnode_t **vp, + struct inode **inode) +{ + void __user *hanp; + size_t hlen; + xfs_fid_t *xfid; + xfs_handle_t *handlep; + xfs_handle_t handle; + xfs_inode_t *ip; + struct inode *inodep; + vnode_t *vpp; + xfs_ino_t ino; + __u32 igen; + int error; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(parinode->i_mode)) + return XFS_ERROR(ENOTDIR); + + hanp = hreq->ihandle; + hlen = hreq->ihandlen; + handlep = &handle; + + if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) + return XFS_ERROR(EINVAL); + if (copy_from_user(handlep, hanp, hlen)) + return XFS_ERROR(EFAULT); + if (hlen < sizeof(*handlep)) + memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); + if (hlen > sizeof(handlep->ha_fsid)) { + if (handlep->ha_fid.xfs_fid_len != + (hlen - sizeof(handlep->ha_fsid) + - sizeof(handlep->ha_fid.xfs_fid_len)) + || handlep->ha_fid.xfs_fid_pad) + return XFS_ERROR(EINVAL); + } + + /* + * Crack the handle, obtain the inode # & generation # + */ + xfid = (struct xfs_fid *)&handlep->ha_fid; + if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) { + ino = xfid->xfs_fid_ino; + igen = xfid->xfs_fid_gen; + } else { + return XFS_ERROR(EINVAL); + } + + /* + * Get the XFS inode, building a vnode to go with it. + */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); + if (error) + return error; + if (ip == NULL) + return XFS_ERROR(EIO); + if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + xfs_iput_new(ip, XFS_ILOCK_SHARED); + return XFS_ERROR(ENOENT); + } + + vpp = XFS_ITOV(ip); + inodep = LINVFS_GET_IP(vpp); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + *vp = vpp; + *inode = inodep; + return 0; +} + +STATIC int +xfs_open_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + int new_fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + vnode_t *vp; + xfs_fsop_handlereq_t hreq; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); + if (error) + return -error; + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + iput(inode); + return -XFS_ERROR(EINVAL); + } + +#if BITS_PER_LONG != 32 + hreq.oflags |= O_LARGEFILE; +#endif + /* Put open permission in namei format. */ + permflag = hreq.oflags; + if ((permflag+1) & O_ACCMODE) + permflag++; + if (permflag & O_TRUNC) + permflag |= 2; + + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (permflag & FMODE_WRITE) && IS_APPEND(inode)) { + iput(inode); + return -XFS_ERROR(EPERM); + } + + if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + iput(inode); + return -XFS_ERROR(EACCES); + } + + /* Can't write directories. */ + if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + iput(inode); + return -XFS_ERROR(EISDIR); + } + + if ((new_fd = get_unused_fd()) < 0) { + iput(inode); + return new_fd; + } + + dentry = d_alloc_anon(inode); + if (dentry == NULL) { + iput(inode); + put_unused_fd(new_fd); + return -XFS_ERROR(ENOMEM); + } + + /* Ensure umount returns EBUSY on umounts while this file is open. */ + mntget(parfilp->f_vfsmnt); + + /* Create file pointer. */ + filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags); + if (IS_ERR(filp)) { + put_unused_fd(new_fd); + return -XFS_ERROR(-PTR_ERR(filp)); + } + if (inode->i_mode & S_IFREG) + filp->f_op = &linvfs_invis_file_operations; + + fd_install(new_fd, filp); + return new_fd; +} + +STATIC int +xfs_readlink_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + struct iovec aiov; + struct uio auio; + struct inode *inode; + xfs_fsop_handlereq_t hreq; + vnode_t *vp; + __u32 olen; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); + if (error) + return -error; + + /* Restrict this handle operation to symlinks only. */ + if (vp->v_type != VLNK) { + VN_RELE(vp); + return -XFS_ERROR(EINVAL); + } + + if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + aiov.iov_len = olen; + aiov.iov_base = hreq.ohandle; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_resid = olen; + + VOP_READLINK(vp, &auio, IO_INVIS, NULL, error); + + VN_RELE(vp); + return (olen - auio.uio_resid); +} + +STATIC int +xfs_fssetdm_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct inode *inode; + bhv_desc_t *bdp; + vnode_t *vp; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode); + if (error) + return -error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + VN_RELE(vp); + return -XFS_ERROR(EPERM); + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + + bdp = bhv_base_unlocked(VN_BHV_HEAD(vp)); + error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL); + + VN_RELE(vp); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_attrlist_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct inode *inode; + vnode_t *vp; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, + &vp, &inode); + if (error) + goto out; + + kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_vn_rele; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags, + cursor, NULL, error); + if (error) + goto out_kfree; + + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_vn_rele: + VN_RELE(vp); + out: + return -error; +} + +STATIC int +xfs_attrmulti_attr_get( + struct vnode *vp, + char *name, + char __user *ubuf, + __uint32_t *len, + __uint32_t flags) +{ + char *kbuf; + int error = EFAULT; + + if (*len > XATTR_SIZE_MAX) + return EINVAL; + kbuf = kmalloc(*len, GFP_KERNEL); + if (!kbuf) + return ENOMEM; + + VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error); + if (error) + goto out_kfree; + + if (copy_to_user(ubuf, kbuf, *len)) + error = EFAULT; + + out_kfree: + kfree(kbuf); + return error; +} + +STATIC int +xfs_attrmulti_attr_set( + struct vnode *vp, + char *name, + const char __user *ubuf, + __uint32_t len, + __uint32_t flags) +{ + char *kbuf; + int error = EFAULT; + + if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) + return EPERM; + if (len > XATTR_SIZE_MAX) + return EINVAL; + + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) + return ENOMEM; + + if (copy_from_user(kbuf, ubuf, len)) + goto out_kfree; + + VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error); + + out_kfree: + kfree(kbuf); + return error; +} + +STATIC int +xfs_attrmulti_attr_remove( + struct vnode *vp, + char *name, + __uint32_t flags) +{ + int error; + + if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) + return EPERM; + + VOP_ATTR_REMOVE(vp, name, flags, NULL, error); + return error; +} + +STATIC int +xfs_attrmulti_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct inode *inode; + vnode_t *vp; + unsigned int i, size; + char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode); + if (error) + goto out; + + error = E2BIG; + size = am_hreq.opcount * sizeof(attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_vn_rele; + + error = ENOMEM; + ops = kmalloc(size, GFP_KERNEL); + if (!ops) + goto out_vn_rele; + + error = EFAULT; + if (copy_from_user(ops, am_hreq.ops, size)) + goto out_kfree_ops; + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user(attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get(vp, + attr_name, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = xfs_attrmulti_attr_set(vp, + attr_name, ops[i].am_attrvalue, + ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = xfs_attrmulti_attr_remove(vp, + attr_name, ops[i].am_flags); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_vn_rele: + VN_RELE(vp); + out: + return -error; +} + +/* prototypes for a few of the stack-hungry cases that have + * their own functions. Functions are defined after their use + * so gcc doesn't get fancy and inline them with -03 */ + +STATIC int +xfs_ioc_space( + bhv_desc_t *bdp, + vnode_t *vp, + struct file *filp, + int flags, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg); + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg); + +STATIC int +xfs_ioc_xattr( + vnode_t *vp, + xfs_inode_t *ip, + struct file *filp, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_getbmap( + bhv_desc_t *bdp, + struct file *filp, + int flags, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_getbmapx( + bhv_desc_t *bdp, + void __user *arg); + +int +xfs_ioctl( + bhv_desc_t *bdp, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + int error; + vnode_t *vp; + xfs_inode_t *ip; + xfs_mount_t *mp; + + vp = LINVFS_GET_VP(inode); + + vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + switch (cmd) { + + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); + + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = 1 << target->pbr_sshift; + /* The size dio will do in one go */ + da.d_maxiosz = 64 * PAGE_CACHE_SIZE; + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + case XFS_IOC_GETXFLAGS: + case XFS_IOC_SETXFLAGS: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_xattr(vp, ip, filp, cmd, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate, + NULL); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(bdp, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + return xfs_find_handle(cmd, arg); + + case XFS_IOC_OPEN_BY_HANDLE: + return xfs_open_by_handle(mp, arg, filp, inode); + + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(mp, arg, filp, inode); + + case XFS_IOC_READLINK_BY_HANDLE: + return xfs_readlink_by_handle(mp, arg, filp, inode); + + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(mp, arg, filp, inode); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(mp, arg, filp, inode); + + case XFS_IOC_SWAPEXT: { + error = xfs_swapext((struct xfs_swapext __user *)arg); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + return -error; + } + + case XFS_IOC_FREEZE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (inode->i_sb->s_frozen == SB_UNFROZEN) + freeze_bdev(inode->i_sb->s_bdev); + return 0; + + case XFS_IOC_THAW: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (inode->i_sb->s_frozen != SB_UNFROZEN) + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); + return 0; + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp); + return -error; + + default: + return -ENOTTY; + } +} + +STATIC int +xfs_ioc_space( + bhv_desc_t *bdp, + vnode_t *vp, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + xfs_flock64_t bf; + int attr_flags = 0; + int error; + + if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) + return -XFS_ERROR(EPERM); + + if (!(filp->f_flags & FMODE_WRITE)) + return -XFS_ERROR(EBADF); + + if (vp->v_type != VREG) + return -XFS_ERROR(EINVAL); + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + if (ioflags & IO_INVIS) + attr_flags |= ATTR_DMI; + + error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos, + NULL, attr_flags); + return -error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -XFS_ERROR(EFAULT); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + else { /* XFS_IOC_FSBULKSTAT */ + if (count == 1 && inlast != 0) { + inlast++; + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + } else { + error = xfs_bulkstat(mp, &inlast, &count, + (bulkstat_one_pf)xfs_bulkstat_one, NULL, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); + } + } + + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_v1_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* + * Linux extended inode flags interface. + */ +#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */ +#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */ +#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */ +#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */ +#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & LINUX_XFLAG_IMMUTABLE) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & LINUX_XFLAG_APPEND) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & LINUX_XFLAG_SYNC) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & LINUX_XFLAG_NOATIME) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & LINUX_XFLAG_NODUMP) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= LINUX_XFLAG_IMMUTABLE; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= LINUX_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= LINUX_XFLAG_SYNC; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= LINUX_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= LINUX_XFLAG_NODUMP; + return flags; +} + +STATIC int +xfs_ioc_xattr( + vnode_t *vp, + xfs_inode_t *ip, + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct fsxattr fa; + vattr_t va; + int error; + int attr_flags; + unsigned int flags; + + switch (cmd) { + case XFS_IOC_FSGETXATTR: { + va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return -error; + + fa.fsx_xflags = va.va_xflags; + fa.fsx_extsize = va.va_extsize; + fa.fsx_nextents = va.va_nextents; + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSSETXATTR: { + if (copy_from_user(&fa, arg, sizeof(fa))) + return -XFS_ERROR(EFAULT); + + attr_flags = 0; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + + va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE; + va.va_xflags = fa.fsx_xflags; + va.va_extsize = fa.fsx_extsize; + + VOP_SETATTR(vp, &va, attr_flags, NULL, error); + if (!error) + vn_revalidate(vp); /* update Linux inode flags */ + return -error; + } + + case XFS_IOC_FSGETXATTRA: { + va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_ANEXTENTS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return -error; + + fa.fsx_xflags = va.va_xflags; + fa.fsx_extsize = va.va_extsize; + fa.fsx_nextents = va.va_anextents; + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GETXFLAGS: { + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SETXFLAGS: { + if (copy_from_user(&flags, arg, sizeof(flags))) + return -XFS_ERROR(EFAULT); + + if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \ + LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \ + LINUX_XFLAG_SYNC)) + return -XFS_ERROR(EOPNOTSUPP); + + attr_flags = 0; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + + va.va_mask = XFS_AT_XFLAGS; + va.va_xflags = xfs_merge_ioc_xflags(flags, + xfs_ip2xflags(ip)); + + VOP_SETATTR(vp, &va, attr_flags, NULL, error); + if (!error) + vn_revalidate(vp); /* update Linux inode flags */ + return -error; + } + + case XFS_IOC_GETVERSION: { + flags = LINVFS_GET_IP(vp)->i_generation; + if (copy_to_user(arg, &flags, sizeof(flags))) + return -XFS_ERROR(EFAULT); + return 0; + } + + default: + return -ENOTTY; + } +} + +STATIC int +xfs_ioc_getbmap( + bhv_desc_t *bdp, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmap bm; + int iflags; + int error; + + if (copy_from_user(&bm, arg, sizeof(bm))) + return -XFS_ERROR(EFAULT); + + if (bm.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & IO_INVIS) + iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags); + if (error) + return -error; + + if (copy_to_user(arg, &bm, sizeof(bm))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + bhv_desc_t *bdp, + void __user *arg) +{ + struct getbmapx bmx; + struct getbmap bm; + int iflags; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + /* + * Map input getbmapx structure to a getbmap + * structure for xfs_getbmap. + */ + GETBMAP_CONVERT(bmx, bm); + + iflags = bmx.bmv_iflags; + + if (iflags & (~BMV_IF_VALID)) + return -XFS_ERROR(EINVAL); + + iflags |= BMV_IF_EXTENDED; + + error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags); + if (error) + return -error; + + GETBMAP_CONVERT(bm, bmx); + + if (copy_to_user(arg, &bmx, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c new file mode 100644 index 00000000000..7a12c83184f --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_fs.h" +#include "xfs_vfs.h" +#include "xfs_vnode.h" +#include "xfs_dfrag.h" + +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#else + +typedef struct xfs_fsop_bulkreq32 { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + __s32 ocount; /* output count pointer */ +} xfs_fsop_bulkreq32_t; + +static unsigned long +xfs_ioctl32_bulkstat(unsigned long arg) +{ + xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg; + xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p)); + u32 addr; + + if (get_user(addr, &p32->lastip) || + put_user(compat_ptr(addr), &p->lastip) || + copy_in_user(&p->icount, &p32->icount, sizeof(s32)) || + get_user(addr, &p32->ubuffer) || + put_user(compat_ptr(addr), &p->ubuffer) || + get_user(addr, &p32->ocount) || + put_user(compat_ptr(addr), &p->ocount)) + return -EFAULT; + + return (unsigned long)p; +} +#endif + +static long +__xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg) +{ + int error; + struct inode *inode = f->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + switch (cmd) { + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_GETVERSION: + case XFS_IOC_GETXFLAGS: + case XFS_IOC_SETXFLAGS: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: +/* not handled + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + case XFS_IOC_OPEN_BY_HANDLE: + case XFS_IOC_FSSETDM_BY_HANDLE: + case XFS_IOC_READLINK_BY_HANDLE: + case XFS_IOC_ATTRLIST_BY_HANDLE: + case XFS_IOC_ATTRMULTI_BY_HANDLE: +*/ + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_FREEZE: + case XFS_IOC_THAW: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + break; + +#ifndef BROKEN_X86_ALIGNMENT + /* xfs_flock_t and xfs_bstat_t have wrong u32 vs u64 alignment */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_SWAPEXT: + break; + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + arg = xfs_ioctl32_bulkstat(arg); + break; +#endif + default: + return -ENOIOCTLCMD; + } + + VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error); + VMODIFY(vp); + + return error; +} + +long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg) +{ + return __xfs_compat_ioctl(0, f, cmd, arg); +} + +long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg) +{ + return __xfs_compat_ioctl(IO_INVIS, f, cmd, arg); +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h new file mode 100644 index 00000000000..779f69a4811 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg); +long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c new file mode 100644 index 00000000000..407e9935939 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" + +#include +#include + + +/* + * Pull the link count and size up from the xfs inode to the linux inode + */ +STATIC void +validate_fields( + struct inode *ip) +{ + vnode_t *vp = LINVFS_GET_VP(ip); + vattr_t va; + int error; + + va.va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS; + VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error); + if (likely(!error)) { + ip->i_nlink = va.va_nlink; + ip->i_blocks = va.va_nblocks; + + /* we're under i_sem so i_size can't change under us */ + if (i_size_read(ip) != va.va_size) + i_size_write(ip, va.va_size); + } +} + +/* + * Determine whether a process has a valid fs_struct (kernel daemons + * like knfsd don't have an fs_struct). + * + * XXX(hch): nfsd is broken, better fix it instead. + */ +STATIC inline int +has_fs_struct(struct task_struct *task) +{ + return (task->fs != init_task.fs); +} + +STATIC int +linvfs_mknod( + struct inode *dir, + struct dentry *dentry, + int mode, + dev_t rdev) +{ + struct inode *ip; + vattr_t va; + vnode_t *vp = NULL, *dvp = LINVFS_GET_VP(dir); + xfs_acl_t *default_acl = NULL; + attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. + */ + if (!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff) + return -EINVAL; + + if (test_default_acl && test_default_acl(dvp)) { + if (!_ACL_ALLOC(default_acl)) + return -ENOMEM; + if (!_ACL_GET_DEFAULT(dvp, default_acl)) { + _ACL_FREE(default_acl); + default_acl = NULL; + } + } + + if (IS_POSIXACL(dir) && !default_acl && has_fs_struct(current)) + mode &= ~current->fs->umask; + + memset(&va, 0, sizeof(va)); + va.va_mask = XFS_AT_TYPE|XFS_AT_MODE; + va.va_type = IFTOVT(mode); + va.va_mode = mode; + + switch (mode & S_IFMT) { + case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + va.va_rdev = sysv_encode_dev(rdev); + va.va_mask |= XFS_AT_RDEV; + /*FALLTHROUGH*/ + case S_IFREG: + VOP_CREATE(dvp, dentry, &va, &vp, NULL, error); + break; + case S_IFDIR: + VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error); + break; + default: + error = EINVAL; + break; + } + + if (default_acl) { + if (!error) { + error = _ACL_INHERIT(vp, &va, default_acl); + if (!error) { + VMODIFY(vp); + } else { + struct dentry teardown = {}; + int err2; + + /* Oh, the horror. + * If we can't add the ACL we must back out. + * ENOSPC can hit here, among other things. + */ + teardown.d_inode = ip = LINVFS_GET_IP(vp); + teardown.d_name = dentry->d_name; + + vn_mark_bad(vp); + + if (S_ISDIR(mode)) + VOP_RMDIR(dvp, &teardown, NULL, err2); + else + VOP_REMOVE(dvp, &teardown, NULL, err2); + VN_RELE(vp); + } + } + _ACL_FREE(default_acl); + } + + if (!error) { + ASSERT(vp); + ip = LINVFS_GET_IP(vp); + + if (S_ISCHR(mode) || S_ISBLK(mode)) + ip->i_rdev = rdev; + else if (S_ISDIR(mode)) + validate_fields(ip); + d_instantiate(dentry, ip); + validate_fields(dir); + } + return -error; +} + +STATIC int +linvfs_create( + struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + return linvfs_mknod(dir, dentry, mode, 0); +} + +STATIC int +linvfs_mkdir( + struct inode *dir, + struct dentry *dentry, + int mode) +{ + return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +linvfs_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct vnode *vp = LINVFS_GET_VP(dir), *cvp; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error); + if (error) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(LINVFS_GET_IP(cvp), dentry); +} + +STATIC int +linvfs_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *ip; /* inode of guy being linked to */ + vnode_t *tdvp; /* target directory for new name/link */ + vnode_t *vp; /* vp of name being linked */ + int error; + + ip = old_dentry->d_inode; /* inode being linked to */ + if (S_ISDIR(ip->i_mode)) + return -EPERM; + + tdvp = LINVFS_GET_VP(dir); + vp = LINVFS_GET_VP(ip); + + VOP_LINK(tdvp, vp, dentry, NULL, error); + if (!error) { + VMODIFY(tdvp); + VN_HOLD(vp); + validate_fields(ip); + d_instantiate(dentry, ip); + } + return -error; +} + +STATIC int +linvfs_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode; + vnode_t *dvp; /* directory containing name to remove */ + int error; + + inode = dentry->d_inode; + dvp = LINVFS_GET_VP(dir); + + VOP_REMOVE(dvp, dentry, NULL, error); + if (!error) { + validate_fields(dir); /* For size only */ + validate_fields(inode); + } + + return -error; +} + +STATIC int +linvfs_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *ip; + vattr_t va; + vnode_t *dvp; /* directory containing name of symlink */ + vnode_t *cvp; /* used to lookup symlink to put in dentry */ + int error; + + dvp = LINVFS_GET_VP(dir); + cvp = NULL; + + memset(&va, 0, sizeof(va)); + va.va_type = VLNK; + va.va_mode = irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO; + va.va_mask = XFS_AT_TYPE|XFS_AT_MODE; + + error = 0; + VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error); + if (!error && cvp) { + ASSERT(cvp->v_type == VLNK); + ip = LINVFS_GET_IP(cvp); + d_instantiate(dentry, ip); + validate_fields(dir); + validate_fields(ip); /* size needs update */ + } + return -error; +} + +STATIC int +linvfs_rmdir( + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + vnode_t *dvp = LINVFS_GET_VP(dir); + int error; + + VOP_RMDIR(dvp, dentry, NULL, error); + if (!error) { + validate_fields(inode); + validate_fields(dir); + } + return -error; +} + +STATIC int +linvfs_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry) +{ + struct inode *new_inode = ndentry->d_inode; + vnode_t *fvp; /* from directory */ + vnode_t *tvp; /* target directory */ + int error; + + fvp = LINVFS_GET_VP(odir); + tvp = LINVFS_GET_VP(ndir); + + VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error); + if (error) + return -error; + + if (new_inode) + validate_fields(new_inode); + + validate_fields(odir); + if (ndir != odir) + validate_fields(ndir); + return 0; +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... + */ +STATIC int +linvfs_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + vnode_t *vp; + uio_t *uio; + iovec_t iov; + int error; + char *link; + + ASSERT(dentry); + ASSERT(nd); + + link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL); + if (!link) { + nd_set_link(nd, ERR_PTR(-ENOMEM)); + return 0; + } + + uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL); + if (!uio) { + kfree(link); + nd_set_link(nd, ERR_PTR(-ENOMEM)); + return 0; + } + + vp = LINVFS_GET_VP(dentry->d_inode); + + iov.iov_base = link; + iov.iov_len = MAXNAMELEN; + + uio->uio_iov = &iov; + uio->uio_offset = 0; + uio->uio_segflg = UIO_SYSSPACE; + uio->uio_resid = MAXNAMELEN; + uio->uio_iovcnt = 1; + + VOP_READLINK(vp, uio, 0, NULL, error); + if (error) { + kfree(link); + link = ERR_PTR(-error); + } else { + link[MAXNAMELEN - uio->uio_resid] = '\0'; + } + kfree(uio); + + nd_set_link(nd, link); + return 0; +} + +static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +#ifdef CONFIG_XFS_POSIX_ACL +STATIC int +linvfs_permission( + struct inode *inode, + int mode, + struct nameidata *nd) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + mode <<= 6; /* convert from linux to vnode access bits */ + VOP_ACCESS(vp, mode, NULL, error); + return -error; +} +#else +#define linvfs_permission NULL +#endif + +STATIC int +linvfs_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0; + + if (unlikely(vp->v_flag & VMODIFIED)) + error = vn_revalidate(vp); + if (!error) + generic_fillattr(inode, stat); + return 0; +} + +STATIC int +linvfs_setattr( + struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + unsigned int ia_valid = attr->ia_valid; + vnode_t *vp = LINVFS_GET_VP(inode); + vattr_t vattr; + int flags = 0; + int error; + + memset(&vattr, 0, sizeof(vattr_t)); + if (ia_valid & ATTR_UID) { + vattr.va_mask |= XFS_AT_UID; + vattr.va_uid = attr->ia_uid; + } + if (ia_valid & ATTR_GID) { + vattr.va_mask |= XFS_AT_GID; + vattr.va_gid = attr->ia_gid; + } + if (ia_valid & ATTR_SIZE) { + vattr.va_mask |= XFS_AT_SIZE; + vattr.va_size = attr->ia_size; + } + if (ia_valid & ATTR_ATIME) { + vattr.va_mask |= XFS_AT_ATIME; + vattr.va_atime = attr->ia_atime; + } + if (ia_valid & ATTR_MTIME) { + vattr.va_mask |= XFS_AT_MTIME; + vattr.va_mtime = attr->ia_mtime; + } + if (ia_valid & ATTR_CTIME) { + vattr.va_mask |= XFS_AT_CTIME; + vattr.va_ctime = attr->ia_ctime; + } + if (ia_valid & ATTR_MODE) { + vattr.va_mask |= XFS_AT_MODE; + vattr.va_mode = attr->ia_mode; + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + inode->i_mode &= ~S_ISGID; + } + + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) + flags |= ATTR_UTIME; +#ifdef ATTR_NO_BLOCK + if ((ia_valid & ATTR_NO_BLOCK)) + flags |= ATTR_NONBLOCK; +#endif + + VOP_SETATTR(vp, &vattr, flags, NULL, error); + if (error) + return -error; + vn_revalidate(vp); + return error; +} + +STATIC void +linvfs_truncate( + struct inode *inode) +{ + block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block); +} + +STATIC int +linvfs_setxattr( + struct dentry *dentry, + const char *name, + const void *data, + size_t size, + int flags) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + char *attr = (char *)name; + attrnames_t *namesp; + int xflags = 0; + int error; + + namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); + if (!namesp) + return -EOPNOTSUPP; + attr += namesp->attr_namelen; + error = namesp->attr_capable(vp, NULL); + if (error) + return error; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + xflags |= namesp->attr_flag; + return namesp->attr_set(vp, attr, (void *)data, size, xflags); +} + +STATIC ssize_t +linvfs_getxattr( + struct dentry *dentry, + const char *name, + void *data, + size_t size) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + char *attr = (char *)name; + attrnames_t *namesp; + int xflags = 0; + ssize_t error; + + namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); + if (!namesp) + return -EOPNOTSUPP; + attr += namesp->attr_namelen; + error = namesp->attr_capable(vp, NULL); + if (error) + return error; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + data = NULL; + } + xflags |= namesp->attr_flag; + return namesp->attr_get(vp, attr, (void *)data, size, xflags); +} + +STATIC ssize_t +linvfs_listxattr( + struct dentry *dentry, + char *data, + size_t size) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + int error, xflags = ATTR_KERNAMELS; + ssize_t result; + + if (!size) + xflags |= ATTR_KERNOVAL; + xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS; + + error = attr_generic_list(vp, data, size, xflags, &result); + if (error < 0) + return error; + return result; +} + +STATIC int +linvfs_removexattr( + struct dentry *dentry, + const char *name) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + char *attr = (char *)name; + attrnames_t *namesp; + int xflags = 0; + int error; + + namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); + if (!namesp) + return -EOPNOTSUPP; + attr += namesp->attr_namelen; + error = namesp->attr_capable(vp, NULL); + if (error) + return error; + xflags |= namesp->attr_flag; + return namesp->attr_remove(vp, attr, xflags); +} + + +struct inode_operations linvfs_file_inode_operations = { + .permission = linvfs_permission, + .truncate = linvfs_truncate, + .getattr = linvfs_getattr, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; + +struct inode_operations linvfs_dir_inode_operations = { + .create = linvfs_create, + .lookup = linvfs_lookup, + .link = linvfs_link, + .unlink = linvfs_unlink, + .symlink = linvfs_symlink, + .mkdir = linvfs_mkdir, + .rmdir = linvfs_rmdir, + .mknod = linvfs_mknod, + .rename = linvfs_rename, + .permission = linvfs_permission, + .getattr = linvfs_getattr, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; + +struct inode_operations linvfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = linvfs_follow_link, + .put_link = linvfs_put_link, + .permission = linvfs_permission, + .getattr = linvfs_getattr, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h new file mode 100644 index 00000000000..6a69a62c36b --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +extern struct inode_operations linvfs_file_inode_operations; +extern struct inode_operations linvfs_dir_inode_operations; +extern struct inode_operations linvfs_symlink_inode_operations; + +extern struct file_operations linvfs_file_operations; +extern struct file_operations linvfs_invis_file_operations; +extern struct file_operations linvfs_dir_operations; + +extern struct address_space_operations linvfs_aops; + +extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void linvfs_unwritten_done(struct buffer_head *, int); + +extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, + int, unsigned int, void __user *); + +#endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h new file mode 100644 index 00000000000..71bb41019a1 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include +#include + +/* + * Some types are conditional depending on the target system. + * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. + * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well + * as requiring XFS_BIG_BLKNOS to be set. + */ +#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# define XFS_BIG_BLKNOS 1 +# if BITS_PER_LONG == 64 +# define XFS_BIG_INUMS 1 +# else +# define XFS_BIG_INUMS 0 +# endif +#else +# define XFS_BIG_BLKNOS 0 +# define XFS_BIG_INUMS 0 +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Feature macros (disable/enable) + */ +#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ +#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ + +/* + * State flag for unwritten extent buffers. + * + * We need to be able to distinguish between these and delayed + * allocate buffers within XFS. The generic IO path code does + * not need to distinguish - we use the BH_Delay flag for both + * delalloc and these ondisk-uninitialised buffers. + */ +BUFFER_FNS(PrivateStart, unwritten); +static inline void set_buffer_unwritten_io(struct buffer_head *bh) +{ + bh->b_end_io = linvfs_unwritten_done; +} + +#define restricted_chown xfs_params.restrict_chown.val +#define irix_sgid_inherit xfs_params.sgid_inherit.val +#define irix_symlink_mode xfs_params.symlink_mode.val +#define xfs_panic_mask xfs_params.panic_mask.val +#define xfs_error_level xfs_params.error_level.val +#define xfs_syncd_centisecs xfs_params.syncd_timer.val +#define xfs_stats_clear xfs_params.stats_clear.val +#define xfs_inherit_sync xfs_params.inherit_sync.val +#define xfs_inherit_nodump xfs_params.inherit_nodump.val +#define xfs_inherit_noatime xfs_params.inherit_noatim.val +#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val +#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val +#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val +#define xfs_rotorstep xfs_params.rotorstep.val + +#ifndef __smp_processor_id +#define __smp_processor_id() smp_processor_id() +#endif +#define current_cpu() __smp_processor_id() +#define current_pid() (current->pid) +#define current_fsuid(cred) (current->fsuid) +#define current_fsgid(cred) (current->fsgid) + +#define NBPP PAGE_SIZE +#define DPPSHFT (PAGE_SHIFT - 9) +#define NDPP (1 << (PAGE_SHIFT - 9)) +#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) +#define dtopt(DD) ((DD) >> DPPSHFT) +#define dpoff(DD) ((DD) & (NDPP-1)) + +#define NBBY 8 /* number of bits per byte */ +#define NBPC PAGE_SIZE /* Number of bytes per click */ +#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. + */ +#define BLKDEV_IOSHIFT BPCSHIFT +#define BLKDEV_IOSIZE (1<>BPCSHIFT) +#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) +#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) +#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) +#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) +#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) + +/* off_t bytes to clicks */ +#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) +#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) + +/* clicks to off_t bytes */ +#define ctooff(x) ((xfs_off_t)(x)<>BPCSHIFT) +#define ctob64(x) ((__uint64_t)(x)<>BPCSHIFT) + +#ifndef CELL_CAPABLE +#define FSC_NOTIFY_NAME_CHANGED(vp) +#endif + +#ifndef ENOATTR +#define ENOATTR ENODATA /* Attribute not found */ +#endif + +/* Note: EWRONGFS never visible outside the kernel */ +#define EWRONGFS EINVAL /* Mount with wrong filesystem type */ + +/* + * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't + * return codes out of its known range in errno. + * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't + * conflict with any code we use already or any code a driver may use) + * XXX Some options (currently we do #2): + * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated] + * 2/ 990 ["Unknown error 990"] + * 3/ EUCLEAN ["Structure needs cleaning"] + * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace] + */ +#define EFSCORRUPTED 990 /* Filesystem is corrupted */ + +#define SYNCHRONIZE() barrier() +#define __return_address __builtin_return_address(0) + +/* + * IRIX (BSD) quotactl makes use of separate commands for user/group, + * whereas on Linux the syscall encodes this information into the cmd + * field (see the QCMD macro in quota.h). These macros help keep the + * code portable - they are not visible from the syscall interface. + */ +#define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */ +#define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */ + +/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */ +/* we may well need to fine-tune this if it ever becomes an issue. */ +#define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */ +#define ndquot DQUOT_MAX_HEURISTIC + +/* IRIX uses the current size of the name cache to guess a good value */ +/* - this isn't the same but is a good enough starting point for now. */ +#define DQUOT_HASH_HEURISTIC files_stat.nr_files + +/* IRIX inodes maintain the project ID also, zero this field on Linux */ +#define DEFAULT_PROJID 0 +#define dfltprid DEFAULT_PROJID + +#define MAXPATHLEN 1024 + +#define MIN(a,b) (min(a,b)) +#define MAX(a,b) (max(a,b)) +#define howmany(x, y) (((x)+((y)-1))/(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) + +#define xfs_stack_trace() dump_stack() + +#define xfs_itruncate_data(ip, off) \ + (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off))) + + +/* Move the kernel do_div definition off to one side */ + +#if defined __i386__ +/* For ia32 we need to pull some tricks to get past various versions + * of the compiler which do not like us using do_div in the middle + * of large functions. + */ +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + *(__u64 *)a = c; + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} +#else +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + mod = do_div(*(__u64 *)a, b); + return mod; + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + __u64 c = *(__u64 *)a; + return do_div(c, b); + } + } + + /* NOTREACHED */ + return 0; +} +#endif + +#undef do_div +#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) +#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) + +static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return(x * y); +} + +#define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL) + +#endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c new file mode 100644 index 00000000000..ff145fd0d1a --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +/* + * fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff) + * + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_inode_item.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" + +#include +#include + + +#if defined(XFS_RW_TRACE) +void +xfs_rw_enter_trace( + int tag, + xfs_iocore_t *io, + void *data, + size_t segs, + loff_t offset, + int ioflags) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(unsigned long)tag, + (void *)ip, + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)data, + (void *)((unsigned long)segs), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)ioflags), + (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(io->io_new_size & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} + +void +xfs_inval_cached_trace( + xfs_iocore_t *io, + xfs_off_t offset, + xfs_off_t len, + xfs_off_t first, + xfs_off_t last) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(__psint_t)XFS_INVAL_CACHED, + (void *)ip, + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)((len >> 32) & 0xffffffff)), + (void *)((unsigned long)(len & 0xffffffff)), + (void *)((unsigned long)((first >> 32) & 0xffffffff)), + (void *)((unsigned long)(first & 0xffffffff)), + (void *)((unsigned long)((last >> 32) & 0xffffffff)), + (void *)((unsigned long)(last & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} +#endif + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count, /* size of data to zero */ + loff_t end_size) /* max file size to set */ +{ + unsigned bytes; + struct page *page; + struct address_space *mapping; + char *kaddr; + int status; + + mapping = ip->i_mapping; + do { + unsigned long index, offset; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + break; + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(NULL, page, offset, + offset + bytes); + if (status) { + goto unlock; + } + + memset((void *) (kaddr + offset), 0, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(NULL, page, offset, + offset + bytes); + if (!status) { + pos += bytes; + count -= bytes; + if (pos > i_size_read(ip)) + i_size_write(ip, pos < end_size ? pos : end_size); + } + +unlock: + kunmap(page); + unlock_page(page); + page_cache_release(page); + if (status) + break; + } while (count); + + return (-status); +} + +/* + * xfs_inval_cached_pages + * + * This routine is responsible for keeping direct I/O and buffered I/O + * somewhat coherent. From here we make sure that we're at least + * temporarily holding the inode I/O lock exclusively and then call + * the page cache to flush and invalidate any cached pages. If there + * are no cached pages this routine will be very quick. + */ +void +xfs_inval_cached_pages( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, + int write, + int relock) +{ + if (VN_CACHED(vp)) { + xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED); + } + +} + +ssize_t /* bytes read, or (-) error */ +xfs_read( + bhv_desc_t *bdp, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int segs, + loff_t *offset, + int ioflags, + cred_t *credp) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t size = 0; + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + unsigned long seg; + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xs_read_calls); + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((*offset & target->pbr_smask) || + (size & target->pbr_smask)) { + if (*offset == ip->i_d.di_size) { + return (0); + } + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (size == 0)) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) { + return -EIO; + } + + if (unlikely(ioflags & IO_ISDIRECT)) + down(&inode->i_sem); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + !(ioflags & IO_INVIS)) { + vrwlock_t locktype = VRWLOCK_READ; + + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, + BHV_TO_VNODE(bdp), *offset, size, + FILP_DELAY_FLAG(file), &locktype); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + goto unlock_isem; + } + } + + xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, + (void *)iovp, segs, *offset, ioflags); + ret = __generic_file_aio_read(iocb, iovp, segs, offset); + if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) + ret = wait_on_sync_kiocb(iocb); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + if (likely(!(ioflags & IO_INVIS))) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + +unlock_isem: + if (unlikely(ioflags & IO_ISDIRECT)) + up(&inode->i_sem); + return ret; +} + +ssize_t +xfs_sendfile( + bhv_desc_t *bdp, + struct file *filp, + loff_t *offset, + int ioflags, + size_t count, + read_actor_t actor, + void *target, + cred_t *credp) +{ + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xs_read_calls); + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (count == 0)) + return 0; + + if (n < count) + count = n; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_READ; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count, + FILP_DELAY_FLAG(filp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, + (void *)(unsigned long)target, count, *offset, ioflags); + ret = generic_file_sendfile(filp, offset, count, actor, target); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + if (likely(!(ioflags & IO_INVIS))) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + struct inode *ip, + xfs_iocore_t *io, + xfs_off_t offset, + xfs_fsize_t isize, + xfs_fsize_t end_size) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp; + int nimaps; + int zero_offset; + int zero_len; + int isize_fsb_offset; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); + ASSERT(offset > isize); + + mp = io->io_mount; + + isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); + if (isize_fsb_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + loff = XFS_FSB_TO_B(mp, last_fsb); + lsize = XFS_FSB_TO_B(mp, 1); + + zero_offset = isize_fsb_offset; + zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset; + + error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize, /* current inode size */ + xfs_fsize_t end_size) /* terminal inode size */ +{ + struct inode *ip = LINVFS_GET_IP(vp); + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t prev_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_extlen_t buf_len_fsb; + xfs_extlen_t prev_zero_count; + xfs_mount_t *mp; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + + mp = io->io_mount; + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, io, offset, isize, end_size); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + start_zero_fsb = imap.br_startoff + + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks in the range requested. + * Zero them a single write at a time. We actually + * don't zero the entire range returned if it is + * too big and simply loop around to get the rest. + * That is not the most efficient thing to do, but it + * is simple and this path should not be exercised often. + */ + buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount, + mp->m_writeio_blocks << 8); + /* + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + + loff = XFS_FSB_TO_B(mp, start_zero_fsb); + lsize = XFS_FSB_TO_B(mp, buf_len_fsb); + + error = xfs_iozero(ip, loff, lsize, end_size); + + if (error) { + goto out_lock; + } + + prev_zero_fsb = start_zero_fsb; + prev_zero_count = buf_len_fsb; + start_zero_fsb = imap.br_startoff + buf_len_fsb; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + } + + return 0; + +out_lock: + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +ssize_t /* bytes written, or (-) error */ +xfs_write( + bhv_desc_t *bdp, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int nsegs, + loff_t *offset, + int ioflags, + cred_t *credp) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long segs = nsegs; + xfs_inode_t *xip; + xfs_mount_t *mp; + ssize_t ret = 0, error = 0; + xfs_fsize_t isize, new_size; + xfs_iocore_t *io; + vnode_t *vp; + unsigned long seg; + int iolock; + int eventsent = 0; + vrwlock_t locktype; + size_t ocount = 0, count; + loff_t pos; + int need_isem = 1, need_flush = 0; + + XFS_STATS_INC(xs_write_calls); + + vp = BHV_TO_VNODE(bdp); + xip = XFS_BHVTOI(bdp); + + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + count = ocount; + pos = *offset; + + if (count == 0) + return 0; + + io = &xip->i_iocore; + mp = io->io_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE); + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->pbr_smask) || (count & target->pbr_smask)) + return XFS_ERROR(-EINVAL); + + if (!VN_CACHED(vp) && pos < i_size_read(inode)) + need_isem = 0; + + if (VN_CACHED(vp)) + need_flush = 1; + } + +relock: + if (need_isem) { + iolock = XFS_IOLOCK_EXCL; + locktype = VRWLOCK_WRITE; + + down(&inode->i_sem); + } else { + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + } + + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + + isize = i_size_read(inode); + + if (file->f_flags & O_APPEND) + *offset = isize; + +start: + error = -generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_isem; + } + + new_size = pos + count; + if (new_size > isize) + io->io_new_size = new_size; + + if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && + !(ioflags & IO_INVIS) && !eventsent)) { + loff_t savedsize = pos; + int dmflags = FILP_DELAY_FLAG(file); + + if (need_isem) + dmflags |= DM_FLAGS_ISEM; + + xfs_iunlock(xip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, + pos, count, + dmflags, &locktype); + if (error) { + xfs_iunlock(xip, iolock); + goto out_unlock_isem; + } + xfs_ilock(xip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reaquired in XFS_SEND_DATA + * so we have to recheck the size when appending. + * We will only "goto start;" once, since having sent the + * event prevents another call to XFS_SEND_DATA, which is + * what allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && savedsize != isize) { + pos = isize = xip->i_d.di_size; + goto start; + } + } + + /* + * On Linux, generic_file_write updates the times even if + * no data is copied in so long as the write had a size. + * + * We must update xfs' times since revalidate will overcopy xfs. + */ + if (!(ioflags & IO_INVIS)) { + xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + inode_update_time(inode, 1); + } + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (pos > isize) { + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, + isize, pos + count); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_isem; + } + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. + */ + + if (((xip->i_d.di_mode & S_ISUID) || + ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == + (S_ISGID | S_IXGRP))) && + !capable(CAP_FSETID)) { + error = xfs_write_clear_setuid(xip); + if (likely(!error)) + error = -remove_suid(file->f_dentry); + if (unlikely(error)) { + xfs_iunlock(xip, iolock); + goto out_unlock_isem; + } + } + +retry: + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + if ((ioflags & IO_ISDIRECT)) { + if (need_flush) { + xfs_inval_cached_trace(io, pos, -1, + ctooff(offtoct(pos)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)), + -1, FI_REMAPF_LOCKED); + } + + if (need_isem) { + /* demote the lock now the cached pages are gone */ + XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); + up(&inode->i_sem); + + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + need_isem = 0; + } + + xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, + *offset, ioflags); + ret = generic_file_direct_write(iocb, iovp, + &segs, pos, offset, count, ocount); + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + if (ret >= 0 && ret != count) { + XFS_STATS_ADD(xs_write_bytes, ret); + + pos += ret; + count -= ret; + + need_isem = 1; + ioflags &= ~IO_ISDIRECT; + xfs_iunlock(xip, iolock); + goto relock; + } + } else { + xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, + *offset, ioflags); + ret = generic_file_buffered_write(iocb, iovp, segs, + pos, offset, count, ret); + } + + current->backing_dev_info = NULL; + + if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) + ret = wait_on_sync_kiocb(iocb); + + if ((ret == -ENOSPC) && + DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && + !(ioflags & IO_INVIS)) { + + xfs_rwunlock(bdp, locktype); + error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, + DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (error) + goto out_unlock_isem; + xfs_rwlock(bdp, locktype); + pos = xip->i_d.di_size; + ret = 0; + goto retry; + } + + if (*offset > xip->i_d.di_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + if (*offset > xip->i_d.di_size) { + xip->i_d.di_size = *offset; + i_size_write(inode, *offset); + xip->i_update_core = 1; + xip->i_update_size = 1; + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + + error = -ret; + if (ret <= 0) + goto out_unlock_internal; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + /* + * If we're treating this as O_DSYNC and we have not updated the + * size, force the log. + */ + if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && + !(xip->i_update_size)) { + xfs_inode_log_item_t *iip = xip->i_itemp; + + /* + * If an allocation transaction occurred + * without extending the size, then we have to force + * the log up the proper point to ensure that the + * allocation is permanent. We can't count on + * the fact that buffered writes lock out direct I/O + * writes - the direct I/O write could have extended + * the size nontransactionally, then finished before + * we started. xfs_write_file will think that the file + * didn't grow but the update isn't safe unless the + * size change is logged. + * + * Force the log if we've committed a transaction + * against the inode or if someone else has and + * the commit record hasn't gone to disk (e.g. + * the inode is pinned). This guarantees that + * all changes affecting the inode are permanent + * when we return. + */ + if (iip && iip->ili_last_lsn) { + xfs_log_force(mp, iip->ili_last_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } else if (xfs_ipincount(xip) > 0) { + xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } + + } else { + xfs_trans_t *tp; + + /* + * O_SYNC or O_DSYNC _with_ a size update are handled + * the same way. + * + * If the write was synchronous then we need to make + * sure that the inode modification time is permanent. + * We'll have updated the timestamp above, so here + * we use a synchronous transaction to log the inode. + * It's not fast, but it's necessary. + * + * If this a dsync write and the size got changed + * non-transactionally, then we need to ensure that + * the size change gets logged in a synchronous + * transaction. + */ + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); + if ((error = xfs_trans_reserve(tp, 0, + XFS_SWRITE_LOG_RES(mp), + 0, 0, 0))) { + /* Transaction reserve failed */ + xfs_trans_cancel(tp, 0); + } else { + /* Transaction reserve successful */ + xfs_ilock(xip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, xip); + xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + if (error) + goto out_unlock_internal; + } + + xfs_rwunlock(bdp, locktype); + if (need_isem) + up(&inode->i_sem); + + error = sync_page_range(inode, mapping, pos, ret); + if (!error) + error = ret; + return error; + } + + out_unlock_internal: + xfs_rwunlock(bdp, locktype); + out_unlock_isem: + if (need_isem) + up(&inode->i_sem); + return -error; +} + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb(struct xfs_buf *bp) +{ + xfs_mount_t *mp; + + mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); + if (!XFS_FORCED_SHUTDOWN(mp)) { + pagebuf_iorequest(bp); + return 0; + } else { + xfs_buftrace("XFS__BDSTRAT IOERROR", bp); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (XFS_BUF_IODONE_FUNC(bp) == NULL && + (XFS_BUF_ISREAD(bp)) == 0) + return (xfs_bioerror_relse(bp)); + else + return (xfs_bioerror(bp)); + } +} + + +int +xfs_bmap(bhv_desc_t *bdp, + xfs_off_t offset, + ssize_t count, + int flags, + xfs_iomap_t *iomapp, + int *niomaps) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_iocore_t *io = &ip->i_iocore; + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == + ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); + + return xfs_iomap(io, offset, count, flags, iomapp, niomaps); +} + +/* + * Wrapper around bdstrat so that we can stop data + * from going to disk in case we are shutting down the filesystem. + * Typically user data goes thru this path; one of the exceptions + * is the superblock. + */ +int +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + ASSERT(mp); + if (!XFS_FORCED_SHUTDOWN(mp)) { + /* Grio redirection would go here + * if (XFS_BUF_IS_GRIO(bp)) { + */ + + pagebuf_iorequest(bp); + return 0; + } + + xfs_buftrace("XFSBDSTRAT IOERROR", bp); + return (xfs_bioerror_relse(bp)); +} + +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + xfs_mount_t *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + cmn_err(CE_NOTE, + "XFS: %s required on read-only device.", message); + cmn_err(CE_NOTE, + "XFS: write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h new file mode 100644 index 00000000000..d723e35254a --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LRW_H__ +#define __XFS_LRW_H__ + +struct vnode; +struct bhv_desc; +struct xfs_mount; +struct xfs_iocore; +struct xfs_inode; +struct xfs_bmbt_irec; +struct xfs_buf; +struct xfs_iomap; + +#if defined(XFS_RW_TRACE) +/* + * Defines for the trace mechanisms in xfs_lrw.c. + */ +#define XFS_RW_KTRACE_SIZE 128 + +#define XFS_READ_ENTER 1 +#define XFS_WRITE_ENTER 2 +#define XFS_IOMAP_READ_ENTER 3 +#define XFS_IOMAP_WRITE_ENTER 4 +#define XFS_IOMAP_READ_MAP 5 +#define XFS_IOMAP_WRITE_MAP 6 +#define XFS_IOMAP_WRITE_NOSPACE 7 +#define XFS_ITRUNC_START 8 +#define XFS_ITRUNC_FINISH1 9 +#define XFS_ITRUNC_FINISH2 10 +#define XFS_CTRUNC1 11 +#define XFS_CTRUNC2 12 +#define XFS_CTRUNC3 13 +#define XFS_CTRUNC4 14 +#define XFS_CTRUNC5 15 +#define XFS_CTRUNC6 16 +#define XFS_BUNMAPI 17 +#define XFS_INVAL_CACHED 18 +#define XFS_DIORD_ENTER 19 +#define XFS_DIOWR_ENTER 20 +#define XFS_SENDFILE_ENTER 21 +#define XFS_WRITEPAGE_ENTER 22 +#define XFS_RELEASEPAGE_ENTER 23 +#define XFS_IOMAP_ALLOC_ENTER 24 +#define XFS_IOMAP_ALLOC_MAP 25 +#define XFS_IOMAP_UNWRITTEN 26 +extern void xfs_rw_enter_trace(int, struct xfs_iocore *, + void *, size_t, loff_t, int); +extern void xfs_inval_cached_trace(struct xfs_iocore *, + xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t); +#else +#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags) +#define xfs_inval_cached_trace(io, offset, len, first, last) +#endif + +/* + * Maximum count of bmaps used by read and write paths. + */ +#define XFS_MAX_RW_NBMAPS 4 + +extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, + struct xfs_iomap *, int *); +extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +extern int xfs_bdstrat_cb(struct xfs_buf *); + +extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t, + xfs_fsize_t, xfs_fsize_t); +extern void xfs_inval_cached_pages(struct vnode *, struct xfs_iocore *, + xfs_off_t, int, int); +extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, + loff_t *, int, size_t, read_actor_t, + void *, struct cred *); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +#define XFS_FSB_TO_DB_IO(io,fsb) \ + (((io)->io_flags & XFS_IOCORE_RT) ? \ + XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((io)->io_mount, (fsb))) + +#endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c new file mode 100644 index 00000000000..aaf5ddba47f --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +STATIC int +xfs_read_xfsstats( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + int c, i, j, len, val; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + }; + + /* Loop over all stats groups */ + for (i=j=len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) { + len += sprintf(buffer + len, xstats[i].desc); + /* inner loop does each group */ + while (j < xstats[i].endpoint) { + val = 0; + /* sum over all cpus */ + for (c = 0; c < NR_CPUS; c++) { + if (!cpu_possible(c)) continue; + val += *(((__u32*)&per_cpu(xfsstats, c) + j)); + } + len += sprintf(buffer + len, " %u", val); + j++; + } + buffer[len++] = '\n'; + } + /* extra precision counters */ + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) continue; + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += sprintf(buffer + len, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} + +void +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + return; + create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL); +} + +void +xfs_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h new file mode 100644 index 00000000000..3f756a6c3eb --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +#include + +/* + * XFS global statistics + */ +struct xfsstats { +# define XFSSTAT_END_EXTENT_ALLOC 4 + __uint32_t xs_allocx; + __uint32_t xs_allocb; + __uint32_t xs_freex; + __uint32_t xs_freeb; +# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) + __uint32_t xs_abt_lookup; + __uint32_t xs_abt_compare; + __uint32_t xs_abt_insrec; + __uint32_t xs_abt_delrec; +# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) + __uint32_t xs_blk_mapr; + __uint32_t xs_blk_mapw; + __uint32_t xs_blk_unmap; + __uint32_t xs_add_exlist; + __uint32_t xs_del_exlist; + __uint32_t xs_look_exlist; + __uint32_t xs_cmp_exlist; +# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) + __uint32_t xs_bmbt_lookup; + __uint32_t xs_bmbt_compare; + __uint32_t xs_bmbt_insrec; + __uint32_t xs_bmbt_delrec; +# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) + __uint32_t xs_dir_lookup; + __uint32_t xs_dir_create; + __uint32_t xs_dir_remove; + __uint32_t xs_dir_getdents; +# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) + __uint32_t xs_trans_sync; + __uint32_t xs_trans_async; + __uint32_t xs_trans_empty; +# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) + __uint32_t xs_ig_attempts; + __uint32_t xs_ig_found; + __uint32_t xs_ig_frecycle; + __uint32_t xs_ig_missed; + __uint32_t xs_ig_dup; + __uint32_t xs_ig_reclaims; + __uint32_t xs_ig_attrchg; +# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) + __uint32_t xs_log_writes; + __uint32_t xs_log_blocks; + __uint32_t xs_log_noiclogs; + __uint32_t xs_log_force; + __uint32_t xs_log_force_sleep; +# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) + __uint32_t xs_try_logspace; + __uint32_t xs_sleep_logspace; + __uint32_t xs_push_ail; + __uint32_t xs_push_ail_success; + __uint32_t xs_push_ail_pushbuf; + __uint32_t xs_push_ail_pinned; + __uint32_t xs_push_ail_locked; + __uint32_t xs_push_ail_flushing; + __uint32_t xs_push_ail_restarts; + __uint32_t xs_push_ail_flush; +# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) + __uint32_t xs_xstrat_quick; + __uint32_t xs_xstrat_split; +# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) + __uint32_t xs_write_calls; + __uint32_t xs_read_calls; +# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) + __uint32_t xs_attr_get; + __uint32_t xs_attr_set; + __uint32_t xs_attr_remove; + __uint32_t xs_attr_list; +# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) + __uint32_t xs_iflush_count; + __uint32_t xs_icluster_flushcnt; + __uint32_t xs_icluster_flushinode; +# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) + __uint32_t vn_active; /* # vnodes not on free lists */ + __uint32_t vn_alloc; /* # times vn_alloc called */ + __uint32_t vn_get; /* # times vn_get called */ + __uint32_t vn_hold; /* # times vn_hold called */ + __uint32_t vn_rele; /* # times vn_rele called */ + __uint32_t vn_reclaim; /* # times vn_reclaim called */ + __uint32_t vn_remove; /* # times vn_remove called */ + __uint32_t vn_free; /* # times vn_free called */ +#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) + __uint32_t pb_get; + __uint32_t pb_create; + __uint32_t pb_get_locked; + __uint32_t pb_get_locked_waited; + __uint32_t pb_busy_locked; + __uint32_t pb_miss_locked; + __uint32_t pb_page_retries; + __uint32_t pb_page_found; + __uint32_t pb_get_read; +/* Extra precision counters */ + __uint64_t xs_xstrat_bytes; + __uint64_t xs_write_bytes; + __uint64_t xs_read_bytes; +}; + +DECLARE_PER_CPU(struct xfsstats, xfsstats); + +/* + * We don't disable preempt, not too worried about poking the + * wrong CPU's stat for now (also aggregated before reporting). + */ +#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) +#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) +#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) + +extern void xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); + + +#else /* !CONFIG_PROC_FS */ + +# define XFS_STATS_INC(count) +# define XFS_STATS_DEC(count) +# define XFS_STATS_ADD(count, inc) + +static __inline void xfs_init_procfs(void) { }; +static __inline void xfs_cleanup_procfs(void) { }; + +#endif /* !CONFIG_PROC_FS */ + +#endif /* __XFS_STATS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c new file mode 100644 index 00000000000..53dc658cafa --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_clnt.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_version.h" +#include "xfs_ioctl32.h" + +#include +#include +#include +#include + +STATIC struct quotactl_ops linvfs_qops; +STATIC struct super_operations linvfs_sops; +STATIC kmem_zone_t *linvfs_inode_zone; + +STATIC struct xfs_mount_args * +xfs_args_allocate( + struct super_block *sb) +{ + struct xfs_mount_args *args; + + args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP); + args->logbufs = args->logbufsize = -1; + strncpy(args->fsname, sb->s_id, MAXNAMELEN); + + /* Copy the already-parsed mount(2) flags we're interested in */ + if (sb->s_flags & MS_NOATIME) + args->flags |= XFSMNT_NOATIME; + if (sb->s_flags & MS_DIRSYNC) + args->flags |= XFSMNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + args->flags |= XFSMNT_WSYNC; + + /* Default to 32 bit inodes on Linux all the time */ + args->flags |= XFSMNT_32BITINODES; + + return args; +} + +__uint64_t +xfs_max_file_offset( + unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_prepare_write does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBD) + ASSERT(sizeof(sector_t) == 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((__uint64_t)pagefactor) << bitshift) - 1; +} + +STATIC __inline__ void +xfs_set_inodeops( + struct inode *inode) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + + if (vp->v_type == VNON) { + vn_mark_bad(vp); + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &linvfs_file_inode_operations; + inode->i_fop = &linvfs_file_operations; + inode->i_mapping->a_ops = &linvfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &linvfs_dir_inode_operations; + inode->i_fop = &linvfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &linvfs_symlink_inode_operations; + if (inode->i_blocks) + inode->i_mapping->a_ops = &linvfs_aops; + } else { + inode->i_op = &linvfs_file_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } +} + +STATIC __inline__ void +xfs_revalidate_inode( + xfs_mount_t *mp, + vnode_t *vp, + xfs_inode_t *ip) +{ + struct inode *inode = LINVFS_GET_IP(vp); + + inode->i_mode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type); + inode->i_nlink = ip->i_d.di_nlink; + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; + if (((1 << vp->v_type) & ((1<i_rdev = 0; + } else { + xfs_dev_t dev = ip->i_df.if_u2.if_rdev; + inode->i_rdev = MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); + } + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + vp->v_flag &= ~VMODIFIED; +} + +void +xfs_initialize_vnode( + bhv_desc_t *bdp, + vnode_t *vp, + bhv_desc_t *inode_bhv, + int unlock) +{ + xfs_inode_t *ip = XFS_BHVTOI(inode_bhv); + struct inode *inode = LINVFS_GET_IP(vp); + + if (!inode_bhv->bd_vobj) { + vp->v_vfsp = bhvtovfs(bdp); + bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops); + bhv_insert(VN_BHV_HEAD(vp), inode_bhv); + } + + /* + * We need to set the ops vectors, and unlock the inode, but if + * we have been called during the new inode create process, it is + * too early to fill in the Linux inode. We will get called a + * second time once the inode is properly set up, and then we can + * finish our work. + */ + if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) { + vp->v_type = IFTOVT(ip->i_d.di_mode); + xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip); + xfs_set_inodeops(inode); + + ip->i_flags &= ~XFS_INEW; + barrier(); + + unlock_new_inode(inode); + } +} + +int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = open_bdev_excl(name, 0, mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + printk("XFS: Invalid device [%s], error=%d\n", name, error); + } + + return -error; +} + +void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + close_bdev_excl(bdev); +} + + +STATIC struct inode * +linvfs_alloc_inode( + struct super_block *sb) +{ + vnode_t *vp; + + vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_zone, + kmem_flags_convert(KM_SLEEP)); + if (!vp) + return NULL; + return LINVFS_GET_IP(vp); +} + +STATIC void +linvfs_destroy_inode( + struct inode *inode) +{ + kmem_cache_free(linvfs_inode_zone, LINVFS_GET_VP(inode)); +} + +STATIC void +init_once( + void *data, + kmem_cache_t *cachep, + unsigned long flags) +{ + vnode_t *vp = (vnode_t *)data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(LINVFS_GET_IP(vp)); +} + +STATIC int +init_inodecache( void ) +{ + linvfs_inode_zone = kmem_cache_create("linvfs_icache", + sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (linvfs_inode_zone == NULL) + return -ENOMEM; + return 0; +} + +STATIC void +destroy_inodecache( void ) +{ + if (kmem_cache_destroy(linvfs_inode_zone)) + printk(KERN_WARNING "%s: cache still in use!\n", __FUNCTION__); +} + +/* + * Attempt to flush the inode, this will actually fail + * if the inode is pinned, but we dirty the inode again + * at the point when it is unpinned after a log write, + * since this is when the inode itself becomes flushable. + */ +STATIC int +linvfs_write_inode( + struct inode *inode, + int sync) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0, flags = FLUSH_INODE; + + if (vp) { + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + if (sync) + flags |= FLUSH_SYNC; + VOP_IFLUSH(vp, flags, error); + if (error == EAGAIN) { + if (sync) + VOP_IFLUSH(vp, flags | FLUSH_LOG, error); + else + error = 0; + } + } + + return -error; +} + +STATIC void +linvfs_clear_inode( + struct inode *inode) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + + if (vp) { + vn_rele(vp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + /* + * Do all our cleanup, and remove this vnode. + */ + vn_remove(vp); + } +} + + +/* + * Enqueue a work item to be picked up by the vfs xfssyncd thread. + * Doing this has two advantages: + * - It saves on stack space, which is tight in certain situations + * - It can be used (with care) as a mechanism to avoid deadlocks. + * Flushing while allocating in a full filesystem requires both. + */ +STATIC void +xfs_syncd_queue_work( + struct vfs *vfs, + void *data, + void (*syncer)(vfs_t *, void *)) +{ + vfs_sync_work_t *work; + + work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP); + INIT_LIST_HEAD(&work->w_list); + work->w_syncer = syncer; + work->w_data = data; + work->w_vfs = vfs; + spin_lock(&vfs->vfs_sync_lock); + list_add_tail(&work->w_list, &vfs->vfs_sync_list); + spin_unlock(&vfs->vfs_sync_lock); + wake_up_process(vfs->vfs_sync_task); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room... + */ +STATIC void +xfs_flush_inode_work( + vfs_t *vfs, + void *inode) +{ + filemap_flush(((struct inode *)inode)->i_mapping); + iput((struct inode *)inode); +} + +void +xfs_flush_inode( + xfs_inode_t *ip) +{ + struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); + struct vfs *vfs = XFS_MTOVFS(ip->i_mount); + + igrab(inode); + xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work); + delay(HZ/2); +} + +/* + * This is the "bigger hammer" version of xfs_flush_inode_work... + * (IOW, "If at first you don't succeed, use a Bigger Hammer"). + */ +STATIC void +xfs_flush_device_work( + vfs_t *vfs, + void *inode) +{ + sync_blockdev(vfs->vfs_super->s_bdev); + iput((struct inode *)inode); +} + +void +xfs_flush_device( + xfs_inode_t *ip) +{ + struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); + struct vfs *vfs = XFS_MTOVFS(ip->i_mount); + + igrab(inode); + xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work); + delay(HZ/2); + xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); +} + +#define SYNCD_FLAGS (SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR) +STATIC void +vfs_sync_worker( + vfs_t *vfsp, + void *unused) +{ + int error; + + if (!(vfsp->vfs_flag & VFS_RDONLY)) + VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error); + vfsp->vfs_sync_seq++; + wmb(); + wake_up(&vfsp->vfs_wait_single_sync_task); +} + +STATIC int +xfssyncd( + void *arg) +{ + long timeleft; + vfs_t *vfsp = (vfs_t *) arg; + struct list_head tmp; + struct vfs_sync_work *work, *n; + + daemonize("xfssyncd"); + + vfsp->vfs_sync_work.w_vfs = vfsp; + vfsp->vfs_sync_work.w_syncer = vfs_sync_worker; + vfsp->vfs_sync_task = current; + wmb(); + wake_up(&vfsp->vfs_wait_sync_task); + + INIT_LIST_HEAD(&tmp); + timeleft = (xfs_syncd_centisecs * HZ) / 100; + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + timeleft = schedule_timeout(timeleft); + /* swsusp */ + try_to_freeze(PF_FREEZE); + if (vfsp->vfs_flag & VFS_UMOUNT) + break; + + spin_lock(&vfsp->vfs_sync_lock); + /* + * We can get woken by laptop mode, to do a sync - + * that's the (only!) case where the list would be + * empty with time remaining. + */ + if (!timeleft || list_empty(&vfsp->vfs_sync_list)) { + if (!timeleft) + timeleft = (xfs_syncd_centisecs * HZ) / 100; + INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list); + list_add_tail(&vfsp->vfs_sync_work.w_list, + &vfsp->vfs_sync_list); + } + list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list) + list_move(&work->w_list, &tmp); + spin_unlock(&vfsp->vfs_sync_lock); + + list_for_each_entry_safe(work, n, &tmp, w_list) { + (*work->w_syncer)(vfsp, work->w_data); + list_del(&work->w_list); + if (work == &vfsp->vfs_sync_work) + continue; + kmem_free(work, sizeof(struct vfs_sync_work)); + } + } + + vfsp->vfs_sync_task = NULL; + wmb(); + wake_up(&vfsp->vfs_wait_sync_task); + + return 0; +} + +STATIC int +linvfs_start_syncd( + vfs_t *vfsp) +{ + int pid; + + pid = kernel_thread(xfssyncd, (void *) vfsp, + CLONE_VM | CLONE_FS | CLONE_FILES); + if (pid < 0) + return -pid; + wait_event(vfsp->vfs_wait_sync_task, vfsp->vfs_sync_task); + return 0; +} + +STATIC void +linvfs_stop_syncd( + vfs_t *vfsp) +{ + vfsp->vfs_flag |= VFS_UMOUNT; + wmb(); + + wake_up_process(vfsp->vfs_sync_task); + wait_event(vfsp->vfs_wait_sync_task, !vfsp->vfs_sync_task); +} + +STATIC void +linvfs_put_super( + struct super_block *sb) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + linvfs_stop_syncd(vfsp); + VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error); + if (!error) + VFS_UNMOUNT(vfsp, 0, NULL, error); + if (error) { + printk("XFS unmount got error %d\n", error); + printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp); + return; + } + + vfs_deallocate(vfsp); +} + +STATIC void +linvfs_write_super( + struct super_block *sb) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + if (sb->s_flags & MS_RDONLY) { + sb->s_dirt = 0; /* paranoia */ + return; + } + /* Push the log and superblock a little */ + VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error); + sb->s_dirt = 0; +} + +STATIC int +linvfs_sync_super( + struct super_block *sb, + int wait) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + int flags = SYNC_FSDATA; + + if (wait) + flags |= SYNC_WAIT; + + VFS_SYNC(vfsp, flags, NULL, error); + sb->s_dirt = 0; + + if (unlikely(laptop_mode)) { + int prev_sync_seq = vfsp->vfs_sync_seq; + + /* + * The disk must be active because we're syncing. + * We schedule xfssyncd now (now that the disk is + * active) instead of later (when it might not be). + */ + wake_up_process(vfsp->vfs_sync_task); + /* + * We have to wait for the sync iteration to complete. + * If we don't, the disk activity caused by the sync + * will come after the sync is completed, and that + * triggers another sync from laptop mode. + */ + wait_event(vfsp->vfs_wait_single_sync_task, + vfsp->vfs_sync_seq != prev_sync_seq); + } + + return -error; +} + +STATIC int +linvfs_statfs( + struct super_block *sb, + struct kstatfs *statp) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_STATVFS(vfsp, statp, NULL, error); + return -error; +} + +STATIC int +linvfs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb); + int error; + + VFS_PARSEARGS(vfsp, options, args, 1, error); + if (!error) + VFS_MNTUPDATE(vfsp, flags, args, error); + kmem_free(args, sizeof(*args)); + return -error; +} + +STATIC void +linvfs_freeze_fs( + struct super_block *sb) +{ + VFS_FREEZE(LINVFS_GET_VFS(sb)); +} + +STATIC int +linvfs_show_options( + struct seq_file *m, + struct vfsmount *mnt) +{ + struct vfs *vfsp = LINVFS_GET_VFS(mnt->mnt_sb); + int error; + + VFS_SHOWARGS(vfsp, m, error); + return error; +} + +STATIC int +linvfs_getxstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error); + return -error; +} + +STATIC int +linvfs_setxstate( + struct super_block *sb, + unsigned int flags, + int op) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error); + return -error; +} + +STATIC int +linvfs_getxquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error, getmode; + + getmode = (type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETQUOTA; + VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error); + return -error; +} + +STATIC int +linvfs_setxquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error, setmode; + + setmode = (type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETQLIM; + VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error); + return -error; +} + +STATIC int +linvfs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + vnode_t *rootvp; + struct vfs *vfsp = vfs_allocate(); + struct xfs_mount_args *args = xfs_args_allocate(sb); + struct kstatfs statvfs; + int error, error2; + + vfsp->vfs_super = sb; + LINVFS_SET_VFS(sb, vfsp); + if (sb->s_flags & MS_RDONLY) + vfsp->vfs_flag |= VFS_RDONLY; + bhv_insert_all_vfsops(vfsp); + + VFS_PARSEARGS(vfsp, (char *)data, args, 0, error); + if (error) { + bhv_remove_all_vfsops(vfsp, 1); + goto fail_vfsop; + } + + sb_min_blocksize(sb, BBSIZE); +#ifdef CONFIG_XFS_EXPORT + sb->s_export_op = &linvfs_export_ops; +#endif + sb->s_qcop = &linvfs_qops; + sb->s_op = &linvfs_sops; + + VFS_MOUNT(vfsp, args, NULL, error); + if (error) { + bhv_remove_all_vfsops(vfsp, 1); + goto fail_vfsop; + } + + VFS_STATVFS(vfsp, &statvfs, NULL, error); + if (error) + goto fail_unmount; + + sb->s_dirt = 1; + sb->s_magic = statvfs.f_type; + sb->s_blocksize = statvfs.f_bsize; + sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + VFS_ROOT(vfsp, &rootvp, error); + if (error) + goto fail_unmount; + + sb->s_root = d_alloc_root(LINVFS_GET_IP(rootvp)); + if (!sb->s_root) { + error = ENOMEM; + goto fail_vnrele; + } + if (is_bad_inode(sb->s_root->d_inode)) { + error = EINVAL; + goto fail_vnrele; + } + if ((error = linvfs_start_syncd(vfsp))) + goto fail_vnrele; + vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address); + + kmem_free(args, sizeof(*args)); + return 0; + +fail_vnrele: + if (sb->s_root) { + dput(sb->s_root); + sb->s_root = NULL; + } else { + VN_RELE(rootvp); + } + +fail_unmount: + VFS_UNMOUNT(vfsp, 0, NULL, error2); + +fail_vfsop: + vfs_deallocate(vfsp); + kmem_free(args, sizeof(*args)); + return -error; +} + +STATIC struct super_block * +linvfs_get_sb( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super); +} + +STATIC struct super_operations linvfs_sops = { + .alloc_inode = linvfs_alloc_inode, + .destroy_inode = linvfs_destroy_inode, + .write_inode = linvfs_write_inode, + .clear_inode = linvfs_clear_inode, + .put_super = linvfs_put_super, + .write_super = linvfs_write_super, + .sync_fs = linvfs_sync_super, + .write_super_lockfs = linvfs_freeze_fs, + .statfs = linvfs_statfs, + .remount_fs = linvfs_remount, + .show_options = linvfs_show_options, +}; + +STATIC struct quotactl_ops linvfs_qops = { + .get_xstate = linvfs_getxstate, + .set_xstate = linvfs_setxstate, + .get_xquota = linvfs_getxquota, + .set_xquota = linvfs_setxquota, +}; + +STATIC struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .get_sb = linvfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + + +STATIC int __init +init_xfs_fs( void ) +{ + int error; + struct sysinfo si; + static char message[] __initdata = KERN_INFO \ + XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"; + + printk(message); + + si_meminfo(&si); + xfs_physmem = si.totalram; + + ktrace_init(64); + + error = init_inodecache(); + if (error < 0) + goto undo_inodecache; + + error = pagebuf_init(); + if (error < 0) + goto undo_pagebuf; + + vn_init(); + xfs_init(); + uuid_init(); + vfs_initquota(); + + error = register_filesystem(&xfs_fs_type); + if (error) + goto undo_register; + XFS_DM_INIT(&xfs_fs_type); + return 0; + +undo_register: + pagebuf_terminate(); + +undo_pagebuf: + destroy_inodecache(); + +undo_inodecache: + return error; +} + +STATIC void __exit +exit_xfs_fs( void ) +{ + vfs_exitquota(); + XFS_DM_EXIT(&xfs_fs_type); + unregister_filesystem(&xfs_fs_type); + xfs_cleanup(); + pagebuf_terminate(); + destroy_inodecache(); + ktrace_uninit(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h new file mode 100644 index 00000000000..ec7e0035c73 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPER_H__ +#define __XFS_SUPER_H__ + +#ifdef CONFIG_XFS_DMAPI +# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) +# define vfs_initdmapi() dmapi_init() +# define vfs_exitdmapi() dmapi_uninit() +#else +# define vfs_insertdmapi(vfs) do { } while (0) +# define vfs_initdmapi() do { } while (0) +# define vfs_exitdmapi() do { } while (0) +#endif + +#ifdef CONFIG_XFS_QUOTA +# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops) +extern void xfs_qm_init(void); +extern void xfs_qm_exit(void); +# define vfs_initquota() xfs_qm_init() +# define vfs_exitquota() xfs_qm_exit() +#else +# define vfs_insertquota(vfs) do { } while (0) +# define vfs_initquota() do { } while (0) +# define vfs_exitquota() do { } while (0) +#endif + +#ifdef CONFIG_XFS_POSIX_ACL +# define XFS_ACL_STRING "ACLs, " +# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define XFS_ACL_STRING +# define set_posix_acl_flag(sb) do { } while (0) +#endif + +#ifdef CONFIG_XFS_SECURITY +# define XFS_SECURITY_STRING "security attributes, " +# define ENOSECURITY 0 +#else +# define XFS_SECURITY_STRING +# define ENOSECURITY EOPNOTSUPP +#endif + +#ifdef CONFIG_XFS_RT +# define XFS_REALTIME_STRING "realtime, " +#else +# define XFS_REALTIME_STRING +#endif + +#if XFS_BIG_BLKNOS +# if XFS_BIG_INUMS +# define XFS_BIGFS_STRING "large block/inode numbers, " +# else +# define XFS_BIGFS_STRING "large block numbers, " +# endif +#else +# define XFS_BIGFS_STRING +#endif + +#ifdef CONFIG_XFS_TRACE +# define XFS_TRACE_STRING "tracing, " +#else +# define XFS_TRACE_STRING +#endif + +#ifdef CONFIG_XFS_DMAPI +# define XFS_DMAPI_STRING "dmapi support, " +#else +# define XFS_DMAPI_STRING +#endif + +#ifdef DEBUG +# define XFS_DBG_STRING "debug" +#else +# define XFS_DBG_STRING "no debug" +#endif + +#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ + XFS_SECURITY_STRING \ + XFS_REALTIME_STRING \ + XFS_BIGFS_STRING \ + XFS_TRACE_STRING \ + XFS_DMAPI_STRING \ + XFS_DBG_STRING /* DBG must be last */ + +#define LINVFS_GET_VFS(s) \ + (vfs_t *)((s)->s_fs_info) +#define LINVFS_SET_VFS(s, vfsp) \ + ((s)->s_fs_info = vfsp) + +struct xfs_inode; +struct xfs_mount; +struct xfs_buftarg; +struct block_device; + +extern __uint64_t xfs_max_file_offset(unsigned int); + +extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int); + +extern void xfs_flush_inode(struct xfs_inode *); +extern void xfs_flush_device(struct xfs_inode *); + +extern int xfs_blkdev_get(struct xfs_mount *, const char *, + struct block_device **); +extern void xfs_blkdev_put(struct block_device *); + +extern struct export_operations linvfs_export_ops; + +#endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c new file mode 100644 index 00000000000..0dc010356f4 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_rw.h" +#include +#include + + +static struct ctl_table_header *xfs_table_header; + + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + ctl_table *ctl, + int write, + struct file *filp, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); + + if (!ret && write && *valp) { + printk("XFS Clearing xfsstats\n"); + for (c = 0; c < NR_CPUS; c++) { + if (!cpu_possible(c)) continue; + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} +#endif /* CONFIG_PROC_FS */ + +STATIC ctl_table xfs_table[] = { + {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max}, + + {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max}, + + {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max}, + + {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.panic_mask.min, &xfs_params.panic_mask.max}, + + {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.error_level.min, &xfs_params.error_level.max}, + + {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max}, + + {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max}, + + {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max}, + + {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max}, + + {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max}, + + {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max}, + + {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max}, + + {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.rotorstep.min, &xfs_params.rotorstep.max}, + + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val, + sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler, + &sysctl_intvec, NULL, + &xfs_params.stats_clear.min, &xfs_params.stats_clear.max}, +#endif /* CONFIG_PROC_FS */ + + {0} +}; + +STATIC ctl_table xfs_dir_table[] = { + {FS_XFS, "xfs", NULL, 0, 0555, xfs_table}, + {0} +}; + +STATIC ctl_table xfs_root_table[] = { + {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table}, + {0} +}; + +void +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table, 1); +} + +void +xfs_sysctl_unregister(void) +{ + if (xfs_table_header) + unregister_sysctl_table(xfs_table_header); +} diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h new file mode 100644 index 00000000000..a39a95020a5 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include + +/* + * Tunable xfs parameters + */ + +typedef struct xfs_sysctl_val { + int min; + int val; + int max; +} xfs_sysctl_val_t; + +typedef struct xfs_param { + xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/ + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is + * not a member of parent dir GID. */ + xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ + xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ + xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ + xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ + xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ + xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ + xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ + xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ + xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ + xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ + xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ + xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ +} xfs_param_t; + +/* + * xfs_error_level: + * + * How much error reporting will be done when internal problems are + * encountered. These problems normally return an EFSCORRUPTED to their + * caller, with no other information reported. + * + * 0 No error reports + * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown + * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any + * additional errors that are known to not cause shutdowns) + * + * xfs_panic_mask bit 0x8 turns the error reports into panics + */ + +enum { + /* XFS_REFCACHE_SIZE = 1 */ + /* XFS_REFCACHE_PURGE = 2 */ + XFS_RESTRICT_CHOWN = 3, + XFS_SGID_INHERIT = 4, + XFS_SYMLINK_MODE = 5, + XFS_PANIC_MASK = 6, + XFS_ERRLEVEL = 7, + XFS_SYNCD_TIMER = 8, + /* XFS_PROBE_DMAPI = 9 */ + /* XFS_PROBE_IOOPS = 10 */ + /* XFS_PROBE_QUOTA = 11 */ + XFS_STATS_CLEAR = 12, + XFS_INHERIT_SYNC = 13, + XFS_INHERIT_NODUMP = 14, + XFS_INHERIT_NOATIME = 15, + XFS_BUF_TIMER = 16, + XFS_BUF_AGE = 17, + /* XFS_IO_BYPASS = 18 */ + XFS_INHERIT_NOSYM = 19, + XFS_ROTORSTEP = 20, +}; + +extern xfs_param_t xfs_params; + +#ifdef CONFIG_SYSCTL +extern void xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +# define xfs_sysctl_register() do { } while (0) +# define xfs_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +#endif /* __XFS_SYSCTL_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h new file mode 100644 index 00000000000..96f96394417 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_version.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Dummy file that can contain a timestamp to put into the + * XFS init string, to help users keep track of what they're + * running + */ + +#ifndef __XFS_VERSION_H__ +#define __XFS_VERSION_H__ + +#define XFS_VERSION_STRING "SGI XFS" + +#endif /* __XFS_VERSION_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c new file mode 100644 index 00000000000..669c6164495 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vfs.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_macros.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_clnt.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_imap.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_quota.h" + +int +vfs_mount( + struct bhv_desc *bdp, + struct xfs_mount_args *args, + struct cred *cr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_mount) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr)); +} + +int +vfs_parseargs( + struct bhv_desc *bdp, + char *s, + struct xfs_mount_args *args, + int f) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_parseargs) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f)); +} + +int +vfs_showargs( + struct bhv_desc *bdp, + struct seq_file *m) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_showargs) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_showargs)(next, m)); +} + +int +vfs_unmount( + struct bhv_desc *bdp, + int fl, + struct cred *cr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_unmount) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr)); +} + +int +vfs_mntupdate( + struct bhv_desc *bdp, + int *fl, + struct xfs_mount_args *args) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_mntupdate) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args)); +} + +int +vfs_root( + struct bhv_desc *bdp, + struct vnode **vpp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_root) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_root)(next, vpp)); +} + +int +vfs_statvfs( + struct bhv_desc *bdp, + xfs_statfs_t *sp, + struct vnode *vp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_statvfs) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp)); +} + +int +vfs_sync( + struct bhv_desc *bdp, + int fl, + struct cred *cr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_sync) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr)); +} + +int +vfs_vget( + struct bhv_desc *bdp, + struct vnode **vpp, + struct fid *fidp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_vget) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp)); +} + +int +vfs_dmapiops( + struct bhv_desc *bdp, + caddr_t addr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_dmapiops) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr)); +} + +int +vfs_quotactl( + struct bhv_desc *bdp, + int cmd, + int id, + caddr_t addr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_quotactl) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr)); +} + +void +vfs_init_vnode( + struct bhv_desc *bdp, + struct vnode *vp, + struct bhv_desc *bp, + int unlock) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_init_vnode) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock)); +} + +void +vfs_force_shutdown( + struct bhv_desc *bdp, + int fl, + char *file, + int line) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_force_shutdown) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line)); +} + +void +vfs_freeze( + struct bhv_desc *bdp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_freeze) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_freeze)(next)); +} + +vfs_t * +vfs_allocate( void ) +{ + struct vfs *vfsp; + + vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP); + bhv_head_init(VFS_BHVHEAD(vfsp), "vfs"); + INIT_LIST_HEAD(&vfsp->vfs_sync_list); + spin_lock_init(&vfsp->vfs_sync_lock); + init_waitqueue_head(&vfsp->vfs_wait_sync_task); + init_waitqueue_head(&vfsp->vfs_wait_single_sync_task); + return vfsp; +} + +void +vfs_deallocate( + struct vfs *vfsp) +{ + bhv_head_destroy(VFS_BHVHEAD(vfsp)); + kmem_free(vfsp, sizeof(vfs_t)); +} + +void +vfs_insertops( + struct vfs *vfsp, + struct bhv_vfsops *vfsops) +{ + struct bhv_desc *bdp; + + bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP); + bhv_desc_init(bdp, NULL, vfsp, vfsops); + bhv_insert(&vfsp->vfs_bh, bdp); +} + +void +vfs_insertbhv( + struct vfs *vfsp, + struct bhv_desc *bdp, + struct vfsops *vfsops, + void *mount) +{ + bhv_desc_init(bdp, mount, vfsp, vfsops); + bhv_insert_initial(&vfsp->vfs_bh, bdp); +} + +void +bhv_remove_vfsops( + struct vfs *vfsp, + int pos) +{ + struct bhv_desc *bhv; + + bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos); + if (!bhv) + return; + bhv_remove(&vfsp->vfs_bh, bhv); + kmem_free(bhv, sizeof(*bhv)); +} + +void +bhv_remove_all_vfsops( + struct vfs *vfsp, + int freebase) +{ + struct xfs_mount *mp; + + bhv_remove_vfsops(vfsp, VFS_POSITION_QM); + bhv_remove_vfsops(vfsp, VFS_POSITION_DM); + if (!freebase) + return; + mp = XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops)); + VFS_REMOVEBHV(vfsp, &mp->m_bhv); + xfs_mount_free(mp, 0); +} + +void +bhv_insert_all_vfsops( + struct vfs *vfsp) +{ + struct xfs_mount *mp; + + mp = xfs_mount_init(); + vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp); + vfs_insertdmapi(vfsp); + vfs_insertquota(vfsp); +} diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h new file mode 100644 index 00000000000..76493991578 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_VFS_H__ +#define __XFS_VFS_H__ + +#include +#include "xfs_fs.h" + +struct fid; +struct vfs; +struct cred; +struct vnode; +struct kstatfs; +struct seq_file; +struct super_block; +struct xfs_mount_args; + +typedef struct kstatfs xfs_statfs_t; + +typedef struct vfs_sync_work { + struct list_head w_list; + struct vfs *w_vfs; + void *w_data; /* syncer routine argument */ + void (*w_syncer)(struct vfs *, void *); +} vfs_sync_work_t; + +typedef struct vfs { + u_int vfs_flag; /* flags */ + xfs_fsid_t vfs_fsid; /* file system ID */ + xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ + bhv_head_t vfs_bh; /* head of vfs behavior chain */ + struct super_block *vfs_super; /* generic superblock pointer */ + struct task_struct *vfs_sync_task; /* generalised sync thread */ + vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */ + struct list_head vfs_sync_list; /* sync thread work item list */ + spinlock_t vfs_sync_lock; /* work item list lock */ + int vfs_sync_seq; /* sync thread generation no. */ + wait_queue_head_t vfs_wait_single_sync_task; + wait_queue_head_t vfs_wait_sync_task; +} vfs_t; + +#define vfs_fbhv vfs_bh.bh_first /* 1st on vfs behavior chain */ + +#define bhvtovfs(bdp) ( (struct vfs *)BHV_VOBJ(bdp) ) +#define bhvtovfsops(bdp) ( (struct vfsops *)BHV_OPS(bdp) ) +#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh ) +#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) ) + +#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ +#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */ +#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ + +typedef enum { + VFS_BHV_UNKNOWN, /* not specified */ + VFS_BHV_XFS, /* xfs */ + VFS_BHV_DM, /* data migration */ + VFS_BHV_QM, /* quota manager */ + VFS_BHV_IO, /* IO path */ + VFS_BHV_END /* housekeeping end-of-range */ +} vfs_bhv_t; + +#define VFS_POSITION_XFS (BHV_POSITION_BASE) +#define VFS_POSITION_DM (VFS_POSITION_BASE+10) +#define VFS_POSITION_QM (VFS_POSITION_BASE+20) +#define VFS_POSITION_IO (VFS_POSITION_BASE+30) + +#define VFS_RDONLY 0x0001 /* read-only vfs */ +#define VFS_GRPID 0x0002 /* group-ID assigned from directory */ +#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ +#define VFS_UMOUNT 0x0008 /* unmount in progress */ +#define VFS_END 0x0008 /* max flag */ + +#define SYNC_ATTR 0x0001 /* sync attributes */ +#define SYNC_CLOSE 0x0002 /* close file system down */ +#define SYNC_DELWRI 0x0004 /* look at delayed writes */ +#define SYNC_WAIT 0x0008 /* wait for i/o to complete */ +#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */ +#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ +#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ +#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ + +typedef int (*vfs_mount_t)(bhv_desc_t *, + struct xfs_mount_args *, struct cred *); +typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *, + struct xfs_mount_args *, int); +typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *); +typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *); +typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *, + struct xfs_mount_args *); +typedef int (*vfs_root_t)(bhv_desc_t *, struct vnode **); +typedef int (*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *); +typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *); +typedef int (*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *); +typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t); +typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t); +typedef void (*vfs_init_vnode_t)(bhv_desc_t *, + struct vnode *, bhv_desc_t *, int); +typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int); +typedef void (*vfs_freeze_t)(bhv_desc_t *); + +typedef struct vfsops { + bhv_position_t vf_position; /* behavior chain position */ + vfs_mount_t vfs_mount; /* mount file system */ + vfs_parseargs_t vfs_parseargs; /* parse mount options */ + vfs_showargs_t vfs_showargs; /* unparse mount options */ + vfs_unmount_t vfs_unmount; /* unmount file system */ + vfs_mntupdate_t vfs_mntupdate; /* update file system options */ + vfs_root_t vfs_root; /* get root vnode */ + vfs_statvfs_t vfs_statvfs; /* file system statistics */ + vfs_sync_t vfs_sync; /* flush files */ + vfs_vget_t vfs_vget; /* get vnode from fid */ + vfs_dmapiops_t vfs_dmapiops; /* data migration */ + vfs_quotactl_t vfs_quotactl; /* disk quota */ + vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */ + vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */ + vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */ +} vfsops_t; + +/* + * VFS's. Operates on vfs structure pointers (starts at bhv head). + */ +#define VHEAD(v) ((v)->vfs_fbhv) +#define VFS_MOUNT(v, ma,cr, rv) ((rv) = vfs_mount(VHEAD(v), ma,cr)) +#define VFS_PARSEARGS(v, o,ma,f, rv) ((rv) = vfs_parseargs(VHEAD(v), o,ma,f)) +#define VFS_SHOWARGS(v, m, rv) ((rv) = vfs_showargs(VHEAD(v), m)) +#define VFS_UNMOUNT(v, f, cr, rv) ((rv) = vfs_unmount(VHEAD(v), f,cr)) +#define VFS_MNTUPDATE(v, fl, args, rv) ((rv) = vfs_mntupdate(VHEAD(v), fl, args)) +#define VFS_ROOT(v, vpp, rv) ((rv) = vfs_root(VHEAD(v), vpp)) +#define VFS_STATVFS(v, sp,vp, rv) ((rv) = vfs_statvfs(VHEAD(v), sp,vp)) +#define VFS_SYNC(v, flag,cr, rv) ((rv) = vfs_sync(VHEAD(v), flag,cr)) +#define VFS_VGET(v, vpp,fidp, rv) ((rv) = vfs_vget(VHEAD(v), vpp,fidp)) +#define VFS_DMAPIOPS(v, p, rv) ((rv) = vfs_dmapiops(VHEAD(v), p)) +#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p)) +#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) ) +#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) ) +#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) ) + +/* + * PVFS's. Operates on behavior descriptor pointers. + */ +#define PVFS_MOUNT(b, ma,cr, rv) ((rv) = vfs_mount(b, ma,cr)) +#define PVFS_PARSEARGS(b, o,ma,f, rv) ((rv) = vfs_parseargs(b, o,ma,f)) +#define PVFS_SHOWARGS(b, m, rv) ((rv) = vfs_showargs(b, m)) +#define PVFS_UNMOUNT(b, f,cr, rv) ((rv) = vfs_unmount(b, f,cr)) +#define PVFS_MNTUPDATE(b, fl, args, rv) ((rv) = vfs_mntupdate(b, fl, args)) +#define PVFS_ROOT(b, vpp, rv) ((rv) = vfs_root(b, vpp)) +#define PVFS_STATVFS(b, sp,vp, rv) ((rv) = vfs_statvfs(b, sp,vp)) +#define PVFS_SYNC(b, flag,cr, rv) ((rv) = vfs_sync(b, flag,cr)) +#define PVFS_VGET(b, vpp,fidp, rv) ((rv) = vfs_vget(b, vpp,fidp)) +#define PVFS_DMAPIOPS(b, p, rv) ((rv) = vfs_dmapiops(b, p)) +#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p)) +#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) ) +#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) ) +#define PVFS_FREEZE(b) ( vfs_freeze(b) ) + +extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *); +extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int); +extern int vfs_showargs(bhv_desc_t *, struct seq_file *); +extern int vfs_unmount(bhv_desc_t *, int, struct cred *); +extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *); +extern int vfs_root(bhv_desc_t *, struct vnode **); +extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *); +extern int vfs_sync(bhv_desc_t *, int, struct cred *); +extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *); +extern int vfs_dmapiops(bhv_desc_t *, caddr_t); +extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t); +extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int); +extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int); +extern void vfs_freeze(bhv_desc_t *); + +typedef struct bhv_vfsops { + struct vfsops bhv_common; + void * bhv_custom; +} bhv_vfsops_t; + +#define vfs_bhv_lookup(v, id) ( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) ) +#define vfs_bhv_custom(b) ( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom ) +#define vfs_bhv_set_custom(b,o) ( (b)->bhv_custom = (void *)(o)) +#define vfs_bhv_clr_custom(b) ( (b)->bhv_custom = NULL ) + +extern vfs_t *vfs_allocate(void); +extern void vfs_deallocate(vfs_t *); +extern void vfs_insertops(vfs_t *, bhv_vfsops_t *); +extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *); + +extern void bhv_insert_all_vfsops(struct vfs *); +extern void bhv_remove_all_vfsops(struct vfs *, int); +extern void bhv_remove_vfsops(struct vfs *, int); + +#define fs_frozen(vfsp) ((vfsp)->vfs_super->s_frozen) +#define fs_check_frozen(vfsp, level) \ + vfs_check_frozen(vfsp->vfs_super, level); + +#endif /* __XFS_VFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c new file mode 100644 index 00000000000..849c61c74f3 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + + +uint64_t vn_generation; /* vnode generation number */ +DEFINE_SPINLOCK(vnumber_lock); + +/* + * Dedicated vnode inactive/reclaim sync semaphores. + * Prime number of hash buckets since address is used as the key. + */ +#define NVSYNC 37 +#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) +sv_t vsync[NVSYNC]; + +/* + * Translate stat(2) file types to vnode types and vice versa. + * Aware of numeric order of S_IFMT and vnode type values. + */ +enum vtype iftovt_tab[] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON +}; + +u_short vttoif_tab[] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK +}; + + +void +vn_init(void) +{ + register sv_t *svp; + register int i; + + for (svp = vsync, i = 0; i < NVSYNC; i++, svp++) + init_sv(svp, SV_DEFAULT, "vsy", i); +} + +/* + * Clean a vnode of filesystem-specific data and prepare it for reuse. + */ +STATIC int +vn_reclaim( + struct vnode *vp) +{ + int error; + + XFS_STATS_INC(vn_reclaim); + vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address); + + /* + * Only make the VOP_RECLAIM call if there are behaviors + * to call. + */ + if (vp->v_fbhv) { + VOP_RECLAIM(vp, error); + if (error) + return -error; + } + ASSERT(vp->v_fbhv == NULL); + + VN_LOCK(vp); + vp->v_flag &= (VRECLM|VWAIT); + VN_UNLOCK(vp, 0); + + vp->v_type = VNON; + vp->v_fbhv = NULL; + +#ifdef XFS_VNODE_TRACE + ktrace_free(vp->v_trace); + vp->v_trace = NULL; +#endif + + return 0; +} + +STATIC void +vn_wakeup( + struct vnode *vp) +{ + VN_LOCK(vp); + if (vp->v_flag & VWAIT) + sv_broadcast(vptosync(vp)); + vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED); + VN_UNLOCK(vp, 0); +} + +int +vn_wait( + struct vnode *vp) +{ + VN_LOCK(vp); + if (vp->v_flag & (VINACT | VRECLM)) { + vp->v_flag |= VWAIT; + sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0); + return 1; + } + VN_UNLOCK(vp, 0); + return 0; +} + +struct vnode * +vn_initialize( + struct inode *inode) +{ + struct vnode *vp = LINVFS_GET_VP(inode); + + XFS_STATS_INC(vn_active); + XFS_STATS_INC(vn_alloc); + + vp->v_flag = VMODIFIED; + spinlock_init(&vp->v_lock, "v_lock"); + + spin_lock(&vnumber_lock); + if (!++vn_generation) /* v_number shouldn't be zero */ + vn_generation++; + vp->v_number = vn_generation; + spin_unlock(&vnumber_lock); + + ASSERT(VN_CACHED(vp) == 0); + + /* Initialize the first behavior and the behavior chain head. */ + vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode"); + +#ifdef XFS_VNODE_TRACE + vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP); + printk("Allocated VNODE_TRACE at 0x%p\n", vp->v_trace); +#endif /* XFS_VNODE_TRACE */ + + vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address); + return vp; +} + +/* + * Get a reference on a vnode. + */ +vnode_t * +vn_get( + struct vnode *vp, + vmap_t *vmap) +{ + struct inode *inode; + + XFS_STATS_INC(vn_get); + inode = LINVFS_GET_IP(vp); + if (inode->i_state & I_FREEING) + return NULL; + + inode = ilookup(vmap->v_vfsp->vfs_super, vmap->v_ino); + if (!inode) /* Inode not present */ + return NULL; + + vn_trace_exit(vp, "vn_get", (inst_t *)__return_address); + + return vp; +} + +/* + * Revalidate the Linux inode from the vattr. + * Note: i_size _not_ updated; we must hold the inode + * semaphore when doing that - callers responsibility. + */ +void +vn_revalidate_core( + struct vnode *vp, + vattr_t *vap) +{ + struct inode *inode = LINVFS_GET_IP(vp); + + inode->i_mode = VTTOIF(vap->va_type) | vap->va_mode; + inode->i_nlink = vap->va_nlink; + inode->i_uid = vap->va_uid; + inode->i_gid = vap->va_gid; + inode->i_blocks = vap->va_nblocks; + inode->i_mtime = vap->va_mtime; + inode->i_ctime = vap->va_ctime; + inode->i_atime = vap->va_atime; + if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (vap->va_xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (vap->va_xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (vap->va_xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Revalidate the Linux inode from the vnode. + */ +int +vn_revalidate( + struct vnode *vp) +{ + vattr_t va; + int error; + + vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address); + ASSERT(vp->v_fbhv != NULL); + + va.va_mask = XFS_AT_STAT|XFS_AT_XFLAGS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (!error) { + vn_revalidate_core(vp, &va); + VUNMODIFY(vp); + } + return -error; +} + +/* + * purge a vnode from the cache + * At this point the vnode is guaranteed to have no references (vn_count == 0) + * The caller has to make sure that there are no ways someone could + * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock). + */ +void +vn_purge( + struct vnode *vp, + vmap_t *vmap) +{ + vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address); + +again: + /* + * Check whether vp has already been reclaimed since our caller + * sampled its version while holding a filesystem cache lock that + * its VOP_RECLAIM function acquires. + */ + VN_LOCK(vp); + if (vp->v_number != vmap->v_number) { + VN_UNLOCK(vp, 0); + return; + } + + /* + * If vp is being reclaimed or inactivated, wait until it is inert, + * then proceed. Can't assume that vnode is actually reclaimed + * just because the reclaimed flag is asserted -- a vn_alloc + * reclaim can fail. + */ + if (vp->v_flag & (VINACT | VRECLM)) { + ASSERT(vn_count(vp) == 0); + vp->v_flag |= VWAIT; + sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0); + goto again; + } + + /* + * Another process could have raced in and gotten this vnode... + */ + if (vn_count(vp) > 0) { + VN_UNLOCK(vp, 0); + return; + } + + XFS_STATS_DEC(vn_active); + vp->v_flag |= VRECLM; + VN_UNLOCK(vp, 0); + + /* + * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells + * vp's filesystem to flush and invalidate all cached resources. + * When vn_reclaim returns, vp should have no private data, + * either in a system cache or attached to v_data. + */ + if (vn_reclaim(vp) != 0) + panic("vn_purge: cannot reclaim"); + + /* + * Wakeup anyone waiting for vp to be reclaimed. + */ + vn_wakeup(vp); +} + +/* + * Add a reference to a referenced vnode. + */ +struct vnode * +vn_hold( + struct vnode *vp) +{ + struct inode *inode; + + XFS_STATS_INC(vn_hold); + + VN_LOCK(vp); + inode = igrab(LINVFS_GET_IP(vp)); + ASSERT(inode); + VN_UNLOCK(vp, 0); + + return vp; +} + +/* + * Call VOP_INACTIVE on last reference. + */ +void +vn_rele( + struct vnode *vp) +{ + int vcnt; + int cache; + + XFS_STATS_INC(vn_rele); + + VN_LOCK(vp); + + vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address); + vcnt = vn_count(vp); + + /* + * Since we always get called from put_inode we know + * that i_count won't be decremented after we + * return. + */ + if (!vcnt) { + /* + * As soon as we turn this on, noone can find us in vn_get + * until we turn off VINACT or VRECLM + */ + vp->v_flag |= VINACT; + VN_UNLOCK(vp, 0); + + /* + * Do not make the VOP_INACTIVE call if there + * are no behaviors attached to the vnode to call. + */ + if (vp->v_fbhv) + VOP_INACTIVE(vp, NULL, cache); + + VN_LOCK(vp); + if (vp->v_flag & VWAIT) + sv_broadcast(vptosync(vp)); + + vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED); + } + + VN_UNLOCK(vp, 0); + + vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address); +} + +/* + * Finish the removal of a vnode. + */ +void +vn_remove( + struct vnode *vp) +{ + vmap_t vmap; + + /* Make sure we don't do this to the same vnode twice */ + if (!(vp->v_fbhv)) + return; + + XFS_STATS_INC(vn_remove); + vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address); + + /* + * After the following purge the vnode + * will no longer exist. + */ + VMAP(vp, vmap); + vn_purge(vp, &vmap); +} + + +#ifdef XFS_VNODE_TRACE + +#define KTRACE_ENTER(vp, vk, s, line, ra) \ + ktrace_enter( (vp)->v_trace, \ +/* 0 */ (void *)(__psint_t)(vk), \ +/* 1 */ (void *)(s), \ +/* 2 */ (void *)(__psint_t) line, \ +/* 3 */ (void *)(vn_count(vp)), \ +/* 4 */ (void *)(ra), \ +/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ +/* 6 */ (void *)(__psint_t)current_cpu(), \ +/* 7 */ (void *)(__psint_t)current_pid(), \ +/* 8 */ (void *)__return_address, \ +/* 9 */ 0, 0, 0, 0, 0, 0, 0) + +/* + * Vnode tracing code. + */ +void +vn_trace_entry(vnode_t *vp, char *func, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra); +} + +void +vn_trace_exit(vnode_t *vp, char *func, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra); +} + +void +vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra); +} + +void +vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra); +} + +void +vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra); +} +#endif /* XFS_VNODE_TRACE */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h new file mode 100644 index 00000000000..da76c1f1e11 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + * Portions Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef __XFS_VNODE_H__ +#define __XFS_VNODE_H__ + +struct uio; +struct file; +struct vattr; +struct xfs_iomap; +struct attrlist_cursor_kern; + +/* + * Vnode types. VNON means no type. + */ +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK }; + +typedef xfs_ino_t vnumber_t; +typedef struct dentry vname_t; +typedef bhv_head_t vn_bhv_head_t; + +/* + * MP locking protocols: + * v_flag, v_vfsp VN_LOCK/VN_UNLOCK + * v_type read-only or fs-dependent + */ +typedef struct vnode { + __u32 v_flag; /* vnode flags (see below) */ + enum vtype v_type; /* vnode type */ + struct vfs *v_vfsp; /* ptr to containing VFS */ + vnumber_t v_number; /* in-core vnode number */ + vn_bhv_head_t v_bh; /* behavior head */ + spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */ + struct inode v_inode; /* Linux inode */ +#ifdef XFS_VNODE_TRACE + struct ktrace *v_trace; /* trace header structure */ +#endif +} vnode_t; + +#define v_fbhv v_bh.bh_first /* first behavior */ +#define v_fops v_bh.bh_first->bd_ops /* first behavior ops */ + +#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ +#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */ +#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ + +typedef enum { + VN_BHV_UNKNOWN, /* not specified */ + VN_BHV_XFS, /* xfs */ + VN_BHV_DM, /* data migration */ + VN_BHV_QM, /* quota manager */ + VN_BHV_IO, /* IO path */ + VN_BHV_END /* housekeeping end-of-range */ +} vn_bhv_t; + +#define VNODE_POSITION_XFS (VNODE_POSITION_BASE) +#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10) +#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20) +#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30) + +/* + * Macros for dealing with the behavior descriptor inside of the vnode. + */ +#define BHV_TO_VNODE(bdp) ((vnode_t *)BHV_VOBJ(bdp)) +#define BHV_TO_VNODE_NULL(bdp) ((vnode_t *)BHV_VOBJNULL(bdp)) + +#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh))) +#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) +#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) +#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops) +#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops) + +/* + * Vnode to Linux inode mapping. + */ +#define LINVFS_GET_VP(inode) ((vnode_t *)list_entry(inode, vnode_t, v_inode)) +#define LINVFS_GET_IP(vp) (&(vp)->v_inode) + +/* + * Convert between vnode types and inode formats (since POSIX.1 + * defines mode word of stat structure in terms of inode formats). + */ +extern enum vtype iftovt_tab[]; +extern u_short vttoif_tab[]; +#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) +#define VTTOIF(indx) (vttoif_tab[(int)(indx)]) +#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + + +/* + * Vnode flags. + */ +#define VINACT 0x1 /* vnode is being inactivated */ +#define VRECLM 0x2 /* vnode is being reclaimed */ +#define VWAIT 0x4 /* waiting for VINACT/VRECLM to end */ +#define VMODIFIED 0x8 /* XFS inode state possibly differs */ + /* to the Linux inode state. */ + +/* + * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter. + */ +typedef enum vrwlock { + VRWLOCK_NONE, + VRWLOCK_READ, + VRWLOCK_WRITE, + VRWLOCK_WRITE_DIRECT, + VRWLOCK_TRY_READ, + VRWLOCK_TRY_WRITE +} vrwlock_t; + +/* + * Return values for VOP_INACTIVE. A return value of + * VN_INACTIVE_NOCACHE implies that the file system behavior + * has disassociated its state and bhv_desc_t from the vnode. + */ +#define VN_INACTIVE_CACHE 0 +#define VN_INACTIVE_NOCACHE 1 + +/* + * Values for the cmd code given to VOP_VNODE_CHANGE. + */ +typedef enum vchange { + VCHANGE_FLAGS_FRLOCKS = 0, + VCHANGE_FLAGS_ENF_LOCKING = 1, + VCHANGE_FLAGS_TRUNCATED = 2, + VCHANGE_FLAGS_PAGE_DIRTY = 3, + VCHANGE_FLAGS_IOEXCL_COUNT = 4 +} vchange_t; + + +typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); +typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, + loff_t *, int, size_t, read_actor_t, + void *, struct cred *); +typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, + int, unsigned int, void __user *); +typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, + struct cred *); +typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int, + struct cred *); +typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *); +typedef int (*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **, + int, vnode_t *, struct cred *); +typedef int (*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *, + vnode_t **, struct cred *); +typedef int (*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *); +typedef int (*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *, + struct cred *); +typedef int (*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *, + struct cred *); +typedef int (*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *, + vnode_t **, struct cred *); +typedef int (*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *); +typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *, + int *); +typedef int (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *, + char *, vnode_t **, struct cred *); +typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int, + struct cred *); +typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *, + xfs_off_t, xfs_off_t); +typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *); +typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *); +typedef int (*vop_release_t)(bhv_desc_t *); +typedef int (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t); +typedef void (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t); +typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, + struct xfs_iomap *, int *); +typedef int (*vop_reclaim_t)(bhv_desc_t *); +typedef int (*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int, + struct cred *); +typedef int (*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int, + struct cred *); +typedef int (*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *); +typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int, + struct attrlist_cursor_kern *, struct cred *); +typedef void (*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int); +typedef void (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t); +typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, + uint64_t, int); +typedef int (*vop_iflush_t)(bhv_desc_t *, int); + + +typedef struct vnodeops { + bhv_position_t vn_position; /* position within behavior chain */ + vop_open_t vop_open; + vop_read_t vop_read; + vop_write_t vop_write; + vop_sendfile_t vop_sendfile; + vop_ioctl_t vop_ioctl; + vop_getattr_t vop_getattr; + vop_setattr_t vop_setattr; + vop_access_t vop_access; + vop_lookup_t vop_lookup; + vop_create_t vop_create; + vop_remove_t vop_remove; + vop_link_t vop_link; + vop_rename_t vop_rename; + vop_mkdir_t vop_mkdir; + vop_rmdir_t vop_rmdir; + vop_readdir_t vop_readdir; + vop_symlink_t vop_symlink; + vop_readlink_t vop_readlink; + vop_fsync_t vop_fsync; + vop_inactive_t vop_inactive; + vop_fid2_t vop_fid2; + vop_rwlock_t vop_rwlock; + vop_rwunlock_t vop_rwunlock; + vop_bmap_t vop_bmap; + vop_reclaim_t vop_reclaim; + vop_attr_get_t vop_attr_get; + vop_attr_set_t vop_attr_set; + vop_attr_remove_t vop_attr_remove; + vop_attr_list_t vop_attr_list; + vop_link_removed_t vop_link_removed; + vop_vnode_change_t vop_vnode_change; + vop_ptossvp_t vop_tosspages; + vop_pflushinvalvp_t vop_flushinval_pages; + vop_pflushvp_t vop_flush_pages; + vop_release_t vop_release; + vop_iflush_t vop_iflush; +} vnodeops_t; + +/* + * VOP's. + */ +#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op) + +#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \ + rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) +#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \ + rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) +#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ + rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) +#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ + rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) +#define VOP_OPEN(vp, cr, rv) \ + rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr) +#define VOP_GETATTR(vp, vap, f, cr, rv) \ + rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr) +#define VOP_SETATTR(vp, vap, f, cr, rv) \ + rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr) +#define VOP_ACCESS(vp, mode, cr, rv) \ + rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr) +#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv) \ + rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr) +#define VOP_CREATE(dvp,d,vap,vpp,cr,rv) \ + rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr) +#define VOP_REMOVE(dvp,d,cr,rv) \ + rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr) +#define VOP_LINK(tdvp,fvp,d,cr,rv) \ + rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr) +#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv) \ + rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr) +#define VOP_MKDIR(dp,d,vap,vpp,cr,rv) \ + rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr) +#define VOP_RMDIR(dp,d,cr,rv) \ + rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr) +#define VOP_READDIR(vp,uiop,cr,eofp,rv) \ + rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp) +#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \ + rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr) +#define VOP_READLINK(vp,uiop,fl,cr,rv) \ + rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr) +#define VOP_FSYNC(vp,f,cr,b,e,rv) \ + rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e) +#define VOP_INACTIVE(vp, cr, rv) \ + rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr) +#define VOP_RELEASE(vp, rv) \ + rv = _VOP_(vop_release, vp)((vp)->v_fbhv) +#define VOP_FID2(vp, fidp, rv) \ + rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp) +#define VOP_RWLOCK(vp,i) \ + (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i) +#define VOP_RWLOCK_TRY(vp,i) \ + _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i) +#define VOP_RWUNLOCK(vp,i) \ + (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i) +#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv) \ + rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr) +#define VOP_RECLAIM(vp, rv) \ + rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv) +#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv) \ + rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred) +#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv) \ + rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred) +#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv) \ + rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred) +#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv) \ + rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred) +#define VOP_LINK_REMOVED(vp, dvp, linkzero) \ + (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero) +#define VOP_VNODE_CHANGE(vp, cmd, val) \ + (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val) +/* + * These are page cache functions that now go thru VOPs. + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_TOSS_PAGES(vp, first, last, fiopt) \ + _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt) +/* + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt) \ + _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt) +/* + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \ + rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt) +#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv) \ + rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg) +#define VOP_IFLUSH(vp, flags, rv) \ + rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags) + +/* + * Flags for read/write calls - same values as IRIX + */ +#define IO_ISAIO 0x00001 /* don't wait for completion */ +#define IO_ISDIRECT 0x00004 /* bypass page cache */ +#define IO_INVIS 0x00020 /* don't update inode timestamps */ + +/* + * Flags for VOP_IFLUSH call + */ +#define FLUSH_SYNC 1 /* wait for flush to complete */ +#define FLUSH_INODE 2 /* flush the inode itself */ +#define FLUSH_LOG 4 /* force the last log entry for + * this inode out to disk */ + +/* + * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and + * VOP_FLUSH_PAGES. + */ +#define FI_NONE 0 /* none */ +#define FI_REMAPF 1 /* Do a remapf prior to the operation */ +#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. + Prevent VM access to the pages until + the operation completes. */ + +/* + * Vnode attributes. va_mask indicates those attributes the caller + * wants to set or extract. + */ +typedef struct vattr { + int va_mask; /* bit-mask of attributes present */ + enum vtype va_type; /* vnode type (for create) */ + mode_t va_mode; /* file access mode and type */ + nlink_t va_nlink; /* number of references to file */ + uid_t va_uid; /* owner user id */ + gid_t va_gid; /* owner group id */ + xfs_ino_t va_nodeid; /* file id */ + xfs_off_t va_size; /* file size in bytes */ + u_long va_blocksize; /* blocksize preferred for i/o */ + struct timespec va_atime; /* time of last access */ + struct timespec va_mtime; /* time of last modification */ + struct timespec va_ctime; /* time file changed */ + u_int va_gen; /* generation number of file */ + xfs_dev_t va_rdev; /* device the special file represents */ + __int64_t va_nblocks; /* number of blocks allocated */ + u_long va_xflags; /* random extended file flags */ + u_long va_extsize; /* file extent size */ + u_long va_nextents; /* number of extents in file */ + u_long va_anextents; /* number of attr extents in file */ + int va_projid; /* project id */ +} vattr_t; + +/* + * setattr or getattr attributes + */ +#define XFS_AT_TYPE 0x00000001 +#define XFS_AT_MODE 0x00000002 +#define XFS_AT_UID 0x00000004 +#define XFS_AT_GID 0x00000008 +#define XFS_AT_FSID 0x00000010 +#define XFS_AT_NODEID 0x00000020 +#define XFS_AT_NLINK 0x00000040 +#define XFS_AT_SIZE 0x00000080 +#define XFS_AT_ATIME 0x00000100 +#define XFS_AT_MTIME 0x00000200 +#define XFS_AT_CTIME 0x00000400 +#define XFS_AT_RDEV 0x00000800 +#define XFS_AT_BLKSIZE 0x00001000 +#define XFS_AT_NBLOCKS 0x00002000 +#define XFS_AT_VCODE 0x00004000 +#define XFS_AT_MAC 0x00008000 +#define XFS_AT_UPDATIME 0x00010000 +#define XFS_AT_UPDMTIME 0x00020000 +#define XFS_AT_UPDCTIME 0x00040000 +#define XFS_AT_ACL 0x00080000 +#define XFS_AT_CAP 0x00100000 +#define XFS_AT_INF 0x00200000 +#define XFS_AT_XFLAGS 0x00400000 +#define XFS_AT_EXTSIZE 0x00800000 +#define XFS_AT_NEXTENTS 0x01000000 +#define XFS_AT_ANEXTENTS 0x02000000 +#define XFS_AT_PROJID 0x04000000 +#define XFS_AT_SIZE_NOPERM 0x08000000 +#define XFS_AT_GENCOUNT 0x10000000 + +#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ + XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ + XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ + XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\ + XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\ + XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT) + +#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ + XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ + XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ + XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID) + +#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME) + +#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME) + +#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\ + XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ + XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) + +/* + * Modes. + */ +#define VSUID S_ISUID /* set user id on execution */ +#define VSGID S_ISGID /* set group id on execution */ +#define VSVTX S_ISVTX /* save swapped text even after use */ +#define VREAD S_IRUSR /* read, write, execute permissions */ +#define VWRITE S_IWUSR +#define VEXEC S_IXUSR + +#define MODEMASK S_IALLUGO /* mode bits plus permission bits */ + +/* + * Check whether mandatory file locking is enabled. + */ +#define MANDLOCK(vp, mode) \ + ((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID) + +extern void vn_init(void); +extern int vn_wait(struct vnode *); +extern vnode_t *vn_initialize(struct inode *); + +/* + * Acquiring and invalidating vnodes: + * + * if (vn_get(vp, version, 0)) + * ...; + * vn_purge(vp, version); + * + * vn_get and vn_purge must be called with vmap_t arguments, sampled + * while a lock that the vnode's VOP_RECLAIM function acquires is + * held, to ensure that the vnode sampled with the lock held isn't + * recycled (VOP_RECLAIMed) or deallocated between the release of the lock + * and the subsequent vn_get or vn_purge. + */ + +/* + * vnode_map structures _must_ match vn_epoch and vnode structure sizes. + */ +typedef struct vnode_map { + vfs_t *v_vfsp; + vnumber_t v_number; /* in-core vnode number */ + xfs_ino_t v_ino; /* inode # */ +} vmap_t; + +#define VMAP(vp, vmap) {(vmap).v_vfsp = (vp)->v_vfsp, \ + (vmap).v_number = (vp)->v_number, \ + (vmap).v_ino = (vp)->v_inode.i_ino; } + +extern void vn_purge(struct vnode *, vmap_t *); +extern vnode_t *vn_get(struct vnode *, vmap_t *); +extern int vn_revalidate(struct vnode *); +extern void vn_revalidate_core(struct vnode *, vattr_t *); +extern void vn_remove(struct vnode *); + +static inline int vn_count(struct vnode *vp) +{ + return atomic_read(&LINVFS_GET_IP(vp)->i_count); +} + +/* + * Vnode reference counting functions (and macros for compatibility). + */ +extern vnode_t *vn_hold(struct vnode *); +extern void vn_rele(struct vnode *); + +#if defined(XFS_VNODE_TRACE) +#define VN_HOLD(vp) \ + ((void)vn_hold(vp), \ + vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address)) +#define VN_RELE(vp) \ + (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \ + iput(LINVFS_GET_IP(vp))) +#else +#define VN_HOLD(vp) ((void)vn_hold(vp)) +#define VN_RELE(vp) (iput(LINVFS_GET_IP(vp))) +#endif + +/* + * Vname handling macros. + */ +#define VNAME(dentry) ((char *) (dentry)->d_name.name) +#define VNAMELEN(dentry) ((dentry)->d_name.len) +#define VNAME_TO_VNODE(dentry) (LINVFS_GET_VP((dentry)->d_inode)) + +/* + * Vnode spinlock manipulation. + */ +#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock) +#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s) +#define VN_FLAGSET(vp,b) vn_flagset(vp,b) +#define VN_FLAGCLR(vp,b) vn_flagclr(vp,b) + +static __inline__ void vn_flagset(struct vnode *vp, uint flag) +{ + spin_lock(&vp->v_lock); + vp->v_flag |= flag; + spin_unlock(&vp->v_lock); +} + +static __inline__ void vn_flagclr(struct vnode *vp, uint flag) +{ + spin_lock(&vp->v_lock); + vp->v_flag &= ~flag; + spin_unlock(&vp->v_lock); +} + +/* + * Update modify/access/change times on the vnode + */ +#define VN_MTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_mtime = *(tvp)) +#define VN_ATIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_atime = *(tvp)) +#define VN_CTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_ctime = *(tvp)) + +/* + * Dealing with bad inodes + */ +static inline void vn_mark_bad(struct vnode *vp) +{ + make_bad_inode(LINVFS_GET_IP(vp)); +} + +static inline int VN_BAD(struct vnode *vp) +{ + return is_bad_inode(LINVFS_GET_IP(vp)); +} + +/* + * Some useful predicates. + */ +#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping) +#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \ + PAGECACHE_TAG_DIRTY) +#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED) +#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED) + +/* + * Flags to VOP_SETATTR/VOP_GETATTR. + */ +#define ATTR_UTIME 0x01 /* non-default utime(2) request */ +#define ATTR_DMI 0x08 /* invocation from a DMI function */ +#define ATTR_LAZY 0x80 /* set/get attributes lazily */ +#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ + +/* + * Flags to VOP_FSYNC and VOP_RECLAIM. + */ +#define FSYNC_NOWAIT 0 /* asynchronous flush */ +#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ +#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ +#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ + +/* + * Tracking vnode activity. + */ +#if defined(XFS_VNODE_TRACE) + +#define VNODE_TRACE_SIZE 16 /* number of trace entries */ +#define VNODE_KTRACE_ENTRY 1 +#define VNODE_KTRACE_EXIT 2 +#define VNODE_KTRACE_HOLD 3 +#define VNODE_KTRACE_REF 4 +#define VNODE_KTRACE_RELE 5 + +extern void vn_trace_entry(struct vnode *, char *, inst_t *); +extern void vn_trace_exit(struct vnode *, char *, inst_t *); +extern void vn_trace_hold(struct vnode *, char *, int, inst_t *); +extern void vn_trace_ref(struct vnode *, char *, int, inst_t *); +extern void vn_trace_rele(struct vnode *, char *, int, inst_t *); + +#define VN_TRACE(vp) \ + vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address) +#else +#define vn_trace_entry(a,b,c) +#define vn_trace_exit(a,b,c) +#define vn_trace_hold(a,b,c,d) +#define vn_trace_ref(a,b,c,d) +#define vn_trace_rele(a,b,c,d) +#define VN_TRACE(vp) +#endif + +#endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c new file mode 100644 index 00000000000..740d20d3318 --- /dev/null +++ b/fs/xfs/quota/xfs_dquot.c @@ -0,0 +1,1648 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_trans_priv.h" + +#include "xfs_qm.h" + + +/* + LOCK ORDER + + inode lock (ilock) + dquot hash-chain lock (hashlock) + xqm dquot freelist lock (freelistlock + mount's dquot list lock (mplistlock) + user dquot lock - lock ordering among dquots is based on the uid or gid + group dquot lock - similar to udquots. Between the two dquots, the udquot + has to be locked first. + pin lock - the dquot lock must be held to take this lock. + flush lock - ditto. +*/ + +STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *); + +#ifdef DEBUG +xfs_buftarg_t *xfs_dqerror_target; +int xfs_do_dqerror; +int xfs_dqreq_num; +int xfs_dqerror_mod = 33; +#endif + +/* + * Allocate and initialize a dquot. We don't always allocate fresh memory; + * we try to reclaim a free dquot if the number of incore dquots are above + * a threshold. + * The only field inside the core that gets initialized at this point + * is the d_id field. The idea is to fill in the entire q_core + * when we read in the on disk dquot. + */ +xfs_dquot_t * +xfs_qm_dqinit( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type) +{ + xfs_dquot_t *dqp; + boolean_t brandnewdquot; + + brandnewdquot = xfs_qm_dqalloc_incore(&dqp); + dqp->dq_flags = type; + INT_SET(dqp->q_core.d_id, ARCH_CONVERT, id); + dqp->q_mount = mp; + + /* + * No need to re-initialize these if this is a reclaimed dquot. + */ + if (brandnewdquot) { + dqp->dq_flnext = dqp->dq_flprev = dqp; + mutex_init(&dqp->q_qlock, MUTEX_DEFAULT, "xdq"); + initnsema(&dqp->q_flock, 1, "fdq"); + sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); + +#ifdef XFS_DQUOT_TRACE + dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); + xfs_dqtrace_entry(dqp, "DQINIT"); +#endif + } else { + /* + * Only the q_core portion was zeroed in dqreclaim_one(). + * So, we need to reset others. + */ + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + dqp->MPL_NEXT = dqp->HL_NEXT = NULL; + dqp->HL_PREVP = dqp->MPL_PREVP = NULL; + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + dqp->q_pincount = 0; + dqp->q_hash = NULL; + ASSERT(dqp->dq_flnext == dqp->dq_flprev); + +#ifdef XFS_DQUOT_TRACE + ASSERT(dqp->q_trace); + xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); +#endif + } + + /* + * log item gets initialized later + */ + return (dqp); +} + +/* + * This is called to free all the memory associated with a dquot + */ +void +xfs_qm_dqdestroy( + xfs_dquot_t *dqp) +{ + ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); + + mutex_destroy(&dqp->q_qlock); + freesema(&dqp->q_flock); + sv_destroy(&dqp->q_pinwait); + +#ifdef XFS_DQUOT_TRACE + if (dqp->q_trace) + ktrace_free(dqp->q_trace); + dqp->q_trace = NULL; +#endif + kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); + atomic_dec(&xfs_Gqm->qm_totaldquots); +} + +/* + * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. + */ +STATIC void +xfs_qm_dqinit_core( + xfs_dqid_t id, + uint type, + xfs_dqblk_t *d) +{ + /* + * Caller has zero'd the entire dquot 'chunk' already. + */ + INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC); + INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION); + INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id); + INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type); +} + + +#ifdef XFS_DQUOT_TRACE +/* + * Dquot tracing for debugging. + */ +/* ARGSUSED */ +void +__xfs_dqtrace_entry( + xfs_dquot_t *dqp, + char *func, + void *retaddr, + xfs_inode_t *ip) +{ + xfs_dquot_t *udqp = NULL; + xfs_ino_t ino = 0; + + ASSERT(dqp->q_trace); + if (ip) { + ino = ip->i_ino; + udqp = ip->i_udquot; + } + ktrace_enter(dqp->q_trace, + (void *)(__psint_t)DQUOT_KTRACE_ENTRY, + (void *)func, + (void *)(__psint_t)dqp->q_nrefs, + (void *)(__psint_t)dqp->dq_flags, + (void *)(__psint_t)dqp->q_res_bcount, + (void *)(__psint_t)INT_GET(dqp->q_core.d_bcount, + ARCH_CONVERT), + (void *)(__psint_t)INT_GET(dqp->q_core.d_icount, + ARCH_CONVERT), + (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_hardlimit, + ARCH_CONVERT), + (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_softlimit, + ARCH_CONVERT), + (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_hardlimit, + ARCH_CONVERT), + (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_softlimit, + ARCH_CONVERT), + (void *)(__psint_t)INT_GET(dqp->q_core.d_id, ARCH_CONVERT), + (void *)(__psint_t)current_pid(), + (void *)(__psint_t)ino, + (void *)(__psint_t)retaddr, + (void *)(__psint_t)udqp); + return; +} +#endif + + +/* + * If default limits are in force, push them into the dquot now. + * We overwrite the dquot limits only if they are zero and this + * is not the root dquot. + */ +void +xfs_qm_adjust_dqlimits( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + xfs_quotainfo_t *q = mp->m_quotainfo; + + ASSERT(d->d_id); + + if (q->qi_bsoftlimit && !d->d_blk_softlimit) + INT_SET(d->d_blk_softlimit, ARCH_CONVERT, q->qi_bsoftlimit); + if (q->qi_bhardlimit && !d->d_blk_hardlimit) + INT_SET(d->d_blk_hardlimit, ARCH_CONVERT, q->qi_bhardlimit); + if (q->qi_isoftlimit && !d->d_ino_softlimit) + INT_SET(d->d_ino_softlimit, ARCH_CONVERT, q->qi_isoftlimit); + if (q->qi_ihardlimit && !d->d_ino_hardlimit) + INT_SET(d->d_ino_hardlimit, ARCH_CONVERT, q->qi_ihardlimit); + if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) + INT_SET(d->d_rtb_softlimit, ARCH_CONVERT, q->qi_rtbsoftlimit); + if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) + INT_SET(d->d_rtb_hardlimit, ARCH_CONVERT, q->qi_rtbhardlimit); +} + +/* + * Check the limits and timers of a dquot and start or reset timers + * if necessary. + * This gets called even when quota enforcement is OFF, which makes our + * life a little less complicated. (We just don't reject any quota + * reservations in that case, when enforcement is off). + * We also return 0 as the values of the timers in Q_GETQUOTA calls, when + * enforcement's off. + * In contrast, warnings are a little different in that they don't + * 'automatically' get started when limits get exceeded. + */ +void +xfs_qm_adjust_dqtimers( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + ASSERT(d->d_id); + +#ifdef QUOTADEBUG + if (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)) + ASSERT(INT_GET(d->d_blk_softlimit, ARCH_CONVERT) <= + INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)); + if (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)) + ASSERT(INT_GET(d->d_ino_softlimit, ARCH_CONVERT) <= + INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)); + if (INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)) + ASSERT(INT_GET(d->d_rtb_softlimit, ARCH_CONVERT) <= + INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)); +#endif + if (!d->d_btimer) { + if ((INT_GET(d->d_blk_softlimit, ARCH_CONVERT) && + (INT_GET(d->d_bcount, ARCH_CONVERT) >= + INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) || + (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT) && + (INT_GET(d->d_bcount, ARCH_CONVERT) >= + INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) { + INT_SET(d->d_btimer, ARCH_CONVERT, + get_seconds() + XFS_QI_BTIMELIMIT(mp)); + } + } else { + if ((!d->d_blk_softlimit || + (INT_GET(d->d_bcount, ARCH_CONVERT) < + INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) && + (!d->d_blk_hardlimit || + (INT_GET(d->d_bcount, ARCH_CONVERT) < + INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) { + d->d_btimer = 0; + } + } + + if (!d->d_itimer) { + if ((INT_GET(d->d_ino_softlimit, ARCH_CONVERT) && + (INT_GET(d->d_icount, ARCH_CONVERT) >= + INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) || + (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT) && + (INT_GET(d->d_icount, ARCH_CONVERT) >= + INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) { + INT_SET(d->d_itimer, ARCH_CONVERT, + get_seconds() + XFS_QI_ITIMELIMIT(mp)); + } + } else { + if ((!d->d_ino_softlimit || + (INT_GET(d->d_icount, ARCH_CONVERT) < + INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) && + (!d->d_ino_hardlimit || + (INT_GET(d->d_icount, ARCH_CONVERT) < + INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) { + d->d_itimer = 0; + } + } + + if (!d->d_rtbtimer) { + if ((INT_GET(d->d_rtb_softlimit, ARCH_CONVERT) && + (INT_GET(d->d_rtbcount, ARCH_CONVERT) >= + INT_GET(d->d_rtb_softlimit, ARCH_CONVERT))) || + (INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT) && + (INT_GET(d->d_rtbcount, ARCH_CONVERT) >= + INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) { + INT_SET(d->d_rtbtimer, ARCH_CONVERT, + get_seconds() + XFS_QI_RTBTIMELIMIT(mp)); + } + } else { + if ((!d->d_rtb_softlimit || + (INT_GET(d->d_rtbcount, ARCH_CONVERT) < + INT_GET(d->d_rtb_softlimit, ARCH_CONVERT))) && + (!d->d_rtb_hardlimit || + (INT_GET(d->d_rtbcount, ARCH_CONVERT) < + INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) { + d->d_rtbtimer = 0; + } + } +} + +/* + * Increment or reset warnings of a given dquot. + */ +int +xfs_qm_dqwarn( + xfs_disk_dquot_t *d, + uint flags) +{ + int warned; + + /* + * root's limits are not real limits. + */ + if (!d->d_id) + return (0); + + warned = 0; + if (INT_GET(d->d_blk_softlimit, ARCH_CONVERT) && + (INT_GET(d->d_bcount, ARCH_CONVERT) >= + INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) { + if (flags & XFS_QMOPT_DOWARN) { + INT_MOD(d->d_bwarns, ARCH_CONVERT, +1); + warned++; + } + } else { + if (!d->d_blk_softlimit || + (INT_GET(d->d_bcount, ARCH_CONVERT) < + INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) { + d->d_bwarns = 0; + } + } + + if (INT_GET(d->d_ino_softlimit, ARCH_CONVERT) > 0 && + (INT_GET(d->d_icount, ARCH_CONVERT) >= + INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) { + if (flags & XFS_QMOPT_DOWARN) { + INT_MOD(d->d_iwarns, ARCH_CONVERT, +1); + warned++; + } + } else { + if (!d->d_ino_softlimit || + (INT_GET(d->d_icount, ARCH_CONVERT) < + INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) { + d->d_iwarns = 0; + } + } +#ifdef QUOTADEBUG + if (INT_GET(d->d_iwarns, ARCH_CONVERT)) + cmn_err(CE_DEBUG, + "--------@@Inode warnings running : %Lu >= %Lu", + INT_GET(d->d_icount, ARCH_CONVERT), + INT_GET(d->d_ino_softlimit, ARCH_CONVERT)); + if (INT_GET(d->d_bwarns, ARCH_CONVERT)) + cmn_err(CE_DEBUG, + "--------@@Blks warnings running : %Lu >= %Lu", + INT_GET(d->d_bcount, ARCH_CONVERT), + INT_GET(d->d_blk_softlimit, ARCH_CONVERT)); +#endif + return (warned); +} + + +/* + * initialize a buffer full of dquots and log the whole thing + */ +STATIC void +xfs_qm_init_dquot_blk( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_buf_t *bp) +{ + xfs_dqblk_t *d; + int curid, i; + + ASSERT(tp); + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); + + /* + * ID of the first dquot in the block - id's are zero based. + */ + curid = id - (id % XFS_QM_DQPERBLK(mp)); + ASSERT(curid >= 0); + memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); + for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) + xfs_qm_dqinit_core(curid, type, d); + xfs_trans_dquot_buf(tp, bp, + type & XFS_DQ_USER ? + XFS_BLI_UDQUOT_BUF : + XFS_BLI_GDQUOT_BUF); + xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); +} + + + +/* + * Allocate a block and fill it with dquots. + * This is called when the bmapi finds a hole. + */ +STATIC int +xfs_qm_dqalloc( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + xfs_inode_t *quotip, + xfs_fileoff_t offset_fsb, + xfs_buf_t **O_bpp) +{ + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + xfs_bmbt_irec_t map; + int nmaps, error, committed; + xfs_buf_t *bp; + + ASSERT(tp != NULL); + xfs_dqtrace_entry(dqp, "DQALLOC"); + + /* + * Initialize the bmap freelist prior to calling bmapi code. + */ + XFS_BMAP_INIT(&flist, &firstblock); + xfs_ilock(quotip, XFS_ILOCK_EXCL); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return (ESRCH); + } + + /* + * xfs_trans_commit normally decrements the vnode ref count + * when it unlocks the inode. Since we want to keep the quota + * inode around, we bump the vnode ref count now. + */ + VN_HOLD(XFS_ITOV(quotip)); + + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + nmaps = 1; + if ((error = xfs_bmapi(tp, quotip, + offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, + XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, + &firstblock, + XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist))) { + goto error0; + } + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + /* + * Keep track of the blkno to save a lookup later + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + /* now we can just get the buffer (there's nothing to read yet) */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + dqp->q_blkno, + XFS_QI_DQCHUNKLEN(mp), + 0); + if (!bp || (error = XFS_BUF_GETERROR(bp))) + goto error1; + /* + * Make a chunk of dquots out of this buffer and log + * the entire thing. + */ + xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT), + dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP), + bp); + + if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) { + goto error1; + } + + *O_bpp = bp; + return 0; + + error1: + xfs_bmap_cancel(&flist); + error0: + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + + return (error); +} + +/* + * Maps a dquot to the buffer containing its on-disk version. + * This returns a ptr to the buffer containing the on-disk dquot + * in the bpp param, and a ptr to the on-disk dquot within that buffer + */ +STATIC int +xfs_qm_dqtobp( + xfs_trans_t *tp, + xfs_dquot_t *dqp, + xfs_disk_dquot_t **O_ddpp, + xfs_buf_t **O_bpp, + uint flags) +{ + xfs_bmbt_irec_t map; + int nmaps, error; + xfs_buf_t *bp; + xfs_inode_t *quotip; + xfs_mount_t *mp; + xfs_disk_dquot_t *ddq; + xfs_dqid_t id; + boolean_t newdquot; + + mp = dqp->q_mount; + id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT); + nmaps = 1; + newdquot = B_FALSE; + + /* + * If we don't know where the dquot lives, find out. + */ + if (dqp->q_blkno == (xfs_daddr_t) 0) { + /* We use the id as an index */ + dqp->q_fileoffset = (xfs_fileoff_t) ((uint)id / + XFS_QM_DQPERBLK(mp)); + nmaps = 1; + quotip = XFS_DQ_TO_QIP(dqp); + xfs_ilock(quotip, XFS_ILOCK_SHARED); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + return (ESRCH); + } + /* + * Find the block map; no allocations yet + */ + error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, + XFS_BMAPI_METADATA, + NULL, 0, &map, &nmaps, NULL); + + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + if (error) + return (error); + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * + sizeof(xfs_dqblk_t); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return (ENOENT); + + ASSERT(tp); + if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip, + dqp->q_fileoffset, &bp))) + return (error); + newdquot = B_TRUE; + } else { + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + } + } + ASSERT(dqp->q_blkno != DELAYSTARTBLOCK); + ASSERT(dqp->q_blkno != HOLESTARTBLOCK); + + /* + * Read in the buffer, unless we've just done the allocation + * (in which case we already have the buf). + */ + if (! newdquot) { + xfs_dqtrace_entry(dqp, "DQTOBP READBUF"); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + XFS_QI_DQCHUNKLEN(mp), + 0, &bp))) { + return (error); + } + if (error || !bp) + return XFS_ERROR(error); + } + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + /* + * calculate the location of the dquot inside the buffer. + */ + ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); + + /* + * A simple sanity check in case we got a corrupted dquot... + */ + if (xfs_qm_dqcheck(ddq, id, + dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP), + flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), + "dqtobp")) { + if (!(flags & XFS_QMOPT_DQREPAIR)) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EIO); + } + XFS_BUF_BUSY(bp); /* We dirtied this */ + } + + *O_bpp = bp; + *O_ddpp = ddq; + + return (0); +} + + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. + * + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqread( + xfs_trans_t *tp, + xfs_dqid_t id, + xfs_dquot_t *dqp, /* dquot to get filled in */ + uint flags) +{ + xfs_disk_dquot_t *ddqp; + xfs_buf_t *bp; + int error; + + /* + * get a pointer to the on-disk dquot and the buffer containing it + * dqp already knows its own type (GROUP/USER). + */ + xfs_dqtrace_entry(dqp, "DQREAD"); + if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) { + return (error); + } + + /* copy everything from disk dquot to the incore dquot */ + memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id); + xfs_qm_dquot_logitem_init(dqp); + + /* + * Reservation counters are defined as reservation plus current usage + * to avoid having to add everytime. + */ + dqp->q_res_bcount = INT_GET(ddqp->d_bcount, ARCH_CONVERT); + dqp->q_res_icount = INT_GET(ddqp->d_icount, ARCH_CONVERT); + dqp->q_res_rtbcount = INT_GET(ddqp->d_rtbcount, ARCH_CONVERT); + + /* Mark the buf so that this will stay incore a little longer */ + XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); + + /* + * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) + * So we need to release with xfs_trans_brelse(). + * The strategy here is identical to that of inodes; we lock + * the dquot in xfs_qm_dqget() before making it accessible to + * others. This is because dquots, like inodes, need a good level of + * concurrency, and we don't want to take locks on the entire buffers + * for dquot accesses. + * Note also that the dquot buffer may even be dirty at this point, if + * this particular dquot was repaired. We still aren't afraid to + * brelse it because we have the changes incore. + */ + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + xfs_trans_brelse(tp, bp); + + return (error); +} + + +/* + * allocate an incore dquot from the kernel heap, + * and fill its core with quota information kept on disk. + * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk + * if it wasn't already allocated. + */ +STATIC int +xfs_qm_idtodq( + xfs_mount_t *mp, + xfs_dqid_t id, /* gid or uid, depending on type */ + uint type, /* UDQUOT or GDQUOT */ + uint flags, /* DQALLOC, DQREPAIR */ + xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ +{ + xfs_dquot_t *dqp; + int error; + xfs_trans_t *tp; + int cancelflags=0; + + dqp = xfs_qm_dqinit(mp, id, type); + tp = NULL; + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + if ((error = xfs_trans_reserve(tp, + XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT))) { + cancelflags = 0; + goto error0; + } + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + + /* + * Read it from disk; xfs_dqread() takes care of + * all the necessary initialization of dquot's fields (locks, etc) + */ + if ((error = xfs_qm_dqread(tp, id, dqp, flags))) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + xfs_dqtrace_entry(dqp, "DQREAD FAIL"); + cancelflags |= XFS_TRANS_ABORT; + goto error0; + } + if (tp) { + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, + NULL))) + goto error1; + } + + *O_dqpp = dqp; + return (0); + + error0: + ASSERT(error); + if (tp) + xfs_trans_cancel(tp, cancelflags); + error1: + xfs_qm_dqdestroy(dqp); + *O_dqpp = NULL; + return (error); +} + +/* + * Lookup a dquot in the incore dquot hashtable. We keep two separate + * hashtables for user and group dquots; and, these are global tables + * inside the XQM, not per-filesystem tables. + * The hash chain must be locked by caller, and it is left locked + * on return. Returning dquot is locked. + */ +STATIC int +xfs_qm_dqlookup( + xfs_mount_t *mp, + xfs_dqid_t id, + xfs_dqhash_t *qh, + xfs_dquot_t **O_dqpp) +{ + xfs_dquot_t *dqp; + uint flist_locked; + xfs_dquot_t *d; + + ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + + flist_locked = B_FALSE; + + /* + * Traverse the hashchain looking for a match + */ + for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { + /* + * We already have the hashlock. We don't need the + * dqlock to look at the id field of the dquot, since the + * id can't be modified without the hashlock anyway. + */ + if (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id && dqp->q_mount == mp) { + xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP"); + /* + * All in core dquots must be on the dqlist of mp + */ + ASSERT(dqp->MPL_PREVP != NULL); + + xfs_dqlock(dqp); + if (dqp->q_nrefs == 0) { + ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); + if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT"); + + /* + * We may have raced with dqreclaim_one() + * (and lost). So, flag that we don't + * want the dquot to be reclaimed. + */ + dqp->dq_flags |= XFS_DQ_WANT; + xfs_dqunlock(dqp); + xfs_qm_freelist_lock(xfs_Gqm); + xfs_dqlock(dqp); + dqp->dq_flags &= ~(XFS_DQ_WANT); + } + flist_locked = B_TRUE; + } + + /* + * id couldn't have changed; we had the hashlock all + * along + */ + ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id); + + if (flist_locked) { + if (dqp->q_nrefs != 0) { + xfs_qm_freelist_unlock(xfs_Gqm); + flist_locked = B_FALSE; + } else { + /* + * take it off the freelist + */ + xfs_dqtrace_entry(dqp, + "DQLOOKUP: TAKEOFF FL"); + XQM_FREELIST_REMOVE(dqp); + /* xfs_qm_freelist_print(&(xfs_Gqm-> + qm_dqfreelist), + "after removal"); */ + } + } + + /* + * grab a reference + */ + XFS_DQHOLD(dqp); + + if (flist_locked) + xfs_qm_freelist_unlock(xfs_Gqm); + /* + * move the dquot to the front of the hashchain + */ + ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + if (dqp->HL_PREVP != &qh->qh_next) { + xfs_dqtrace_entry(dqp, + "DQLOOKUP: HASH MOVETOFRONT"); + if ((d = dqp->HL_NEXT)) + d->HL_PREVP = dqp->HL_PREVP; + *(dqp->HL_PREVP) = d; + d = qh->qh_next; + d->HL_PREVP = &dqp->HL_NEXT; + dqp->HL_NEXT = d; + dqp->HL_PREVP = &qh->qh_next; + qh->qh_next = dqp; + } + xfs_dqtrace_entry(dqp, "LOOKUP END"); + *O_dqpp = dqp; + ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + return (0); + } + } + + *O_dqpp = NULL; + ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + return (1); +} + +/* + * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a + * a locked dquot, doing an allocation (if requested) as needed. + * When both an inode and an id are given, the inode's id takes precedence. + * That is, if the id changes while we don't hold the ilock inside this + * function, the new dquot is returned, not necessarily the one requested + * in the id argument. + */ +int +xfs_qm_dqget( + xfs_mount_t *mp, + xfs_inode_t *ip, /* locked inode (optional) */ + xfs_dqid_t id, /* gid or uid, depending on type */ + uint type, /* UDQUOT or GDQUOT */ + uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ + xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ +{ + xfs_dquot_t *dqp; + xfs_dqhash_t *h; + uint version; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { + return (ESRCH); + } + h = XFS_DQ_HASH(mp, id, type); + +#ifdef DEBUG + if (xfs_do_dqerror) { + if ((xfs_dqerror_target == mp->m_ddev_targp) && + (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { + cmn_err(CE_DEBUG, "Returning error in dqget"); + return (EIO); + } + } +#endif + + again: + +#ifdef DEBUG + ASSERT(type == XFS_DQ_USER || type == XFS_DQ_GROUP); + if (ip) { + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + if (type == XFS_DQ_USER) + ASSERT(ip->i_udquot == NULL); + else + ASSERT(ip->i_gdquot == NULL); + } +#endif + XFS_DQ_HASH_LOCK(h); + + /* + * Look in the cache (hashtable). + * The chain is kept locked during lookup. + */ + if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { + XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); + /* + * The dquot was found, moved to the front of the chain, + * taken off the freelist if it was on it, and locked + * at this point. Just unlock the hashchain and return. + */ + ASSERT(*O_dqpp); + ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); + XFS_DQ_HASH_UNLOCK(h); + xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); + return (0); /* success */ + } + XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + + /* + * Dquot cache miss. We don't want to keep the inode lock across + * a (potential) disk read. Also we don't want to deal with the lock + * ordering between quotainode and this inode. OTOH, dropping the inode + * lock here means dealing with a chown that can happen before + * we re-acquire the lock. + */ + if (ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * Save the hashchain version stamp, and unlock the chain, so that + * we don't keep the lock across a disk read + */ + version = h->qh_version; + XFS_DQ_HASH_UNLOCK(h); + + /* + * Allocate the dquot on the kernel heap, and read the ondisk + * portion off the disk. Also, do all the necessary initialization + * This can return ENOENT if dquot didn't exist on disk and we didn't + * ask it to allocate; ESRCH if quotas got turned off suddenly. + */ + if ((error = xfs_qm_idtodq(mp, id, type, + flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| + XFS_QMOPT_DOWARN), + &dqp))) { + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + return (error); + } + + /* + * See if this is mount code calling to look at the overall quota limits + * which are stored in the id == 0 user or group's dquot. + * Since we may not have done a quotacheck by this point, just return + * the dquot without attaching it to any hashtables, lists, etc, or even + * taking a reference. + * The caller must dqdestroy this once done. + */ + if (flags & XFS_QMOPT_DQSUSER) { + ASSERT(id == 0); + ASSERT(! ip); + goto dqret; + } + + /* + * Dquot lock comes after hashlock in the lock ordering + */ + if (ip) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (! XFS_IS_DQTYPE_ON(mp, type)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } + /* + * A dquot could be attached to this inode by now, since + * we had dropped the ilock. + */ + if (type == XFS_DQ_USER) { + if (ip->i_udquot) { + xfs_qm_dqdestroy(dqp); + dqp = ip->i_udquot; + xfs_dqlock(dqp); + goto dqret; + } + } else { + if (ip->i_gdquot) { + xfs_qm_dqdestroy(dqp); + dqp = ip->i_gdquot; + xfs_dqlock(dqp); + goto dqret; + } + } + } + + /* + * Hashlock comes after ilock in lock order + */ + XFS_DQ_HASH_LOCK(h); + if (version != h->qh_version) { + xfs_dquot_t *tmpdqp; + /* + * Now, see if somebody else put the dquot in the + * hashtable before us. This can happen because we didn't + * keep the hashchain lock. We don't have to worry about + * lock order between the two dquots here since dqp isn't + * on any findable lists yet. + */ + if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { + /* + * Duplicate found. Just throw away the new dquot + * and start over. + */ + xfs_qm_dqput(tmpdqp); + XFS_DQ_HASH_UNLOCK(h); + xfs_qm_dqdestroy(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); + goto again; + } + } + + /* + * Put the dquot at the beginning of the hash-chain and mp's list + * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. + */ + ASSERT(XFS_DQ_IS_HASH_LOCKED(h)); + dqp->q_hash = h; + XQM_HASHLIST_INSERT(h, dqp); + + /* + * Attach this dquot to this filesystem's list of all dquots, + * kept inside the mount structure in m_quotainfo field + */ + xfs_qm_mplist_lock(mp); + + /* + * We return a locked dquot to the caller, with a reference taken + */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); + + xfs_qm_mplist_unlock(mp); + XFS_DQ_HASH_UNLOCK(h); + dqret: + ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip)); + xfs_dqtrace_entry(dqp, "DQGET DONE"); + *O_dqpp = dqp; + return (0); +} + + +/* + * Release a reference to the dquot (decrement ref-count) + * and unlock it. If there is a group quota attached to this + * dquot, carefully release that too without tripping over + * deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + xfs_dquot_t *dqp) +{ + xfs_dquot_t *gdqp; + + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + xfs_dqtrace_entry(dqp, "DQPUT"); + + if (dqp->q_nrefs != 1) { + dqp->q_nrefs--; + xfs_dqunlock(dqp); + return; + } + + /* + * drop the dqlock and acquire the freelist and dqlock + * in the right order; but try to get it out-of-order first + */ + if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT"); + xfs_dqunlock(dqp); + xfs_qm_freelist_lock(xfs_Gqm); + xfs_dqlock(dqp); + } + + while (1) { + gdqp = NULL; + + /* We can't depend on nrefs being == 1 here */ + if (--dqp->q_nrefs == 0) { + xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST"); + /* + * insert at end of the freelist. + */ + XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp); + + /* + * If we just added a udquot to the freelist, then + * we want to release the gdquot reference that + * it (probably) has. Otherwise it'll keep the + * gdquot from getting reclaimed. + */ + if ((gdqp = dqp->q_gdquot)) { + /* + * Avoid a recursive dqput call + */ + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } + + /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist), + "@@@@@++ Free list (after append) @@@@@+"); + */ + } + xfs_dqunlock(dqp); + + /* + * If we had a group quota inside the user quota as a hint, + * release it now. + */ + if (! gdqp) + break; + dqp = gdqp; + } + xfs_qm_freelist_unlock(xfs_Gqm); +} + +/* + * Release a dquot. Flush it if dirty, then dqput() it. + * dquot must not be locked. + */ +void +xfs_qm_dqrele( + xfs_dquot_t *dqp) +{ + ASSERT(dqp); + xfs_dqtrace_entry(dqp, "DQRELE"); + + xfs_dqlock(dqp); + /* + * We don't care to flush it if the dquot is dirty here. + * That will create stutters that we want to avoid. + * Instead we do a delayed write when we try to reclaim + * a dirty dquot. Also xfs_sync will take part of the burden... + */ + xfs_qm_dqput(dqp); +} + + +/* + * Write a modified dquot to disk. + * The dquot must be locked and the flush lock too taken by caller. + * The flush lock will not be unlocked until the dquot reaches the disk, + * but the dquot is free to be unlocked and modified by the caller + * in the interim. Dquot is still locked on return. This behavior is + * identical to that of inodes. + */ +int +xfs_qm_dqflush( + xfs_dquot_t *dqp, + uint flags) +{ + xfs_mount_t *mp; + xfs_buf_t *bp; + xfs_disk_dquot_t *ddqp; + int error; + SPLDECL(s); + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); + xfs_dqtrace_entry(dqp, "DQFLUSH"); + + /* + * If not dirty, nada. + */ + if (!XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqfunlock(dqp); + return (0); + } + + /* + * Cant flush a pinned dquot. Wait for it. + */ + xfs_qm_dqunpin_wait(dqp); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this dquot + * to disk, because the log record didn't make it to disk! + */ + if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { + dqp->dq_flags &= ~(XFS_DQ_DIRTY); + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); + } + + /* + * Get the buffer containing the on-disk dquot + * We don't need a transaction envelope because we know that the + * the ondisk-dquot has already been allocated for. + */ + if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { + xfs_dqtrace_entry(dqp, "DQTOBP FAIL"); + ASSERT(error != ENOENT); + /* + * Quotas could have gotten turned off (ESRCH) + */ + xfs_dqfunlock(dqp); + return (error); + } + + if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT), 0, XFS_QMOPT_DOWARN, + "dqflush (incore copy)")) { + xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE); + return XFS_ERROR(EIO); + } + + /* This is the only portion of data that needs to persist */ + memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); + + /* + * Clear the dirty field and remember the flush lsn for later use. + */ + dqp->dq_flags &= ~(XFS_DQ_DIRTY); + mp = dqp->q_mount; + + /* lsn is 64 bits */ + AIL_LOCK(mp, s); + dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn; + AIL_UNLOCK(mp, s); + + /* + * Attach an iodone routine so that we can remove this dquot from the + * AIL and release the flush lock once the dquot is synced to disk. + */ + xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) + xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)) { + xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE"); + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + } + + if (flags & XFS_QMOPT_DELWRI) { + xfs_bdwrite(mp, bp); + } else if (flags & XFS_QMOPT_ASYNC) { + xfs_bawrite(mp, bp); + } else { + error = xfs_bwrite(mp, bp); + } + xfs_dqtrace_entry(dqp, "DQFLUSH END"); + /* + * dqp is still locked, but caller is free to unlock it now. + */ + return (error); + +} + +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +/*ARGSUSED*/ +STATIC void +xfs_qm_dqflush_done( + xfs_buf_t *bp, + xfs_dq_logitem_t *qip) +{ + xfs_dquot_t *dqp; + SPLDECL(s); + + dqp = qip->qli_dquot; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && + qip->qli_item.li_lsn == qip->qli_flush_lsn) { + + AIL_LOCK(dqp->q_mount, s); + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + if (qip->qli_item.li_lsn == qip->qli_flush_lsn) + xfs_trans_delete_ail(dqp->q_mount, + (xfs_log_item_t*)qip, s); + else + AIL_UNLOCK(dqp->q_mount, s); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} + + +int +xfs_qm_dqflock_nowait( + xfs_dquot_t *dqp) +{ + int locked; + + locked = cpsema(&((dqp)->q_flock)); + + /* XXX ifdef these out */ + if (locked) + (dqp)->dq_flags |= XFS_DQ_FLOCKED; + return (locked); +} + + +int +xfs_qm_dqlock_nowait( + xfs_dquot_t *dqp) +{ + return (mutex_trylock(&((dqp)->q_qlock))); +} + +void +xfs_dqlock( + xfs_dquot_t *dqp) +{ + mutex_lock(&(dqp->q_qlock), PINOD); +} + +void +xfs_dqunlock( + xfs_dquot_t *dqp) +{ + mutex_unlock(&(dqp->q_qlock)); + if (dqp->q_logitem.qli_dquot == dqp) { + /* Once was dqp->q_mount, but might just have been cleared */ + xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, + (xfs_log_item_t*)&(dqp->q_logitem)); + } +} + + +void +xfs_dqunlock_nonotify( + xfs_dquot_t *dqp) +{ + mutex_unlock(&(dqp->q_qlock)); +} + +void +xfs_dqlock2( + xfs_dquot_t *d1, + xfs_dquot_t *d2) +{ + if (d1 && d2) { + ASSERT(d1 != d2); + if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > INT_GET(d2->q_core.d_id, ARCH_CONVERT)) { + xfs_dqlock(d2); + xfs_dqlock(d1); + } else { + xfs_dqlock(d1); + xfs_dqlock(d2); + } + } else { + if (d1) { + xfs_dqlock(d1); + } else if (d2) { + xfs_dqlock(d2); + } + } +} + + +/* + * Take a dquot out of the mount's dqlist as well as the hashlist. + * This is called via unmount as well as quotaoff, and the purge + * will always succeed unless there are soft (temp) references + * outstanding. + * + * This returns 0 if it was purged, 1 if it wasn't. It's not an error code + * that we're returning! XXXsup - not cool. + */ +/* ARGSUSED */ +int +xfs_qm_dqpurge( + xfs_dquot_t *dqp, + uint flags) +{ + xfs_dqhash_t *thishash; + xfs_mount_t *mp; + + mp = dqp->q_mount; + + ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); + + xfs_dqlock(dqp); + /* + * We really can't afford to purge a dquot that is + * referenced, because these are hard refs. + * It shouldn't happen in general because we went thru _all_ inodes in + * dqrele_all_inodes before calling this and didn't let the mountlock go. + * However it is possible that we have dquots with temporary + * references that are not attached to an inode. e.g. see xfs_setattr(). + */ + if (dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + XFS_DQ_HASH_UNLOCK(dqp->q_hash); + return (1); + } + + ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + + /* + * If we're turning off quotas, we have to make sure that, for + * example, we don't delete quota disk blocks while dquots are + * in the process of getting written to those disk blocks. + * This dquot might well be on AIL, and we can't leave it there + * if we're turning off quotas. Basically, we need this flush + * lock, and are willing to block on it. + */ + if (! xfs_qm_dqflock_nowait(dqp)) { + /* + * Block on the flush lock after nudging dquot buffer, + * if it is incore. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + + /* + * XXXIf we're turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); + /* dqflush unlocks dqflock */ + /* + * Given that dqpurge is a very rare occurrence, it is OK + * that we're holding the hashlist and mplist locks + * across the disk write. But, ... XXXsup + * + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); + xfs_dqflock(dqp); + } + ASSERT(dqp->q_pincount == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + thishash = dqp->q_hash; + XQM_HASHLIST_REMOVE(thishash, dqp); + XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); + /* + * XXX Move this to the front of the freelist, if we can get the + * freelist lock. + */ + ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + + dqp->q_mount = NULL; + dqp->q_hash = NULL; + dqp->dq_flags = XFS_DQ_INACTIVE; + memset(&dqp->q_core, 0, sizeof(dqp->q_core)); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + XFS_DQ_HASH_UNLOCK(thishash); + return (0); +} + + +#ifdef QUOTADEBUG +void +xfs_qm_dqprint(xfs_dquot_t *dqp) +{ + cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); + cmn_err(CE_DEBUG, "---- dquotID = %d", + (int)INT_GET(dqp->q_core.d_id, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- type = %s", + XFS_QM_ISUDQ(dqp) ? "USR" : "GRP"); + cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); + cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); + cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); + cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", + INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT), + (int) INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", + INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT), + (int)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", + INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT), + (int)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", + INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT), + (int)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", + INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), + (int)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", + INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), + (int)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- btimer = %d", + (int)INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---- itimer = %d", + (int)INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)); + cmn_err(CE_DEBUG, "---------------------------"); +} +#endif + +/* + * Give the buffer a little push if it is incore and + * wait on the flush lock. + */ +void +xfs_qm_dqflock_pushbuf_wait( + xfs_dquot_t *dqp) +{ + xfs_buf_t *bp; + + /* + * Check to see if the dquot has been flushed delayed + * write. If so, grab its buffer and send it + * out immediately. We'll be able to acquire + * the flush lock when the I/O completes. + */ + bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, + XFS_QI_DQCHUNKLEN(dqp->q_mount), + XFS_INCORE_TRYLOCK); + if (bp != NULL) { + if (XFS_BUF_ISDELAYWRITE(bp)) { + if (XFS_BUF_ISPINNED(bp)) { + xfs_log_force(dqp->q_mount, + (xfs_lsn_t)0, + XFS_LOG_FORCE); + } + xfs_bawrite(dqp->q_mount, bp); + } else { + xfs_buf_relse(bp); + } + } + xfs_dqflock(dqp); +} diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h new file mode 100644 index 00000000000..0c3fe3175ba --- /dev/null +++ b/fs/xfs/quota/xfs_dquot.h @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DQUOT_H__ +#define __XFS_DQUOT_H__ + +/* + * Dquots are structures that hold quota information about a user or a group, + * much like inodes are for files. In fact, dquots share many characteristics + * with inodes. However, dquots can also be a centralized resource, relative + * to a collection of inodes. In this respect, dquots share some characteristics + * of the superblock. + * XFS dquots exploit both those in its algorithms. They make every attempt + * to not be a bottleneck when quotas are on and have minimal impact, if any, + * when quotas are off. + */ + +/* + * The hash chain headers (hash buckets) + */ +typedef struct xfs_dqhash { + struct xfs_dquot *qh_next; + mutex_t qh_lock; + uint qh_version; /* ever increasing version */ + uint qh_nelems; /* number of dquots on the list */ +} xfs_dqhash_t; + +typedef struct xfs_dqlink { + struct xfs_dquot *ql_next; /* forward link */ + struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */ +} xfs_dqlink_t; + +struct xfs_mount; +struct xfs_trans; + +/* + * This is the marker which is designed to occupy the first few + * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers + * must come first. + * This serves as the marker ("sentinel") when we have to restart list + * iterations because of locking considerations. + */ +typedef struct xfs_dqmarker { + struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */ + struct xfs_dquot*dqm_flprev; + xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */ + xfs_dqlink_t dqm_hashlist; /* link to the hash chain */ + uint dqm_flags; /* various flags (XFS_DQ_*) */ +} xfs_dqmarker_t; + +/* + * The incore dquot structure + */ +typedef struct xfs_dquot { + xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ + xfs_dqhash_t *q_hash; /* the hashchain header */ + struct xfs_mount*q_mount; /* filesystem this relates to */ + struct xfs_trans*q_transp; /* trans this belongs to currently */ + uint q_nrefs; /* # active refs from inodes */ + xfs_daddr_t q_blkno; /* blkno of dquot buffer */ + int q_bufoffset; /* off of dq in buffer (# dquots) */ + xfs_fileoff_t q_fileoffset; /* offset in quotas file */ + + struct xfs_dquot*q_gdquot; /* group dquot, hint only */ + xfs_disk_dquot_t q_core; /* actual usage & quotas */ + xfs_dq_logitem_t q_logitem; /* dquot log item */ + xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ + xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + mutex_t q_qlock; /* quota lock */ + sema_t q_flock; /* flush lock */ + uint q_pincount; /* pin count for this dquot */ + sv_t q_pinwait; /* sync var for pinning */ +#ifdef XFS_DQUOT_TRACE + struct ktrace *q_trace; /* trace header structure */ +#endif +} xfs_dquot_t; + + +#define dq_flnext q_lists.dqm_flnext +#define dq_flprev q_lists.dqm_flprev +#define dq_mplist q_lists.dqm_mplist +#define dq_hashlist q_lists.dqm_hashlist +#define dq_flags q_lists.dqm_flags + +#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) + +/* + * Quota Accounting flags + */ +#define XFS_ALL_QUOTA_ACCT (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD) +#define XFS_ALL_QUOTA_ACTV (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) +#define XFS_ALL_QUOTA_ACCT_ENFD (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_GQUOTA_ACCT|XFS_GQUOTA_ENFD) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) + +/* + * Quota Limit Enforcement flags + */ +#define XFS_IS_QUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) + +#ifdef DEBUG +static inline int +XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) +{ + if (mutex_trylock(&dqp->q_qlock)) { + mutex_unlock(&dqp->q_qlock); + return 0; + } + return 1; +} +#endif + + +/* + * The following three routines simply manage the q_flock + * semaphore embedded in the dquot. This semaphore synchronizes + * processes attempting to flush the in-core dquot back to disk. + */ +#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ + (dqp)->dq_flags |= XFS_DQ_FLOCKED; } +#define xfs_dqfunlock(dqp) { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \ + vsema(&((dqp)->q_flock)); \ + (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } + +#define XFS_DQ_PINLOCK(dqp) mutex_spinlock( \ + &(XFS_DQ_TO_QINF(dqp)->qi_pinlock)) +#define XFS_DQ_PINUNLOCK(dqp, s) mutex_spinunlock( \ + &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s) + +#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0) +#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) +#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \ + XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ + XFS_DQ_TO_QINF(dqp)->qi_gquotaip) + +#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ + (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ + (XFS_IS_GQUOTA_ON((d)->q_mount)))) + +#ifdef XFS_DQUOT_TRACE +/* + * Dquot Tracing stuff. + */ +#define DQUOT_TRACE_SIZE 64 +#define DQUOT_KTRACE_ENTRY 1 + +extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func, + void *, xfs_inode_t *); +#define xfs_dqtrace_entry_ino(a,b,ip) \ + __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip)) +#define xfs_dqtrace_entry(a,b) \ + __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL) +#else +#define xfs_dqtrace_entry(a,b) +#define xfs_dqtrace_entry_ino(a,b,ip) +#endif + +#ifdef QUOTADEBUG +extern void xfs_qm_dqprint(xfs_dquot_t *); +#else +#define xfs_qm_dqprint(a) +#endif + +extern void xfs_qm_dqdestroy(xfs_dquot_t *); +extern int xfs_qm_dqflush(xfs_dquot_t *, uint); +extern int xfs_qm_dqpurge(xfs_dquot_t *, uint); +extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); +extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); +extern int xfs_qm_dqflock_nowait(xfs_dquot_t *); +extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); +extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, + xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, + xfs_disk_dquot_t *); +extern int xfs_qm_dqwarn(xfs_disk_dquot_t *, uint); +extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, + xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern void xfs_qm_dqput(xfs_dquot_t *); +extern void xfs_qm_dqrele(xfs_dquot_t *); +extern void xfs_dqlock(xfs_dquot_t *); +extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); +extern void xfs_dqunlock(xfs_dquot_t *); +extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +#endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c new file mode 100644 index 00000000000..a5425ee6e7b --- /dev/null +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -0,0 +1,715 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" + +#include "xfs_qm.h" + + +/* + * returns the number of iovecs needed to log the given dquot item. + */ +/* ARGSUSED */ +STATIC uint +xfs_qm_dquot_logitem_size( + xfs_dq_logitem_t *logitem) +{ + /* + * we need only two iovecs, one for the format, one for the real thing + */ + return (2); +} + +/* + * fills in the vector of log iovecs for the given dquot log item. + */ +STATIC void +xfs_qm_dquot_logitem_format( + xfs_dq_logitem_t *logitem, + xfs_log_iovec_t *logvec) +{ + ASSERT(logitem); + ASSERT(logitem->qli_dquot); + + logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; + logvec->i_len = sizeof(xfs_dq_logformat_t); + logvec++; + logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; + logvec->i_len = sizeof(xfs_disk_dquot_t); + + ASSERT(2 == logitem->qli_item.li_desc->lid_size); + logitem->qli_format.qlf_size = 2; + +} + +/* + * Increment the pin count of the given dquot. + * This value is protected by pinlock spinlock in the xQM structure. + */ +STATIC void +xfs_qm_dquot_logitem_pin( + xfs_dq_logitem_t *logitem) +{ + unsigned long s; + xfs_dquot_t *dqp; + + dqp = logitem->qli_dquot; + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + s = XFS_DQ_PINLOCK(dqp); + dqp->q_pincount++; + XFS_DQ_PINUNLOCK(dqp, s); +} + +/* + * Decrement the pin count of the given dquot, and wake up + * anyone in xfs_dqwait_unpin() if the count goes to 0. The + * dquot must have been previously pinned with a call to xfs_dqpin(). + */ +/* ARGSUSED */ +STATIC void +xfs_qm_dquot_logitem_unpin( + xfs_dq_logitem_t *logitem, + int stale) +{ + unsigned long s; + xfs_dquot_t *dqp; + + dqp = logitem->qli_dquot; + ASSERT(dqp->q_pincount > 0); + s = XFS_DQ_PINLOCK(dqp); + dqp->q_pincount--; + if (dqp->q_pincount == 0) { + sv_broadcast(&dqp->q_pinwait); + } + XFS_DQ_PINUNLOCK(dqp, s); +} + +/* ARGSUSED */ +STATIC void +xfs_qm_dquot_logitem_unpin_remove( + xfs_dq_logitem_t *logitem, + xfs_trans_t *tp) +{ + xfs_qm_dquot_logitem_unpin(logitem, 0); +} + +/* + * Given the logitem, this writes the corresponding dquot entry to disk + * asynchronously. This is called with the dquot entry securely locked; + * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot + * at the end. + */ +STATIC void +xfs_qm_dquot_logitem_push( + xfs_dq_logitem_t *logitem) +{ + xfs_dquot_t *dqp; + + dqp = logitem->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); + + /* + * Since we were able to lock the dquot's flush lock and + * we found it on the AIL, the dquot must be dirty. This + * is because the dquot is removed from the AIL while still + * holding the flush lock in xfs_dqflush_done(). Thus, if + * we found it in the AIL and were able to obtain the flush + * lock without sleeping, then there must not have been + * anyone in the process of flushing the dquot. + */ + xfs_qm_dqflush(dqp, XFS_B_DELWRI); + xfs_dqunlock(dqp); +} + +/*ARGSUSED*/ +STATIC xfs_lsn_t +xfs_qm_dquot_logitem_committed( + xfs_dq_logitem_t *l, + xfs_lsn_t lsn) +{ + /* + * We always re-log the entire dquot when it becomes dirty, + * so, the latest copy _is_ the only one that matters. + */ + return (lsn); +} + + +/* + * This is called to wait for the given dquot to be unpinned. + * Most of these pin/unpin routines are plagiarized from inode code. + */ +void +xfs_qm_dqunpin_wait( + xfs_dquot_t *dqp) +{ + SPLDECL(s); + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (dqp->q_pincount == 0) { + return; + } + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); + s = XFS_DQ_PINLOCK(dqp); + if (dqp->q_pincount == 0) { + XFS_DQ_PINUNLOCK(dqp, s); + return; + } + sv_wait(&(dqp->q_pinwait), PINOD, + &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s); +} + +/* + * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that + * the dquot is locked by us, but the flush lock isn't. So, here we are + * going to see if the relevant dquot buffer is incore, waiting on DELWRI. + * If so, we want to push it out to help us take this item off the AIL as soon + * as possible. + * + * We must not be holding the AIL_LOCK at this point. Calling incore() to + * search the buffercache can be a time consuming thing, and AIL_LOCK is a + * spinlock. + */ +STATIC void +xfs_qm_dquot_logitem_pushbuf( + xfs_dq_logitem_t *qip) +{ + xfs_dquot_t *dqp; + xfs_mount_t *mp; + xfs_buf_t *bp; + uint dopush; + + dqp = qip->qli_dquot; + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * The qli_pushbuf_flag keeps others from + * trying to duplicate our effort. + */ + ASSERT(qip->qli_pushbuf_flag != 0); + ASSERT(qip->qli_push_owner == get_thread_id()); + + /* + * If flushlock isn't locked anymore, chances are that the + * inode flush completed and the inode was taken off the AIL. + * So, just get out. + */ + if ((valusema(&(dqp->q_flock)) > 0) || + ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { + qip->qli_pushbuf_flag = 0; + xfs_dqunlock(dqp); + return; + } + mp = dqp->q_mount; + bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, + XFS_QI_DQCHUNKLEN(mp), + XFS_INCORE_TRYLOCK); + if (bp != NULL) { + if (XFS_BUF_ISDELAYWRITE(bp)) { + dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && + (valusema(&(dqp->q_flock)) <= 0)); + qip->qli_pushbuf_flag = 0; + xfs_dqunlock(dqp); + + if (XFS_BUF_ISPINNED(bp)) { + xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE); + } + if (dopush) { +#ifdef XFSRACEDEBUG + delay_for_intr(); + delay(300); +#endif + xfs_bawrite(mp, bp); + } else { + xfs_buf_relse(bp); + } + } else { + qip->qli_pushbuf_flag = 0; + xfs_dqunlock(dqp); + xfs_buf_relse(bp); + } + return; + } + + qip->qli_pushbuf_flag = 0; + xfs_dqunlock(dqp); +} + +/* + * This is called to attempt to lock the dquot associated with this + * dquot log item. Don't sleep on the dquot lock or the flush lock. + * If the flush lock is already held, indicating that the dquot has + * been or is in the process of being flushed, then see if we can + * find the dquot's buffer in the buffer cache without sleeping. If + * we can and it is marked delayed write, then we want to send it out. + * We delay doing so until the push routine, though, to avoid sleeping + * in any device strategy routines. + */ +STATIC uint +xfs_qm_dquot_logitem_trylock( + xfs_dq_logitem_t *qip) +{ + xfs_dquot_t *dqp; + uint retval; + + dqp = qip->qli_dquot; + if (dqp->q_pincount > 0) + return (XFS_ITEM_PINNED); + + if (! xfs_qm_dqlock_nowait(dqp)) + return (XFS_ITEM_LOCKED); + + retval = XFS_ITEM_SUCCESS; + if (! xfs_qm_dqflock_nowait(dqp)) { + /* + * The dquot is already being flushed. It may have been + * flushed delayed write, however, and we don't want to + * get stuck waiting for that to complete. So, we want to check + * to see if we can lock the dquot's buffer without sleeping. + * If we can and it is marked for delayed write, then we + * hold it and send it out from the push routine. We don't + * want to do that now since we might sleep in the device + * strategy routine. We also don't want to grab the buffer lock + * here because we'd like not to call into the buffer cache + * while holding the AIL_LOCK. + * Make sure to only return PUSHBUF if we set pushbuf_flag + * ourselves. If someone else is doing it then we don't + * want to go to the push routine and duplicate their efforts. + */ + if (qip->qli_pushbuf_flag == 0) { + qip->qli_pushbuf_flag = 1; + ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); +#ifdef DEBUG + qip->qli_push_owner = get_thread_id(); +#endif + /* + * The dquot is left locked. + */ + retval = XFS_ITEM_PUSHBUF; + } else { + retval = XFS_ITEM_FLUSHING; + xfs_dqunlock_nonotify(dqp); + } + } + + ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); + return (retval); +} + + +/* + * Unlock the dquot associated with the log item. + * Clear the fields of the dquot and dquot log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the dquot. + */ +STATIC void +xfs_qm_dquot_logitem_unlock( + xfs_dq_logitem_t *ql) +{ + xfs_dquot_t *dqp; + + ASSERT(ql != NULL); + dqp = ql->qli_dquot; + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * Clear the transaction pointer in the dquot + */ + dqp->q_transp = NULL; + + /* + * dquots are never 'held' from getting unlocked at the end of + * a transaction. Their locking and unlocking is hidden inside the + * transaction layer, within trans_commit. Hence, no LI_HOLD flag + * for the logitem. + */ + xfs_dqunlock(dqp); +} + + +/* + * The transaction with the dquot locked has aborted. The dquot + * must not be dirty within the transaction. We simply unlock just + * as if the transaction had been cancelled. + */ +STATIC void +xfs_qm_dquot_logitem_abort( + xfs_dq_logitem_t *ql) +{ + xfs_qm_dquot_logitem_unlock(ql); +} + +/* + * this needs to stamp an lsn into the dquot, I think. + * rpc's that look at user dquot's would then have to + * push on the dependency recorded in the dquot + */ +/* ARGSUSED */ +STATIC void +xfs_qm_dquot_logitem_committing( + xfs_dq_logitem_t *l, + xfs_lsn_t lsn) +{ + return; +} + + +/* + * This is the ops vector for dquots + */ +struct xfs_item_ops xfs_dquot_item_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_qm_dquot_logitem_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, + .iop_unpin = (void(*)(xfs_log_item_t*, int)) + xfs_qm_dquot_logitem_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) + xfs_qm_dquot_logitem_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*)) + xfs_qm_dquot_logitem_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_qm_dquot_logitem_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort, + .iop_pushbuf = (void(*)(xfs_log_item_t*)) + xfs_qm_dquot_logitem_pushbuf, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_qm_dquot_logitem_committing +}; + +/* + * Initialize the dquot log item for a newly allocated dquot. + * The dquot isn't locked at this point, but it isn't on any of the lists + * either, so we don't care. + */ +void +xfs_qm_dquot_logitem_init( + struct xfs_dquot *dqp) +{ + xfs_dq_logitem_t *lp; + lp = &dqp->q_logitem; + + lp->qli_item.li_type = XFS_LI_DQUOT; + lp->qli_item.li_ops = &xfs_dquot_item_ops; + lp->qli_item.li_mountp = dqp->q_mount; + lp->qli_dquot = dqp; + lp->qli_format.qlf_type = XFS_LI_DQUOT; + lp->qli_format.qlf_id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT); + lp->qli_format.qlf_blkno = dqp->q_blkno; + lp->qli_format.qlf_len = 1; + /* + * This is just the offset of this dquot within its buffer + * (which is currently 1 FSB and probably won't change). + * Hence 32 bits for this offset should be just fine. + * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) + * here, and recompute it at recovery time. + */ + lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; +} + +/*------------------ QUOTAOFF LOG ITEMS -------------------*/ + +/* + * This returns the number of iovecs needed to log the given quotaoff item. + * We only need 1 iovec for an quotaoff item. It just logs the + * quotaoff_log_format structure. + */ +/*ARGSUSED*/ +STATIC uint +xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) +{ + return (1); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given quotaoff log item. We use only 1 iovec, and we point that + * at the quotaoff_log_format structure embedded in the quotaoff item. + * It is at this point that we assert that all of the extent + * slots in the quotaoff item have been filled. + */ +STATIC void +xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, + xfs_log_iovec_t *log_vector) +{ + ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); + + log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); + log_vector->i_len = sizeof(xfs_qoff_logitem_t); + qf->qql_format.qf_size = 1; +} + + +/* + * Pinning has no meaning for an quotaoff item, so just return. + */ +/*ARGSUSED*/ +STATIC void +xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) +{ + return; +} + + +/* + * Since pinning has no meaning for an quotaoff item, unpinning does + * not either. + */ +/*ARGSUSED*/ +STATIC void +xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) +{ + return; +} + +/*ARGSUSED*/ +STATIC void +xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp) +{ + return; +} + +/* + * Quotaoff items have no locking, so just return success. + */ +/*ARGSUSED*/ +STATIC uint +xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Quotaoff items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +/*ARGSUSED*/ +STATIC void +xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) +{ + return; +} + +/* + * The quotaoff-start-item is logged only once and cannot be moved in the log, + * so simply return the lsn at which it's been logged. + */ +/*ARGSUSED*/ +STATIC xfs_lsn_t +xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) +{ + return (lsn); +} + +/* + * The transaction of which this QUOTAOFF is a part has been aborted. + * Just clean up after ourselves. + * Shouldn't this never happen in the case of qoffend logitems? XXX + */ +STATIC void +xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf) +{ + kmem_free(qf, sizeof(xfs_qoff_logitem_t)); +} + +/* + * There isn't much you can do to push on an quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +/*ARGSUSED*/ +STATIC void +xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) +{ + return; +} + + +/*ARGSUSED*/ +STATIC xfs_lsn_t +xfs_qm_qoffend_logitem_committed( + xfs_qoff_logitem_t *qfe, + xfs_lsn_t lsn) +{ + xfs_qoff_logitem_t *qfs; + SPLDECL(s); + + qfs = qfe->qql_start_lip; + AIL_LOCK(qfs->qql_item.li_mountp,s); + /* + * Delete the qoff-start logitem from the AIL. + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s); + kmem_free(qfs, sizeof(xfs_qoff_logitem_t)); + kmem_free(qfe, sizeof(xfs_qoff_logitem_t)); + return (xfs_lsn_t)-1; +} + +/* + * XXX rcc - don't know quite what to do with this. I think we can + * just ignore it. The only time that isn't the case is if we allow + * the client to somehow see that quotas have been turned off in which + * we can't allow that to get back until the quotaoff hits the disk. + * So how would that happen? Also, do we need different routines for + * quotaoff start and quotaoff end? I suspect the answer is yes but + * to be sure, I need to look at the recovery code and see how quota off + * recovery is handled (do we roll forward or back or do something else). + * If we roll forwards or backwards, then we need two separate routines, + * one that does nothing and one that stamps in the lsn that matters + * (truly makes the quotaoff irrevocable). If we do something else, + * then maybe we don't need two. + */ +/* ARGSUSED */ +STATIC void +xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) +{ + return; +} + +/* ARGSUSED */ +STATIC void +xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) +{ + return; +} + +struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_qm_qoff_logitem_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, + .iop_unpin = (void(*)(xfs_log_item_t* ,int)) + xfs_qm_qoff_logitem_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) + xfs_qm_qoff_logitem_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_qm_qoffend_logitem_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, + .iop_pushbuf = NULL, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_qm_qoffend_logitem_committing +}; + +/* + * This is the ops vector shared by all quotaoff-start log items. + */ +struct xfs_item_ops xfs_qm_qoff_logitem_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_qm_qoff_logitem_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, + .iop_unpin = (void(*)(xfs_log_item_t*, int)) + xfs_qm_qoff_logitem_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) + xfs_qm_qoff_logitem_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_qm_qoff_logitem_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, + .iop_pushbuf = NULL, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_qm_qoff_logitem_committing +}; + +/* + * Allocate and initialize an quotaoff item of the correct quota type(s). + */ +xfs_qoff_logitem_t * +xfs_qm_qoff_logitem_init( + struct xfs_mount *mp, + xfs_qoff_logitem_t *start, + uint flags) +{ + xfs_qoff_logitem_t *qf; + + qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); + + qf->qql_item.li_type = XFS_LI_QUOTAOFF; + if (start) + qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops; + else + qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops; + qf->qql_item.li_mountp = mp; + qf->qql_format.qf_type = XFS_LI_QUOTAOFF; + qf->qql_format.qf_flags = flags; + qf->qql_start_lip = start; + return (qf); +} diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h new file mode 100644 index 00000000000..9c6500dabca --- /dev/null +++ b/fs/xfs/quota/xfs_dquot_item.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DQUOT_ITEM_H__ +#define __XFS_DQUOT_ITEM_H__ + +struct xfs_dquot; +struct xfs_trans; +struct xfs_mount; +struct xfs_qoff_logitem; + +typedef struct xfs_dq_logitem { + xfs_log_item_t qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */ +#ifdef DEBUG + uint64_t qli_push_owner; +#endif + xfs_dq_logformat_t qli_format; /* logged structure */ +} xfs_dq_logitem_t; + +typedef struct xfs_qoff_logitem { + xfs_log_item_t qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ + xfs_qoff_logformat_t qql_format; /* logged structure */ +} xfs_qoff_logitem_t; + + +extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); +extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, + struct xfs_qoff_logitem *, uint); +extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *, uint); +extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *); + +#endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c new file mode 100644 index 00000000000..89f2cd656eb --- /dev/null +++ b/fs/xfs/quota/xfs_qm.c @@ -0,0 +1,2848 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_clnt.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" + +#include "xfs_qm.h" + +/* + * The global quota manager. There is only one of these for the entire + * system, _not_ one per file system. XQM keeps track of the overall + * quota functionality, including maintaining the freelist and hash + * tables of dquots. + */ +mutex_t xfs_Gqm_lock; +struct xfs_qm *xfs_Gqm; + +kmem_zone_t *qm_dqzone; +kmem_zone_t *qm_dqtrxzone; +kmem_shaker_t xfs_qm_shaker; + +STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); +STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); + +STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_shake(int, unsigned int); + +#ifdef DEBUG +extern mutex_t qcheck_lock; +#endif + +#ifdef QUOTADEBUG +#define XQM_LIST_PRINT(l, NXT, title) \ +{ \ + xfs_dquot_t *dqp; int i = 0; \ + cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ + for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ + cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ + "bcnt = %d, icnt = %d, refs = %d", \ + ++i, (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT), \ + DQFLAGTO_TYPESTR(dqp), \ + (int) INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), \ + (int) INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), \ + (int) dqp->q_nrefs); } \ +} +#else +#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) +#endif + +/* + * Initialize the XQM structure. + * Note that there is not one quota manager per file system. + */ +STATIC struct xfs_qm * +xfs_Gqm_init(void) +{ + xfs_qm_t *xqm; + int hsize, i; + + xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); + ASSERT(xqm); + + /* + * Initialize the dquot hash tables. + */ + hsize = (DQUOT_HASH_HEURISTIC < XFS_QM_NCSIZE_THRESHOLD) ? + XFS_QM_HASHSIZE_LOW : XFS_QM_HASHSIZE_HIGH; + xqm->qm_dqhashmask = hsize - 1; + + xqm->qm_usr_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize * + sizeof(xfs_dqhash_t), + KM_SLEEP); + xqm->qm_grp_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize * + sizeof(xfs_dqhash_t), + KM_SLEEP); + ASSERT(xqm->qm_usr_dqhtable != NULL); + ASSERT(xqm->qm_grp_dqhtable != NULL); + + for (i = 0; i < hsize; i++) { + xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); + xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); + } + + /* + * Freelist of all dquots of all file systems + */ + xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); + + /* + * dquot zone. we register our own low-memory callback. + */ + if (!qm_dqzone) { + xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t), + "xfs_dquots"); + qm_dqzone = xqm->qm_dqzone; + } else + xqm->qm_dqzone = qm_dqzone; + + xfs_qm_shaker = kmem_shake_register(xfs_qm_shake); + + /* + * The t_dqinfo portion of transactions. + */ + if (!qm_dqtrxzone) { + xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t), + "xfs_dqtrx"); + qm_dqtrxzone = xqm->qm_dqtrxzone; + } else + xqm->qm_dqtrxzone = qm_dqtrxzone; + + atomic_set(&xqm->qm_totaldquots, 0); + xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; + xqm->qm_nrefs = 0; +#ifdef DEBUG + mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk"); +#endif + return xqm; +} + +/* + * Destroy the global quota manager when its reference count goes to zero. + */ +void +xfs_qm_destroy( + struct xfs_qm *xqm) +{ + int hsize, i; + + ASSERT(xqm != NULL); + ASSERT(xqm->qm_nrefs == 0); + kmem_shake_deregister(xfs_qm_shaker); + hsize = xqm->qm_dqhashmask + 1; + for (i = 0; i < hsize; i++) { + xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); + xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); + } + kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t)); + kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t)); + xqm->qm_usr_dqhtable = NULL; + xqm->qm_grp_dqhtable = NULL; + xqm->qm_dqhashmask = 0; + xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); +#ifdef DEBUG + mutex_destroy(&qcheck_lock); +#endif + kmem_free(xqm, sizeof(xfs_qm_t)); +} + +/* + * Called at mount time to let XQM know that another file system is + * starting quotas. This isn't crucial information as the individual mount + * structures are pretty independent, but it helps the XQM keep a + * global view of what's going on. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_hold_quotafs_ref( + struct xfs_mount *mp) +{ + /* + * Need to lock the xfs_Gqm structure for things like this. For example, + * the structure could disappear between the entry to this routine and + * a HOLD operation if not locked. + */ + XFS_QM_LOCK(xfs_Gqm); + + if (xfs_Gqm == NULL) + xfs_Gqm = xfs_Gqm_init(); + /* + * We can keep a list of all filesystems with quotas mounted for + * debugging and statistical purposes, but ... + * Just take a reference and get out. + */ + XFS_QM_HOLD(xfs_Gqm); + XFS_QM_UNLOCK(xfs_Gqm); + + return 0; +} + + +/* + * Release the reference that a filesystem took at mount time, + * so that we know when we need to destroy the entire quota manager. + */ +/* ARGSUSED */ +STATIC void +xfs_qm_rele_quotafs_ref( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp, *nextdqp; + + ASSERT(xfs_Gqm); + ASSERT(xfs_Gqm->qm_nrefs > 0); + + /* + * Go thru the freelist and destroy all inactive dquots. + */ + xfs_qm_freelist_lock(xfs_Gqm); + + for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; + dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) { + xfs_dqlock(dqp); + nextdqp = dqp->dq_flnext; + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(dqp->q_mount == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(dqp->HL_PREVP == NULL); + ASSERT(dqp->MPL_PREVP == NULL); + XQM_FREELIST_REMOVE(dqp); + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } else { + xfs_dqunlock(dqp); + } + dqp = nextdqp; + } + xfs_qm_freelist_unlock(xfs_Gqm); + + /* + * Destroy the entire XQM. If somebody mounts with quotaon, this'll + * be restarted. + */ + XFS_QM_LOCK(xfs_Gqm); + XFS_QM_RELE(xfs_Gqm); + if (xfs_Gqm->qm_nrefs == 0) { + xfs_qm_destroy(xfs_Gqm); + xfs_Gqm = NULL; + } + XFS_QM_UNLOCK(xfs_Gqm); +} + +/* + * This is called at mount time from xfs_mountfs to initialize the quotainfo + * structure and start the global quotamanager (xfs_Gqm) if it hasn't done + * so already. Note that the superblock has not been read in yet. + */ +void +xfs_qm_mount_quotainit( + xfs_mount_t *mp, + uint flags) +{ + /* + * User or group quotas has to be on. + */ + ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA)); + + /* + * Initialize the flags in the mount structure. From this point + * onwards we look at m_qflags to figure out if quotas's ON/OFF, etc. + * Note that we enforce nothing if accounting is off. + * ie. XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF. + * It isn't necessary to take the quotaoff lock to do this; this is + * called from mount. + */ + if (flags & XFSMNT_UQUOTA) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + if (flags & XFSMNT_UQUOTAENF) + mp->m_qflags |= XFS_UQUOTA_ENFD; + } + if (flags & XFSMNT_GQUOTA) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + if (flags & XFSMNT_GQUOTAENF) + mp->m_qflags |= XFS_GQUOTA_ENFD; + } +} + +/* + * Just destroy the quotainfo structure. + */ +void +xfs_qm_unmount_quotadestroy( + xfs_mount_t *mp) +{ + if (mp->m_quotainfo) + xfs_qm_destroy_quotainfo(mp); +} + + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. We are guaranteed that the superblock + * is consistently read in at this point. + */ +int +xfs_qm_mount_quotas( + xfs_mount_t *mp, + int mfsi_flags) +{ + unsigned long s; + int error = 0; + uint sbf; + + /* + * If a file system had quotas running earlier, but decided to + * mount without -o quota/uquota/gquota options, revoke the + * quotachecked license, and bail out. + */ + if (! XFS_IS_QUOTA_ON(mp) && + (mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) { + mp->m_qflags = 0; + goto write_changes; + } + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + cmn_err(CE_NOTE, + "Cannot turn on quotas for realtime filesystem %s", + mp->m_fsname); + mp->m_qflags = 0; + goto write_changes; + } + +#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) + cmn_err(CE_NOTE, "Attempting to turn on disk quotas."); +#endif + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + if ((error = xfs_qm_init_quotainfo(mp))) { + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp) && + !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { +#ifdef DEBUG + cmn_err(CE_NOTE, "Doing a quotacheck. Please wait."); +#endif + if ((error = xfs_qm_quotacheck(mp))) { + /* Quotacheck has failed and quotas have + * been disabled. + */ + return XFS_ERROR(error); + } +#ifdef DEBUG + cmn_err(CE_NOTE, "Done quotacheck."); +#endif + } + write_changes: + /* + * We actually don't have to acquire the SB_LOCK at all. + * This can only be called from mount, and that's single threaded. XXX + */ + s = XFS_SB_LOCK(mp); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + XFS_SB_UNLOCK(mp, s); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! + */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_fs_cmn_err(CE_ALERT, mp, + "XFS mount_quotas: Superblock update failed!"); + } + } + + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, + "Failed to initialize disk quotas."); + } + return XFS_ERROR(error); +} + +/* + * Called from the vfsops layer. + */ +int +xfs_qm_unmount_quotas( + xfs_mount_t *mp) +{ + xfs_inode_t *uqp, *gqp; + int error = 0; + + /* + * Release the dquots that root inode, et al might be holding, + * before we flush quotas and blow away the quotainfo structure. + */ + ASSERT(mp->m_rootip); + xfs_qm_dqdetach(mp->m_rootip); + if (mp->m_rbmip) + xfs_qm_dqdetach(mp->m_rbmip); + if (mp->m_rsumip) + xfs_qm_dqdetach(mp->m_rsumip); + + /* + * Flush out the quota inodes. + */ + uqp = gqp = NULL; + if (mp->m_quotainfo) { + if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { + xfs_ilock(uqp, XFS_ILOCK_EXCL); + xfs_iflock(uqp); + error = xfs_iflush(uqp, XFS_IFLUSH_SYNC); + xfs_iunlock(uqp, XFS_ILOCK_EXCL); + if (unlikely(error == EFSCORRUPTED)) { + XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)", + XFS_ERRLEVEL_LOW, mp); + goto out; + } + } + if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { + xfs_ilock(gqp, XFS_ILOCK_EXCL); + xfs_iflock(gqp); + error = xfs_iflush(gqp, XFS_IFLUSH_SYNC); + xfs_iunlock(gqp, XFS_ILOCK_EXCL); + if (unlikely(error == EFSCORRUPTED)) { + XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)", + XFS_ERRLEVEL_LOW, mp); + goto out; + } + } + } + if (uqp) { + XFS_PURGE_INODE(uqp); + mp->m_quotainfo->qi_uquotaip = NULL; + } + if (gqp) { + XFS_PURGE_INODE(gqp); + mp->m_quotainfo->qi_gquotaip = NULL; + } +out: + return XFS_ERROR(error); +} + +/* + * Flush all dquots of the given file system to disk. The dquots are + * _not_ purged from memory here, just their data written to disk. + */ +int +xfs_qm_dqflush_all( + xfs_mount_t *mp, + int flags) +{ + int recl; + xfs_dquot_t *dqp; + int niters; + int error; + + if (mp->m_quotainfo == NULL) + return (0); + niters = 0; +again: + xfs_qm_mplist_lock(mp); + FOREACH_DQUOT_IN_MP(dqp, mp) { + xfs_dqlock(dqp); + if (! XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqunlock(dqp); + continue; + } + xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); + /* XXX a sentinel would be better */ + recl = XFS_QI_MPLRECLAIMS(mp); + if (! xfs_qm_dqflock_nowait(dqp)) { + /* + * If we can't grab the flush lock then check + * to see if the dquot has been flushed delayed + * write. If so, grab its buffer and send it + * out immediately. We'll be able to acquire + * the flush lock when the I/O completes. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + /* + * Let go of the mplist lock. We don't want to hold it + * across a disk write. + */ + xfs_qm_mplist_unlock(mp); + error = xfs_qm_dqflush(dqp, flags); + xfs_dqunlock(dqp); + if (error) + return (error); + + xfs_qm_mplist_lock(mp); + if (recl != XFS_QI_MPLRECLAIMS(mp)) { + xfs_qm_mplist_unlock(mp); + /* XXX restart limit */ + goto again; + } + } + + xfs_qm_mplist_unlock(mp); + /* return ! busy */ + return (0); +} +/* + * Release the group dquot pointers the user dquots may be + * carrying around as a hint. mplist is locked on entry and exit. + */ +STATIC void +xfs_qm_detach_gdquots( + xfs_mount_t *mp) +{ + xfs_dquot_t *dqp, *gdqp; + int nrecl; + + again: + ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + dqp = XFS_QI_MPLNEXT(mp); + while (dqp) { + xfs_dqlock(dqp); + if ((gdqp = dqp->q_gdquot)) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; + } + xfs_dqunlock(dqp); + + if (gdqp) { + /* + * Can't hold the mplist lock across a dqput. + * XXXmust convert to marker based iterations here. + */ + nrecl = XFS_QI_MPLRECLAIMS(mp); + xfs_qm_mplist_unlock(mp); + xfs_qm_dqput(gdqp); + + xfs_qm_mplist_lock(mp); + if (nrecl != XFS_QI_MPLRECLAIMS(mp)) + goto again; + } + dqp = dqp->MPL_NEXT; + } +} + +/* + * Go through all the incore dquots of this file system and take them + * off the mplist and hashlist, if the dquot type matches the dqtype + * parameter. This is used when turning off quota accounting for + * users and/or groups, as well as when the filesystem is unmounting. + */ +STATIC int +xfs_qm_dqpurge_int( + xfs_mount_t *mp, + uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/GQUOTA */ +{ + xfs_dquot_t *dqp; + uint dqtype; + int nrecl; + xfs_dquot_t *nextdqp; + int nmisses; + + if (mp->m_quotainfo == NULL) + return (0); + + dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; + dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; + + xfs_qm_mplist_lock(mp); + + /* + * In the first pass through all incore dquots of this filesystem, + * we release the group dquot pointers the user dquots may be + * carrying around as a hint. We need to do this irrespective of + * what's being turned off. + */ + xfs_qm_detach_gdquots(mp); + + again: + nmisses = 0; + ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + /* + * Try to get rid of all of the unwanted dquots. The idea is to + * get them off mplist and hashlist, but leave them on freelist. + */ + dqp = XFS_QI_MPLNEXT(mp); + while (dqp) { + /* + * It's OK to look at the type without taking dqlock here. + * We're holding the mplist lock here, and that's needed for + * a dqreclaim. + */ + if ((dqp->dq_flags & dqtype) == 0) { + dqp = dqp->MPL_NEXT; + continue; + } + + if (! xfs_qm_dqhashlock_nowait(dqp)) { + nrecl = XFS_QI_MPLRECLAIMS(mp); + xfs_qm_mplist_unlock(mp); + XFS_DQ_HASH_LOCK(dqp->q_hash); + xfs_qm_mplist_lock(mp); + + /* + * XXXTheoretically, we can get into a very long + * ping pong game here. + * No one can be adding dquots to the mplist at + * this point, but somebody might be taking things off. + */ + if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { + XFS_DQ_HASH_UNLOCK(dqp->q_hash); + goto again; + } + } + + /* + * Take the dquot off the mplist and hashlist. It may remain on + * freelist in INACTIVE state. + */ + nextdqp = dqp->MPL_NEXT; + nmisses += xfs_qm_dqpurge(dqp, flags); + dqp = nextdqp; + } + xfs_qm_mplist_unlock(mp); + return nmisses; +} + +int +xfs_qm_dqpurge_all( + xfs_mount_t *mp, + uint flags) +{ + int ndquots; + + /* + * Purge the dquot cache. + * None of the dquots should really be busy at this point. + */ + if (mp->m_quotainfo) { + while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) { + delay(ndquots * 10); + } + } + return 0; +} + +STATIC int +xfs_qm_dqattach_one( + xfs_inode_t *ip, + xfs_dqid_t id, + uint type, + uint doalloc, + uint dolock, + xfs_dquot_t *udqhint, /* hint */ + xfs_dquot_t **IO_idqpp) +{ + xfs_dquot_t *dqp; + int error; + + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + error = 0; + /* + * See if we already have it in the inode itself. IO_idqpp is + * &i_udquot or &i_gdquot. This made the code look weird, but + * made the logic a lot simpler. + */ + if ((dqp = *IO_idqpp)) { + if (dolock) + xfs_dqlock(dqp); + xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); + goto done; + } + + /* + * udqhint is the i_udquot field in inode, and is non-NULL only + * when the type arg is XFS_DQ_GROUP. Its purpose is to save a + * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside + * the user dquot. + */ + ASSERT(!udqhint || type == XFS_DQ_GROUP); + if (udqhint && !dolock) + xfs_dqlock(udqhint); + + /* + * No need to take dqlock to look at the id. + * The ID can't change until it gets reclaimed, and it won't + * be reclaimed as long as we have a ref from inode and we hold + * the ilock. + */ + if (udqhint && + (dqp = udqhint->q_gdquot) && + (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id)) { + ASSERT(XFS_DQ_IS_LOCKED(udqhint)); + xfs_dqlock(dqp); + XFS_DQHOLD(dqp); + ASSERT(*IO_idqpp == NULL); + *IO_idqpp = dqp; + if (!dolock) { + xfs_dqunlock(dqp); + xfs_dqunlock(udqhint); + } + goto done; + } + /* + * We can't hold a dquot lock when we call the dqget code. + * We'll deadlock in no time, because of (not conforming to) + * lock ordering - the inodelock comes before any dquot lock, + * and we may drop and reacquire the ilock in xfs_qm_dqget(). + */ + if (udqhint) + xfs_dqunlock(udqhint); + /* + * Find the dquot from somewhere. This bumps the + * reference count of dquot and returns it locked. + * This can return ENOENT if dquot didn't exist on + * disk and we didn't ask it to allocate; + * ESRCH if quotas got turned off suddenly. + */ + if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc|XFS_QMOPT_DOWARN, &dqp))) { + if (udqhint && dolock) + xfs_dqlock(udqhint); + goto done; + } + + xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); + /* + * dqget may have dropped and re-acquired the ilock, but it guarantees + * that the dquot returned is the one that should go in the inode. + */ + *IO_idqpp = dqp; + ASSERT(dqp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (! dolock) { + xfs_dqunlock(dqp); + goto done; + } + if (! udqhint) + goto done; + + ASSERT(udqhint); + ASSERT(dolock); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (! xfs_qm_dqlock_nowait(udqhint)) { + xfs_dqunlock(dqp); + xfs_dqlock(udqhint); + xfs_dqlock(dqp); + } + done: +#ifdef QUOTADEBUG + if (udqhint) { + if (dolock) + ASSERT(XFS_DQ_IS_LOCKED(udqhint)); + } + if (! error) { + if (dolock) + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + } +#endif + return (error); +} + + +/* + * Given a udquot and gdquot, attach a ptr to the group dquot in the + * udquot as a hint for future lookups. The idea sounds simple, but the + * execution isn't, because the udquot might have a group dquot attached + * already and getting rid of that gets us into lock ordering contraints. + * The process is complicated more by the fact that the dquots may or may not + * be locked on entry. + */ +STATIC void +xfs_qm_dqattach_grouphint( + xfs_dquot_t *udq, + xfs_dquot_t *gdq, + uint locked) +{ + xfs_dquot_t *tmp; + +#ifdef QUOTADEBUG + if (locked) { + ASSERT(XFS_DQ_IS_LOCKED(udq)); + ASSERT(XFS_DQ_IS_LOCKED(gdq)); + } +#endif + if (! locked) + xfs_dqlock(udq); + + if ((tmp = udq->q_gdquot)) { + if (tmp == gdq) { + if (! locked) + xfs_dqunlock(udq); + return; + } + + udq->q_gdquot = NULL; + /* + * We can't keep any dqlocks when calling dqrele, + * because the freelist lock comes before dqlocks. + */ + xfs_dqunlock(udq); + if (locked) + xfs_dqunlock(gdq); + /* + * we took a hard reference once upon a time in dqget, + * so give it back when the udquot no longer points at it + * dqput() does the unlocking of the dquot. + */ + xfs_qm_dqrele(tmp); + + xfs_dqlock(udq); + xfs_dqlock(gdq); + + } else { + ASSERT(XFS_DQ_IS_LOCKED(udq)); + if (! locked) { + xfs_dqlock(gdq); + } + } + + ASSERT(XFS_DQ_IS_LOCKED(udq)); + ASSERT(XFS_DQ_IS_LOCKED(gdq)); + /* + * Somebody could have attached a gdquot here, + * when we dropped the uqlock. If so, just do nothing. + */ + if (udq->q_gdquot == NULL) { + XFS_DQHOLD(gdq); + udq->q_gdquot = gdq; + } + if (! locked) { + xfs_dqunlock(gdq); + xfs_dqunlock(udq); + } +} + + +/* + * Given a locked inode, attach dquot(s) to it, taking UQUOTAON / GQUOTAON + * in to account. + * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty + * much made this code a complete mess, but it has been pretty useful. + * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL. + * Inode may get unlocked and relocked in here, and the caller must deal with + * the consequences. + */ +int +xfs_qm_dqattach( + xfs_inode_t *ip, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + uint nquotas = 0; + int error = 0; + + if ((! XFS_IS_QUOTA_ON(mp)) || + (! XFS_NOT_DQATTACHED(mp, ip)) || + (ip->i_ino == mp->m_sb.sb_uquotino) || + (ip->i_ino == mp->m_sb.sb_gquotino)) + return (0); + + ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || + XFS_ISLOCKED_INODE_EXCL(ip)); + + if (! (flags & XFS_QMOPT_ILOCKED)) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, + flags & XFS_QMOPT_DQALLOC, + flags & XFS_QMOPT_DQLOCK, + NULL, &ip->i_udquot); + if (error) + goto done; + nquotas++; + } + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + flags & XFS_QMOPT_DQLOCK, + ip->i_udquot, &ip->i_gdquot); + /* + * Don't worry about the udquot that we may have + * attached above. It'll get detached, if not already. + */ + if (error) + goto done; + nquotas++; + } + + /* + * Attach this group quota to the user quota as a hint. + * This WON'T, in general, result in a thrash. + */ + if (nquotas == 2) { + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(ip->i_udquot); + ASSERT(ip->i_gdquot); + + /* + * We may or may not have the i_udquot locked at this point, + * but this check is OK since we don't depend on the i_gdquot to + * be accurate 100% all the time. It is just a hint, and this + * will succeed in general. + */ + if (ip->i_udquot->q_gdquot == ip->i_gdquot) + goto done; + /* + * Attach i_gdquot to the gdquot hint inside the i_udquot. + */ + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot, + flags & XFS_QMOPT_DQLOCK); + } + + done: + +#ifdef QUOTADEBUG + if (! error) { + if (ip->i_udquot) { + if (flags & XFS_QMOPT_DQLOCK) + ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot)); + } + if (ip->i_gdquot) { + if (flags & XFS_QMOPT_DQLOCK) + ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot)); + } + if (XFS_IS_UQUOTA_ON(mp)) + ASSERT(ip->i_udquot); + if (XFS_IS_GQUOTA_ON(mp)) + ASSERT(ip->i_gdquot); + } +#endif + + if (! (flags & XFS_QMOPT_ILOCKED)) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + +#ifdef QUOTADEBUG + else + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); +#endif + return (error); +} + +/* + * Release dquots (and their references) if any. + * The inode should be locked EXCL except when this's called by + * xfs_ireclaim. + */ +void +xfs_qm_dqdetach( + xfs_inode_t *ip) +{ + if (!(ip->i_udquot || ip->i_gdquot)) + return; + + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); + if (ip->i_udquot) + xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip); + if (ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } +} + +/* + * This is called by VFS_SYNC and flags arg determines the caller, + * and its motives, as done in xfs_sync. + * + * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31 + * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25 + * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA + */ + +int +xfs_qm_sync( + xfs_mount_t *mp, + short flags) +{ + int recl, restarts; + xfs_dquot_t *dqp; + uint flush_flags; + boolean_t nowait; + int error; + + restarts = 0; + /* + * We won't block unless we are asked to. + */ + nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0); + + again: + xfs_qm_mplist_lock(mp); + /* + * dqpurge_all() also takes the mplist lock and iterate thru all dquots + * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared + * when we have the mplist lock, we know that dquots will be consistent + * as long as we have it locked. + */ + if (! XFS_IS_QUOTA_ON(mp)) { + xfs_qm_mplist_unlock(mp); + return (0); + } + FOREACH_DQUOT_IN_MP(dqp, mp) { + /* + * If this is vfs_sync calling, then skip the dquots that + * don't 'seem' to be dirty. ie. don't acquire dqlock. + * This is very similar to what xfs_sync does with inodes. + */ + if (flags & SYNC_BDFLUSH) { + if (! XFS_DQ_IS_DIRTY(dqp)) + continue; + } + + if (nowait) { + /* + * Try to acquire the dquot lock. We are NOT out of + * lock order, but we just don't want to wait for this + * lock, unless somebody wanted us to. + */ + if (! xfs_qm_dqlock_nowait(dqp)) + continue; + } else { + xfs_dqlock(dqp); + } + + /* + * Now, find out for sure if this dquot is dirty or not. + */ + if (! XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqunlock(dqp); + continue; + } + + /* XXX a sentinel would be better */ + recl = XFS_QI_MPLRECLAIMS(mp); + if (! xfs_qm_dqflock_nowait(dqp)) { + if (nowait) { + xfs_dqunlock(dqp); + continue; + } + /* + * If we can't grab the flush lock then if the caller + * really wanted us to give this our best shot, + * see if we can give a push to the buffer before we wait + * on the flush lock. At this point, we know that + * eventhough the dquot is being flushed, + * it has (new) dirty data. + */ + xfs_qm_dqflock_pushbuf_wait(dqp); + } + /* + * Let go of the mplist lock. We don't want to hold it + * across a disk write + */ + flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC; + xfs_qm_mplist_unlock(mp); + xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); + error = xfs_qm_dqflush(dqp, flush_flags); + xfs_dqunlock(dqp); + if (error && XFS_FORCED_SHUTDOWN(mp)) + return(0); /* Need to prevent umount failure */ + else if (error) + return (error); + + xfs_qm_mplist_lock(mp); + if (recl != XFS_QI_MPLRECLAIMS(mp)) { + if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) + break; + + xfs_qm_mplist_unlock(mp); + goto again; + } + } + + xfs_qm_mplist_unlock(mp); + return (0); +} + + +/* + * This initializes all the quota information that's kept in the + * mount structure + */ +int +xfs_qm_init_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qinf; + int error; + xfs_dquot_t *dqp; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Tell XQM that we exist as soon as possible. + */ + if ((error = xfs_qm_hold_quotafs_ref(mp))) { + return (error); + } + + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + + /* + * See if quotainodes are setup, and if not, allocate them, + * and change the superblock accordingly. + */ + if ((error = xfs_qm_init_quotainos(mp))) { + kmem_free(qinf, sizeof(xfs_quotainfo_t)); + mp->m_quotainfo = NULL; + return (error); + } + + spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin"); + xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); + qinf->qi_dqreclaims = 0; + + /* mutex used to serialize quotaoffs */ + mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff"); + + /* Precalc some constants */ + qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(qinf->qi_dqchunklen); + qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); + do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + + mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * We look at the USR dquot with id == 0 first, but if user quotas + * are not enabled we goto the GRP dquot with id == 0. + * We don't really care to keep separate default limits for user + * and group quotas, at least not at this point. + */ + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, + (XFS_IS_UQUOTA_RUNNING(mp)) ? + XFS_DQ_USER : XFS_DQ_GROUP, + XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, + &dqp); + if (! error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + qinf->qi_btimelimit = + INT_GET(ddqp->d_btimer, ARCH_CONVERT) ? + INT_GET(ddqp->d_btimer, ARCH_CONVERT) : + XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = + INT_GET(ddqp->d_itimer, ARCH_CONVERT) ? + INT_GET(ddqp->d_itimer, ARCH_CONVERT) : + XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = + INT_GET(ddqp->d_rtbtimer, ARCH_CONVERT) ? + INT_GET(ddqp->d_rtbtimer, ARCH_CONVERT) : + XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = + INT_GET(ddqp->d_bwarns, ARCH_CONVERT) ? + INT_GET(ddqp->d_bwarns, ARCH_CONVERT) : + XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = + INT_GET(ddqp->d_iwarns, ARCH_CONVERT) ? + INT_GET(ddqp->d_iwarns, ARCH_CONVERT) : + XFS_QM_IWARNLIMIT; + qinf->qi_bhardlimit = + INT_GET(ddqp->d_blk_hardlimit, ARCH_CONVERT); + qinf->qi_bsoftlimit = + INT_GET(ddqp->d_blk_softlimit, ARCH_CONVERT); + qinf->qi_ihardlimit = + INT_GET(ddqp->d_ino_hardlimit, ARCH_CONVERT); + qinf->qi_isoftlimit = + INT_GET(ddqp->d_ino_softlimit, ARCH_CONVERT); + qinf->qi_rtbhardlimit = + INT_GET(ddqp->d_rtb_hardlimit, ARCH_CONVERT); + qinf->qi_rtbsoftlimit = + INT_GET(ddqp->d_rtb_softlimit, ARCH_CONVERT); + + /* + * We sent the XFS_QMOPT_DQSUSER flag to dqget because + * we don't want this dquot cached. We haven't done a + * quotacheck yet, and quotacheck doesn't like incore dquots. + */ + xfs_qm_dqdestroy(dqp); + } else { + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + } + + return (0); +} + + +/* + * Gets called when unmounting a filesystem or when all quotas get + * turned off. + * This purges the quota inodes, destroys locks and frees itself. + */ +void +xfs_qm_destroy_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qi; + + qi = mp->m_quotainfo; + ASSERT(qi != NULL); + ASSERT(xfs_Gqm != NULL); + + /* + * Release the reference that XQM kept, so that we know + * when the XQM structure should be freed. We cannot assume + * that xfs_Gqm is non-null after this point. + */ + xfs_qm_rele_quotafs_ref(mp); + + spinlock_destroy(&qi->qi_pinlock); + xfs_qm_list_destroy(&qi->qi_dqlist); + + if (qi->qi_uquotaip) { + XFS_PURGE_INODE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + XFS_PURGE_INODE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + mutex_destroy(&qi->qi_quotaofflock); + kmem_free(qi, sizeof(xfs_quotainfo_t)); + mp->m_quotainfo = NULL; +} + + + +/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */ + +/* ARGSUSED */ +STATIC void +xfs_qm_list_init( + xfs_dqlist_t *list, + char *str, + int n) +{ + mutex_init(&list->qh_lock, MUTEX_DEFAULT, str); + list->qh_next = NULL; + list->qh_version = 0; + list->qh_nelems = 0; +} + +STATIC void +xfs_qm_list_destroy( + xfs_dqlist_t *list) +{ + mutex_destroy(&(list->qh_lock)); +} + + +/* + * Stripped down version of dqattach. This doesn't attach, or even look at the + * dquots attached to the inode. The rationale is that there won't be any + * attached at the time this is called from quotacheck. + */ +STATIC int +xfs_qm_dqget_noattach( + xfs_inode_t *ip, + xfs_dquot_t **O_udqpp, + xfs_dquot_t **O_gdqpp) +{ + int error; + xfs_mount_t *mp; + xfs_dquot_t *udqp, *gdqp; + + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + mp = ip->i_mount; + udqp = NULL; + gdqp = NULL; + + if (XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ip->i_udquot == NULL); + /* + * We want the dquot allocated if it doesn't exist. + */ + if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, + &udqp))) { + /* + * Shouldn't be able to turn off quotas here. + */ + ASSERT(error != ESRCH); + ASSERT(error != ENOENT); + return (error); + } + ASSERT(udqp); + } + + if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(ip->i_gdquot == NULL); + if (udqp) + xfs_dqunlock(udqp); + if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_gid, XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, + &gdqp))) { + if (udqp) + xfs_qm_dqrele(udqp); + ASSERT(error != ESRCH); + ASSERT(error != ENOENT); + return (error); + } + ASSERT(gdqp); + + /* Reacquire the locks in the right order */ + if (udqp) { + if (! xfs_qm_dqlock_nowait(udqp)) { + xfs_dqunlock(gdqp); + xfs_dqlock(udqp); + xfs_dqlock(gdqp); + } + } + } + + *O_udqpp = udqp; + *O_gdqpp = gdqp; + +#ifdef QUOTADEBUG + if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp)); + if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp)); +#endif + return (0); +} + +/* + * Create an inode and return with a reference already taken, but unlocked + * This is how we create quota inodes + */ +STATIC int +xfs_qm_qino_alloc( + xfs_mount_t *mp, + xfs_inode_t **ip, + __int64_t sbfields, + uint flags) +{ + xfs_trans_t *tp; + int error; + unsigned long s; + cred_t zerocr; + int committed; + + tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE); + if ((error = xfs_trans_reserve(tp, + XFS_QM_QINOCREATE_SPACE_RES(mp), + XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_CREATE_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + memset(&zerocr, 0, sizeof(zerocr)); + + if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0, + &zerocr, 0, 1, ip, &committed))) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return (error); + } + + /* + * Keep an extra reference to this quota inode. This inode is + * locked exclusively and joined to the transaction already. + */ + ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip)); + VN_HOLD(XFS_ITOV((*ip))); + + /* + * Make the changes in the superblock, and log those too. + * sbfields arg may contain fields other than *QUOTINO; + * VERSIONNUM for example. + */ + s = XFS_SB_LOCK(mp); + if (flags & XFS_QMOPT_SBVERSION) { +#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) + unsigned oldv = mp->m_sb.sb_versionnum; +#endif + ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb)); + ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == + (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); + + XFS_SB_VERSION_ADDQUOTA(&mp->m_sb); + mp->m_sb.sb_uquotino = NULLFSINO; + mp->m_sb.sb_gquotino = NULLFSINO; + + /* qflags will get updated _after_ quotacheck */ + mp->m_sb.sb_qflags = 0; +#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) + cmn_err(CE_NOTE, + "Old superblock version %x, converting to %x.", + oldv, mp->m_sb.sb_versionnum); +#endif + } + if (flags & XFS_QMOPT_UQUOTA) + mp->m_sb.sb_uquotino = (*ip)->i_ino; + else + mp->m_sb.sb_gquotino = (*ip)->i_ino; + XFS_SB_UNLOCK(mp, s); + xfs_mod_sb(tp, sbfields); + + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, + NULL))) { + xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); + return (error); + } + return (0); +} + + +STATIC int +xfs_qm_reset_dqcounts( + xfs_mount_t *mp, + xfs_buf_t *bp, + xfs_dqid_t id, + uint type) +{ + xfs_disk_dquot_t *ddq; + int j; + + xfs_buftrace("RESET DQUOTS", bp); + /* + * Reset all counters and timers. They'll be + * started afresh by xfs_qm_quotacheck. + */ +#ifdef DEBUG + j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + do_div(j, sizeof(xfs_dqblk_t)); + ASSERT(XFS_QM_DQPERBLK(mp) == j); +#endif + ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); + for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { + /* + * Do a sanity check, and if needed, repair the dqblk. Don't + * output any warnings because it's perfectly possible to + * find unitialized dquot blks. See comment in xfs_qm_dqcheck. + */ + (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); + INT_SET(ddq->d_bcount, ARCH_CONVERT, 0ULL); + INT_SET(ddq->d_icount, ARCH_CONVERT, 0ULL); + INT_SET(ddq->d_rtbcount, ARCH_CONVERT, 0ULL); + INT_SET(ddq->d_btimer, ARCH_CONVERT, (time_t)0); + INT_SET(ddq->d_itimer, ARCH_CONVERT, (time_t)0); + INT_SET(ddq->d_bwarns, ARCH_CONVERT, 0UL); + INT_SET(ddq->d_iwarns, ARCH_CONVERT, 0UL); + ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + } + + return (0); +} + +STATIC int +xfs_qm_dqiter_bufs( + xfs_mount_t *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags) +{ + xfs_buf_t *bp; + int error; + int notcommitted; + int incr; + + ASSERT(blkcnt > 0); + notcommitted = 0; + incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ? + XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt; + error = 0; + + /* + * Blkcnt arg can be a very big number, and might even be + * larger than the log itself. So, we have to break it up into + * manageable-sized transactions. + * Note that we don't start a permanent transaction here; we might + * not be able to get a log reservation for the whole thing up front, + * and we don't really care to either, because we just discard + * everything if we were to crash in the middle of this loop. + */ + while (blkcnt--) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); + if (error) + break; + + (void) xfs_qm_reset_dqcounts(mp, bp, firstid, + flags & XFS_QMOPT_UQUOTA ? + XFS_DQ_USER : XFS_DQ_GROUP); + xfs_bdwrite(mp, bp); + /* + * goto the next block. + */ + bno++; + firstid += XFS_QM_DQPERBLK(mp); + } + return (error); +} + +/* + * Iterate over all allocated USR/GRP dquots in the system, calling a + * caller supplied function for every chunk of dquots that we find. + */ +STATIC int +xfs_qm_dqiterate( + xfs_mount_t *mp, + xfs_inode_t *qip, + uint flags) +{ + xfs_bmbt_irec_t *map; + int i, nmaps; /* number of map entries */ + int error; /* return value */ + xfs_fileoff_t lblkno; + xfs_filblks_t maxlblkcnt; + xfs_dqid_t firstid; + xfs_fsblock_t rablkno; + xfs_filblks_t rablkcnt; + + error = 0; + /* + * This looks racey, but we can't keep an inode lock across a + * trans_reserve. But, this gets called during quotacheck, and that + * happens only at mount time which is single threaded. + */ + if (qip->i_d.di_nblocks == 0) + return (0); + + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + + lblkno = 0; + maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + do { + nmaps = XFS_DQITER_MAP_SIZE; + /* + * We aren't changing the inode itself. Just changing + * some of its data. No new blocks are added here, and + * the inode is never added to the transaction. + */ + xfs_ilock(qip, XFS_ILOCK_SHARED); + error = xfs_bmapi(NULL, qip, lblkno, + maxlblkcnt - lblkno, + XFS_BMAPI_METADATA, + NULL, + 0, map, &nmaps, NULL); + xfs_iunlock(qip, XFS_ILOCK_SHARED); + if (error) + break; + + ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); + for (i = 0; i < nmaps; i++) { + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + ASSERT(map[i].br_blockcount); + + + lblkno += map[i].br_blockcount; + + if (map[i].br_startblock == HOLESTARTBLOCK) + continue; + + firstid = (xfs_dqid_t) map[i].br_startoff * + XFS_QM_DQPERBLK(mp); + /* + * Do a read-ahead on the next extent. + */ + if ((i+1 < nmaps) && + (map[i+1].br_startblock != HOLESTARTBLOCK)) { + rablkcnt = map[i+1].br_blockcount; + rablkno = map[i+1].br_startblock; + while (rablkcnt--) { + xfs_baread(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + (int)XFS_QI_DQCHUNKLEN(mp)); + rablkno++; + } + } + /* + * Iterate thru all the blks in the extent and + * reset the counters of all the dquots inside them. + */ + if ((error = xfs_qm_dqiter_bufs(mp, + firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags))) { + break; + } + } + + if (error) + break; + } while (nmaps > 0); + + kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map)); + + return (error); +} + +/* + * Called by dqusage_adjust in doing a quotacheck. + * Given the inode, and a dquot (either USR or GRP, doesn't matter), + * this updates its incore copy as well as the buffer copy. This is + * so that once the quotacheck is done, we can just log all the buffers, + * as opposed to logging numerous updates to individual dquots. + */ +STATIC void +xfs_qm_quotacheck_dqadjust( + xfs_dquot_t *dqp, + xfs_qcnt_t nblks, + xfs_qcnt_t rtblks) +{ + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + xfs_dqtrace_entry(dqp, "QCHECK DQADJUST"); + /* + * Adjust the inode count and the block count to reflect this inode's + * resource usage. + */ + INT_MOD(dqp->q_core.d_icount, ARCH_CONVERT, +1); + dqp->q_res_icount++; + if (nblks) { + INT_MOD(dqp->q_core.d_bcount, ARCH_CONVERT, nblks); + dqp->q_res_bcount += nblks; + } + if (rtblks) { + INT_MOD(dqp->q_core.d_rtbcount, ARCH_CONVERT, rtblks); + dqp->q_res_rtbcount += rtblks; + } + + /* + * Set default limits, adjust timers (since we changed usages) + */ + if (! XFS_IS_SUSER_DQUOT(dqp)) { + xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); + xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; +} + +STATIC int +xfs_qm_get_rtblks( + xfs_inode_t *ip, + xfs_qcnt_t *O_rtblks) +{ + xfs_filblks_t rtblks; /* total rt blks */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + xfs_bmbt_rec_t *base; /* base of extent array */ + xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ + int error; + + ASSERT(XFS_IS_REALTIME_INODE(ip)); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) + return (error); + } + rtblks = 0; + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + base = &ifp->if_u1.if_extents[0]; + for (ep = base; ep < &base[nextents]; ep++) + rtblks += xfs_bmbt_get_blockcount(ep); + *O_rtblks = (xfs_qcnt_t)rtblks; + return (0); +} + +/* + * callback routine supplied to bulkstat(). Given an inumber, find its + * dquots and update them to account for resources taken by that inode. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqusage_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + void *private_data, /* not used */ + xfs_daddr_t bno, /* starting block of inode cluster */ + int *ubused, /* not used */ + void *dip, /* on-disk inode pointer (not used) */ + int *res) /* result code value */ +{ + xfs_inode_t *ip; + xfs_dquot_t *udqp, *gdqp; + xfs_qcnt_t nblks, rtblks; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * rootino must have its resources accounted for, not so with the quota + * inodes. + */ + if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + *res = BULKSTAT_RV_NOTHING; + return XFS_ERROR(EINVAL); + } + + /* + * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget + * interface expects the inode to be exclusively locked because that's + * the case in all other instances. It's OK that we do this because + * quotacheck is done only at mount time. + */ + if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { + *res = BULKSTAT_RV_NOTHING; + return (error); + } + + if (ip->i_d.di_mode == 0) { + xfs_iput_new(ip, XFS_ILOCK_EXCL); + *res = BULKSTAT_RV_NOTHING; + return XFS_ERROR(ENOENT); + } + + /* + * Obtain the locked dquots. In case of an error (eg. allocation + * fails for ENOSPC), we return the negative of the error number + * to bulkstat, so that it can get propagated to quotacheck() and + * making us disable quotas for the file system. + */ + if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { + xfs_iput(ip, XFS_ILOCK_EXCL); + *res = BULKSTAT_RV_GIVEUP; + return (error); + } + + rtblks = 0; + if (! XFS_IS_REALTIME_INODE(ip)) { + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks; + } else { + /* + * Walk thru the extent list and count the realtime blocks. + */ + if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { + xfs_iput(ip, XFS_ILOCK_EXCL); + if (udqp) + xfs_qm_dqput(udqp); + if (gdqp) + xfs_qm_dqput(gdqp); + *res = BULKSTAT_RV_GIVEUP; + return (error); + } + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + } + ASSERT(ip->i_delayed_blks == 0); + + /* + * We can't release the inode while holding its dquot locks. + * The inode can go into inactive and might try to acquire the dquotlocks. + * So, just unlock here and do a vn_rele at the end. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * Add the (disk blocks and inode) resources occupied by this + * inode to its dquots. We do this adjustment in the incore dquot, + * and also copy the changes to its buffer. + * We don't care about putting these changes in a transaction + * envelope because if we crash in the middle of a 'quotacheck' + * we have to start from the beginning anyway. + * Once we're done, we'll log all the dquot bufs. + * + * The *QUOTA_ON checks below may look pretty racey, but quotachecks + * and quotaoffs don't race. (Quotachecks happen at mount time only). + */ + if (XFS_IS_UQUOTA_ON(mp)) { + ASSERT(udqp); + xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); + xfs_qm_dqput(udqp); + } + if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(gdqp); + xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); + xfs_qm_dqput(gdqp); + } + /* + * Now release the inode. This will send it to 'inactive', and + * possibly even free blocks. + */ + VN_RELE(XFS_ITOV(ip)); + + /* + * Goto next inode. + */ + *res = BULKSTAT_RV_DIDONE; + return (0); +} + +/* + * Walk thru all the filesystem inodes and construct a consistent view + * of the disk quota world. If the quotacheck fails, disable quotas. + */ +int +xfs_qm_quotacheck( + xfs_mount_t *mp) +{ + int done, count, error; + xfs_ino_t lastino; + size_t structsz; + xfs_inode_t *uip, *gip; + uint flags; + + count = INT_MAX; + structsz = 1; + lastino = 0; + flags = 0; + + ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * There should be no cached dquots. The (simplistic) quotacheck + * algorithm doesn't like that. + */ + ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); + + cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); + + /* + * First we go thru all the dquots on disk, USR and GRP, and reset + * their counters to zero. We need a clean slate. + * We don't log our changes till later. + */ + if ((uip = XFS_QI_UQIP(mp))) { + if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) + goto error_return; + flags |= XFS_UQUOTA_CHKD; + } + + if ((gip = XFS_QI_GQIP(mp))) { + if ((error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA))) + goto error_return; + flags |= XFS_GQUOTA_CHKD; + } + + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters in core. + */ + if ((error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, NULL, + structsz, NULL, + BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED, + &done))) + break; + + } while (! done); + + /* + * We can get this error if we couldn't do a dquot allocation inside + * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the + * dirty dquots that might be cached, we just want to get rid of them + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). + */ + if (error) { + xfs_qm_dqpurge_all(mp, + XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA| + XFS_QMOPT_QUOTAOFF); + goto error_return; + } + /* + * We've made all the changes that we need to make incore. + * Now flush_them down to disk buffers. + */ + xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); + + /* + * We didn't log anything, because if we crashed, we'll have to + * start the quotacheck from scratch anyway. However, we must make + * sure that our dquot changes are secure before we put the + * quotacheck'd stamp on the superblock. So, here we do a synchronous + * flush. + */ + XFS_bflush(mp->m_ddev_targp); + + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + mp->m_qflags &= ~(XFS_GQUOTA_CHKD | XFS_UQUOTA_CHKD); + mp->m_qflags |= flags; + + XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); + + error_return: + if (error) { + cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " + "Disabling quotas.", + mp->m_fsname, error); + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo != NULL); + ASSERT(xfs_Gqm != NULL); + xfs_qm_destroy_quotainfo(mp); + xfs_mount_reset_sbqflags(mp); + } else { + cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); + } + return (error); +} + +/* + * This is called after the superblock has been read in and we're ready to + * iget the quota inodes. + */ +STATIC int +xfs_qm_init_quotainos( + xfs_mount_t *mp) +{ + xfs_inode_t *uip, *gip; + int error; + __int64_t sbflags; + uint flags; + + ASSERT(mp->m_quotainfo); + uip = gip = NULL; + sbflags = 0; + flags = 0; + + /* + * Get the uquota and gquota inodes + */ + if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) { + if (XFS_IS_UQUOTA_ON(mp) && + mp->m_sb.sb_uquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_uquotino > 0); + if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip, 0))) + return XFS_ERROR(error); + } + if (XFS_IS_GQUOTA_ON(mp) && + mp->m_sb.sb_gquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_gquotino > 0); + if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip, 0))) { + if (uip) + VN_RELE(XFS_ITOV(uip)); + return XFS_ERROR(error); + } + } + } else { + flags |= XFS_QMOPT_SBVERSION; + sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_QFLAGS); + } + + /* + * Create the two inodes, if they don't exist already. The changes + * made above will get added to a transaction and logged in one of + * the qino_alloc calls below. If the device is readonly, + * temporarily switch to read-write to do this. + */ + if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { + if ((error = xfs_qm_qino_alloc(mp, &uip, + sbflags | XFS_SB_UQUOTINO, + flags | XFS_QMOPT_UQUOTA))) + return XFS_ERROR(error); + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { + if ((error = xfs_qm_qino_alloc(mp, &gip, + sbflags | XFS_SB_GQUOTINO, + flags | XFS_QMOPT_GQUOTA))) { + if (uip) + VN_RELE(XFS_ITOV(uip)); + + return XFS_ERROR(error); + } + } + + XFS_QI_UQIP(mp) = uip; + XFS_QI_GQIP(mp) = gip; + + return (0); +} + + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. This operation races with dqlookup(), and attempts to + * favor the lookup function ... + * XXXsup merge this with qm_reclaim_one(). + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed; + xfs_dqhash_t *hash; + xfs_dquot_t *dqp, *nextdqp; + int restarts; + int nflushes; + + if (howmany <= 0) + return (0); + + nreclaimed = 0; + restarts = 0; + nflushes = 0; + +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); +#endif + /* lock order is : hashchainlock, freelistlock, mplistlock */ + tryagain: + xfs_qm_freelist_lock(xfs_Gqm); + + for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; + ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && + nreclaimed < howmany); ) { + xfs_dqlock(dqp); + + /* + * We are racing with dqlookup here. Naturally we don't + * want to reclaim a dquot that lookup wants. + */ + if (dqp->dq_flags & XFS_DQ_WANT) { + xfs_dqunlock(dqp); + xfs_qm_freelist_unlock(xfs_Gqm); + if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return (nreclaimed); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); + goto tryagain; + } + + /* + * If the dquot is inactive, we are assured that it is + * not on the mplist or the hashlist, and that makes our + * life easier. + */ + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(dqp->q_mount == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(dqp->HL_PREVP == NULL); + ASSERT(dqp->MPL_PREVP == NULL); + XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); + nextdqp = dqp->dq_flnext; + goto off_freelist; + } + + ASSERT(dqp->MPL_PREVP); + /* + * Try to grab the flush lock. If this dquot is in the process of + * getting flushed to disk, we don't want to reclaim it. + */ + if (! xfs_qm_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + dqp = dqp->dq_flnext; + continue; + } + + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); + /* + * We flush it delayed write, so don't bother + * releasing the mplock. + */ + (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ + dqp = dqp->dq_flnext; + continue; + } + /* + * We're trying to get the hashlock out of order. This races + * with dqlookup; so, we giveup and goto the next dquot if + * we couldn't get the hashlock. This way, we won't starve + * a dqlookup process that holds the hashlock that is + * waiting for the freelist lock. + */ + if (! xfs_qm_dqhashlock_nowait(dqp)) { + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + dqp = dqp->dq_flnext; + continue; + } + /* + * This races with dquot allocation code as well as dqflush_all + * and reclaim code. So, if we failed to grab the mplist lock, + * giveup everything and start over. + */ + hash = dqp->q_hash; + ASSERT(hash); + if (! xfs_qm_mplist_nowait(dqp->q_mount)) { + /* XXX put a sentinel so that we can come back here */ + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + XFS_DQ_HASH_UNLOCK(hash); + xfs_qm_freelist_unlock(xfs_Gqm); + if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return (nreclaimed); + goto tryagain; + } + xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING"); +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", + dqp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT)); +#endif + ASSERT(dqp->q_nrefs == 0); + nextdqp = dqp->dq_flnext; + XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); + XQM_HASHLIST_REMOVE(hash, dqp); + xfs_dqfunlock(dqp); + xfs_qm_mplist_unlock(dqp->q_mount); + XFS_DQ_HASH_UNLOCK(hash); + + off_freelist: + XQM_FREELIST_REMOVE(dqp); + xfs_dqunlock(dqp); + nreclaimed++; + XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); + xfs_qm_dqdestroy(dqp); + dqp = nextdqp; + } + xfs_qm_freelist_unlock(xfs_Gqm); + return (nreclaimed); +} + + +/* + * The kmem_shake interface is invoked when memory is running low. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_shake(int nr_to_scan, unsigned int gfp_mask) +{ + int ndqused, nfree, n; + + if (!kmem_shake_allow(gfp_mask)) + return (0); + if (!xfs_Gqm) + return (0); + + nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ + /* incore dquots in all f/s's */ + ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; + + ASSERT(ndqused >= 0); + + if (nfree <= ndqused && nfree < ndquot) + return (0); + + ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ + n = nfree - ndqused - ndquot; /* # over target */ + + return xfs_qm_shake_freelist(MAX(nfree, n)); +} + + +/* + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. + */ +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) +{ + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; + int restarts; + int nflushes; + + restarts = 0; + dqpout = NULL; + nflushes = 0; + + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ + startagain: + xfs_qm_freelist_lock(xfs_Gqm); + + FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) { + xfs_dqlock(dqp); + + /* + * We are racing with dqlookup here. Naturally we don't + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. + */ + if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT"); + xfs_dqunlock(dqp); + xfs_qm_freelist_unlock(xfs_Gqm); + if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return (NULL); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); + goto startagain; + } + + /* + * If the dquot is inactive, we are assured that it is + * not on the mplist or the hashlist, and that makes our + * life easier. + */ + if (dqp->dq_flags & XFS_DQ_INACTIVE) { + ASSERT(dqp->q_mount == NULL); + ASSERT(! XFS_DQ_IS_DIRTY(dqp)); + ASSERT(dqp->HL_PREVP == NULL); + ASSERT(dqp->MPL_PREVP == NULL); + XQM_FREELIST_REMOVE(dqp); + xfs_dqunlock(dqp); + dqpout = dqp; + XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); + break; + } + + ASSERT(dqp->q_hash); + ASSERT(dqp->MPL_PREVP); + + /* + * Try to grab the flush lock. If this dquot is in the process of + * getting flushed to disk, we don't want to reclaim it. + */ + if (! xfs_qm_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + continue; + } + + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); + /* + * We flush it delayed write, so don't bother + * releasing the freelist lock. + */ + (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); + xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ + continue; + } + + if (! xfs_qm_mplist_nowait(dqp->q_mount)) { + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + continue; + } + + if (! xfs_qm_dqhashlock_nowait(dqp)) + goto mplistunlock; + + ASSERT(dqp->q_nrefs == 0); + xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING"); + XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); + XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); + XQM_FREELIST_REMOVE(dqp); + dqpout = dqp; + XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mplistunlock: + xfs_qm_mplist_unlock(dqp->q_mount); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + if (dqpout) + break; + } + + xfs_qm_freelist_unlock(xfs_Gqm); + return (dqpout); +} + + +/*------------------------------------------------------------------*/ + +/* + * Return a new incore dquot. Depending on the number of + * dquots in the system, we either allocate a new one on the kernel heap, + * or reclaim a free one. + * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed + * to reclaim an existing one from the freelist. + */ +boolean_t +xfs_qm_dqalloc_incore( + xfs_dquot_t **O_dqpp) +{ + xfs_dquot_t *dqp; + + /* + * Check against high water mark to see if we want to pop + * a nincompoop dquot off the freelist. + */ + if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { + /* + * Try to recycle a dquot from the freelist. + */ + if ((dqp = xfs_qm_dqreclaim_one())) { + XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + /* + * Just zero the core here. The rest will get + * reinitialized by caller. XXX we shouldn't even + * do this zero ... + */ + memset(&dqp->q_core, 0, sizeof(dqp->q_core)); + *O_dqpp = dqp; + return (B_FALSE); + } + XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + } + + /* + * Allocate a brand new dquot on the kernel heap and return it + * to the caller to initialize. + */ + ASSERT(xfs_Gqm->qm_dqzone != NULL); + *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + atomic_inc(&xfs_Gqm->qm_totaldquots); + + return (B_TRUE); +} + + +/* + * Start a transaction and write the incore superblock changes to + * disk. flags parameter indicates which fields have changed. + */ +int +xfs_qm_write_sb_changes( + xfs_mount_t *mp, + __int64_t flags) +{ + xfs_trans_t *tp; + int error; + +#ifdef QUOTADEBUG + cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); +#endif + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + if ((error = xfs_trans_reserve(tp, 0, + mp->m_sb.sb_sectsize + 128, 0, + 0, + XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + xfs_mod_sb(tp, flags); + (void) xfs_trans_commit(tp, 0, NULL); + + return (0); +} + + +/* --------------- utility functions for vnodeops ---------------- */ + + +/* + * Given an inode, a uid and gid (from cred_t) make sure that we have + * allocated relevant dquot(s) on disk, and that we won't exceed inode + * quotas by creating this file. + * This also attaches dquot(s) to the given inode after locking it, + * and returns the dquots corresponding to the uid and/or gid. + * + * in : inode (unlocked) + * out : udquot, gdquot with references taken and unlocked + */ +int +xfs_qm_vop_dqalloc( + xfs_mount_t *mp, + xfs_inode_t *ip, + uid_t uid, + gid_t gid, + uint flags, + xfs_dquot_t **O_udqpp, + xfs_dquot_t **O_gdqpp) +{ + int error; + xfs_dquot_t *uq, *gq; + uint lockflags; + + if (!XFS_IS_QUOTA_ON(mp)) + return 0; + + lockflags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockflags); + + if ((flags & XFS_QMOPT_INHERIT) && + XFS_INHERIT_GID(ip, XFS_MTOVFS(mp))) + gid = ip->i_d.di_gid; + + /* + * Attach the dquot(s) to this inode, doing a dquot allocation + * if necessary. The dquot(s) will not be locked. + */ + if (XFS_NOT_DQATTACHED(mp, ip)) { + if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC | + XFS_QMOPT_ILOCKED))) { + xfs_iunlock(ip, lockflags); + return (error); + } + } + + uq = gq = NULL; + if ((flags & XFS_QMOPT_UQUOTA) && + XFS_IS_UQUOTA_ON(mp)) { + if (ip->i_d.di_uid != uid) { + /* + * What we need is the dquot that has this uid, and + * if we send the inode to dqget, the uid of the inode + * takes priority over what's sent in the uid argument. + * We must unlock inode here before calling dqget if + * we're not sending the inode, because otherwise + * we'll deadlock by doing trans_reserve while + * holding ilock. + */ + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + XFS_DQ_USER, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &uq))) { + ASSERT(error != ENOENT); + return (error); + } + /* + * Get the ilock in the right order. + */ + xfs_dqunlock(uq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + /* + * Take an extra reference, because we'll return + * this to caller + */ + ASSERT(ip->i_udquot); + uq = ip->i_udquot; + xfs_dqlock(uq); + XFS_DQHOLD(uq); + xfs_dqunlock(uq); + } + } + if ((flags & XFS_QMOPT_GQUOTA) && + XFS_IS_GQUOTA_ON(mp)) { + if (ip->i_d.di_gid != gid) { + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq))) { + if (uq) + xfs_qm_dqrele(uq); + ASSERT(error != ENOENT); + return (error); + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = ip->i_gdquot; + xfs_dqlock(gq); + XFS_DQHOLD(gq); + xfs_dqunlock(gq); + } + } + if (uq) + xfs_dqtrace_entry_ino(uq, "DQALLOC", ip); + + xfs_iunlock(ip, lockflags); + if (O_udqpp) + *O_udqpp = uq; + else if (uq) + xfs_qm_dqrele(uq); + if (O_gdqpp) + *O_gdqpp = gq; + else if (gq) + xfs_qm_dqrele(gq); + return (0); +} + +/* + * Actually transfer ownership, and do dquot modifications. + * These were already reserved. + */ +xfs_dquot_t * +xfs_qm_vop_chown( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t **IO_olddq, + xfs_dquot_t *newdq) +{ + xfs_dquot_t *prevdq; + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + + /* old dquot */ + prevdq = *IO_olddq; + ASSERT(prevdq); + ASSERT(prevdq != newdq); + + xfs_trans_mod_dquot(tp, prevdq, + XFS_TRANS_DQ_BCOUNT, + -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, + XFS_TRANS_DQ_ICOUNT, + -1); + + /* the sparkling new dquot */ + xfs_trans_mod_dquot(tp, newdq, + XFS_TRANS_DQ_BCOUNT, + ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, + XFS_TRANS_DQ_ICOUNT, + 1); + + /* + * Take an extra reference, because the inode + * is going to keep this dquot pointer even + * after the trans_commit. + */ + xfs_dqlock(newdq); + XFS_DQHOLD(newdq); + xfs_dqunlock(newdq); + *IO_olddq = newdq; + + return (prevdq); +} + +/* + * Quota reservations for setattr(AT_UID|AT_GID). + */ +int +xfs_qm_vop_chown_reserve( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp, + uint flags) +{ + int error; + xfs_mount_t *mp; + uint delblks; + xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; + + ASSERT(XFS_ISLOCKED_INODE(ip)); + mp = ip->i_mount; + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + delblks = ip->i_delayed_blks; + delblksudq = delblksgdq = unresudq = unresgdq = NULL; + + if (XFS_IS_UQUOTA_ON(mp) && udqp && + ip->i_d.di_uid != (uid_t)INT_GET(udqp->q_core.d_id, ARCH_CONVERT)) { + delblksudq = udqp; + /* + * If there are delayed allocation blocks, then we have to + * unreserve those from the old dquot, and add them to the + * new dquot. + */ + if (delblks) { + ASSERT(ip->i_udquot); + unresudq = ip->i_udquot; + } + } + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && + ip->i_d.di_gid != INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) { + delblksgdq = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + unresgdq = ip->i_gdquot; + } + } + + if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, + flags | XFS_QMOPT_RES_REGBLKS))) + return (error); + + /* + * Do the delayed blks reservations/unreservations now. Since, these + * are done without the help of a transaction, if a reservation fails + * its previous reservations won't be automatically undone by trans + * code. So, we have to do it manually here. + */ + if (delblks) { + /* + * Do the reservations first. Unreservation can't fail. + */ + ASSERT(delblksudq || delblksgdq); + ASSERT(unresudq || unresgdq); + if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, + flags | XFS_QMOPT_RES_REGBLKS))) + return (error); + xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, + XFS_QMOPT_RES_REGBLKS); + } + + return (0); +} + +int +xfs_qm_vop_rename_dqattach( + xfs_inode_t **i_tab) +{ + xfs_inode_t *ip; + int i; + int error; + + ip = i_tab[0]; + + if (! XFS_IS_QUOTA_ON(ip->i_mount)) + return (0); + + if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return (error); + } + for (i = 1; (i < 4 && i_tab[i]); i++) { + /* + * Watch out for duplicate entries in the table. + */ + if ((ip = i_tab[i]) != i_tab[i-1]) { + if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return (error); + } + } + } + return (0); +} + +void +xfs_qm_vop_dqattach_and_dqmod_newinode( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp) +{ + if (!XFS_IS_QUOTA_ON(tp->t_mountp)) + return; + + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + + if (udqp) { + xfs_dqlock(udqp); + XFS_DQHOLD(udqp); + xfs_dqunlock(udqp); + ASSERT(ip->i_udquot == NULL); + ip->i_udquot = udqp; + ASSERT(ip->i_d.di_uid == INT_GET(udqp->q_core.d_id, ARCH_CONVERT)); + xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (gdqp) { + xfs_dqlock(gdqp); + XFS_DQHOLD(gdqp); + xfs_dqunlock(gdqp); + ASSERT(ip->i_gdquot == NULL); + ip->i_gdquot = gdqp; + ASSERT(ip->i_d.di_gid == INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)); + xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); + } +} + +/* ------------- list stuff -----------------*/ +void +xfs_qm_freelist_init(xfs_frlist_t *ql) +{ + ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; + mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf"); + ql->qh_version = 0; + ql->qh_nelems = 0; +} + +void +xfs_qm_freelist_destroy(xfs_frlist_t *ql) +{ + xfs_dquot_t *dqp, *nextdqp; + + mutex_lock(&ql->qh_lock, PINOD); + for (dqp = ql->qh_next; + dqp != (xfs_dquot_t *)ql; ) { + xfs_dqlock(dqp); + nextdqp = dqp->dq_flnext; +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); +#endif + XQM_FREELIST_REMOVE(dqp); + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + dqp = nextdqp; + } + /* + * Don't bother about unlocking. + */ + mutex_destroy(&ql->qh_lock); + + ASSERT(ql->qh_nelems == 0); +} + +void +xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) +{ + dq->dq_flnext = ql->qh_next; + dq->dq_flprev = (xfs_dquot_t *)ql; + ql->qh_next = dq; + dq->dq_flnext->dq_flprev = dq; + xfs_Gqm->qm_dqfreelist.qh_nelems++; + xfs_Gqm->qm_dqfreelist.qh_version++; +} + +void +xfs_qm_freelist_unlink(xfs_dquot_t *dq) +{ + xfs_dquot_t *next = dq->dq_flnext; + xfs_dquot_t *prev = dq->dq_flprev; + + next->dq_flprev = prev; + prev->dq_flnext = next; + dq->dq_flnext = dq->dq_flprev = dq; + xfs_Gqm->qm_dqfreelist.qh_nelems--; + xfs_Gqm->qm_dqfreelist.qh_version++; +} + +void +xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) +{ + xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); +} + +int +xfs_qm_dqhashlock_nowait( + xfs_dquot_t *dqp) +{ + int locked; + + locked = mutex_trylock(&((dqp)->q_hash->qh_lock)); + return (locked); +} + +int +xfs_qm_freelist_lock_nowait( + xfs_qm_t *xqm) +{ + int locked; + + locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock)); + return (locked); +} + +int +xfs_qm_mplist_nowait( + xfs_mount_t *mp) +{ + int locked; + + ASSERT(mp->m_quotainfo); + locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp))); + return (locked); +} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h new file mode 100644 index 00000000000..dcf1a7a831d --- /dev/null +++ b/fs/xfs/quota/xfs_qm.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_QM_H__ +#define __XFS_QM_H__ + +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_quota_priv.h" +#include "xfs_qm_stats.h" + +struct xfs_qm; +struct xfs_inode; + +extern mutex_t xfs_Gqm_lock; +extern struct xfs_qm *xfs_Gqm; +extern kmem_zone_t *qm_dqzone; +extern kmem_zone_t *qm_dqtrxzone; + +/* + * Used in xfs_qm_sync called by xfs_sync to count the max times that it can + * iterate over the mountpt's dquot list in one call. + */ +#define XFS_QM_SYNC_MAX_RESTARTS 7 + +/* + * Ditto, for xfs_qm_dqreclaim_one. + */ +#define XFS_QM_RECLAIM_MAX_RESTARTS 4 + +/* + * Ideal ratio of free to in use dquots. Quota manager makes an attempt + * to keep this balance. + */ +#define XFS_QM_DQFREE_RATIO 2 + +/* + * Dquot hashtable constants/threshold values. + */ +#define XFS_QM_NCSIZE_THRESHOLD 5000 +#define XFS_QM_HASHSIZE_LOW 32 +#define XFS_QM_HASHSIZE_HIGH 64 + +/* + * We output a cmn_err when quotachecking a quota file with more than + * this many fsbs. + */ +#define XFS_QM_BIG_QCHECK_NBLKS 500 + +/* + * This defines the unit of allocation of dquots. + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * XXXsup However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 +/* + * When doing a quotacheck, we log dquot clusters of this many FSBs at most + * in a single transaction. We don't want to ask for too huge a log reservation. + */ +#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 + +typedef xfs_dqhash_t xfs_dqlist_t; +/* + * The freelist head. The first two fields match the first two in the + * xfs_dquot_t structure (in xfs_dqmarker_t) + */ +typedef struct xfs_frlist { + struct xfs_dquot *qh_next; + struct xfs_dquot *qh_prev; + mutex_t qh_lock; + uint qh_version; + uint qh_nelems; +} xfs_frlist_t; + +/* + * Quota Manager (global) structure. Lives only in core. + */ +typedef struct xfs_qm { + xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ + xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ + uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ + xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ + atomic_t qm_totaldquots; /* total incore dquots */ + uint qm_nrefs; /* file systems with quota on */ + int qm_dqfree_ratio;/* ratio of free to inuse dquots */ + kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ + kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ +} xfs_qm_t; + +/* + * Various quota information for individual filesystems. + * The mount structure keeps a pointer to this. + */ +typedef struct xfs_quotainfo { + xfs_inode_t *qi_uquotaip; /* user quota inode */ + xfs_inode_t *qi_gquotaip; /* group quota inode */ + lock_t qi_pinlock; /* dquot pinning mutex */ + xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ + int qi_dqreclaims; /* a change here indicates + a removal in the dqlist */ + time_t qi_btimelimit; /* limit for blks timer */ + time_t qi_itimelimit; /* limit for inodes timer */ + time_t qi_rtbtimelimit;/* limit for rt blks timer */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for num warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for num warnings */ + mutex_t qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ + xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ +} xfs_quotainfo_t; + + +extern xfs_dqtrxops_t xfs_trans_dquot_ops; + +extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, + xfs_dquot_t *, xfs_dquot_t *, long, long, uint); +extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); +extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); + +/* + * We keep the usr and grp dquots separately so that locking will be easier + * to do at commit time. All transactions that we know of at this point + * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. + */ +#define XFS_QM_TRANS_MAXDQS 2 +typedef struct xfs_dquot_acct { + xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; + xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; +} xfs_dquot_acct_t; + +/* + * Users are allowed to have a usage exceeding their softlimit for + * a period this long. + */ +#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ + +#define XFS_QM_BWARNLIMIT 5 +#define XFS_QM_IWARNLIMIT 5 + +#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) +#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) +#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) +#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) + +extern void xfs_mount_reset_sbqflags(xfs_mount_t *); + +extern int xfs_qm_init_quotainfo(xfs_mount_t *); +extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); +extern int xfs_qm_mount_quotas(xfs_mount_t *, int); +extern void xfs_qm_mount_quotainit(xfs_mount_t *, uint); +extern int xfs_qm_quotacheck(xfs_mount_t *); +extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); +extern int xfs_qm_unmount_quotas(xfs_mount_t *); +extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); +extern int xfs_qm_sync(xfs_mount_t *, short); + +/* dquot stuff */ +extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); +extern int xfs_qm_dqattach(xfs_inode_t *, uint); +extern void xfs_qm_dqdetach(xfs_inode_t *); +extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); +extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); + +/* vop stuff */ +extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, + uid_t, gid_t, uint, + xfs_dquot_t **, xfs_dquot_t **); +extern void xfs_qm_vop_dqattach_and_dqmod_newinode( + xfs_trans_t *, xfs_inode_t *, + xfs_dquot_t *, xfs_dquot_t *); +extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **); +extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *, + xfs_dquot_t **, xfs_dquot_t *); +extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *, + xfs_dquot_t *, xfs_dquot_t *, uint); + +/* list stuff */ +extern void xfs_qm_freelist_init(xfs_frlist_t *); +extern void xfs_qm_freelist_destroy(xfs_frlist_t *); +extern void xfs_qm_freelist_insert(xfs_frlist_t *, xfs_dquot_t *); +extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); +extern void xfs_qm_freelist_unlink(xfs_dquot_t *); +extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *); +extern int xfs_qm_mplist_nowait(xfs_mount_t *); +extern int xfs_qm_dqhashlock_nowait(xfs_dquot_t *); + +/* system call interface */ +extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t); + +#ifdef DEBUG +extern int xfs_qm_internalqcheck(xfs_mount_t *); +#else +#define xfs_qm_internalqcheck(mp) (0) +#endif + +#endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c new file mode 100644 index 00000000000..be67d9c265f --- /dev/null +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_clnt.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" + +#include "xfs_qm.h" + +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ + +STATIC int +xfs_qm_parseargs( + struct bhv_desc *bhv, + char *options, + struct xfs_mount_args *args, + int update) +{ + size_t length; + char *local_options = options; + char *this_char; + int error; + int referenced = update; + + while ((this_char = strsep(&local_options, ",")) != NULL) { + length = strlen(this_char); + if (local_options) + length++; + + if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); + args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); + referenced = update; + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; + referenced = 1; + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + args->flags |= XFSMNT_UQUOTA; + args->flags &= ~XFSMNT_UQUOTAENF; + referenced = 1; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; + referenced = 1; + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + args->flags |= XFSMNT_GQUOTA; + args->flags &= ~XFSMNT_GQUOTAENF; + referenced = 1; + } else { + if (local_options) + *(local_options-1) = ','; + continue; + } + + while (length--) + *this_char++ = ','; + } + + PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error); + if (!error && !referenced) + bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM); + return error; +} + +STATIC int +xfs_qm_showargs( + struct bhv_desc *bhv, + struct seq_file *m) +{ + struct vfs *vfsp = bhvtovfs(bhv); + struct xfs_mount *mp = XFS_VFSTOM(vfsp); + int error; + + if (mp->m_qflags & XFS_UQUOTA_ACCT) { + (mp->m_qflags & XFS_UQUOTA_ENFD) ? + seq_puts(m, "," MNTOPT_USRQUOTA) : + seq_puts(m, "," MNTOPT_UQUOTANOENF); + } + + if (mp->m_qflags & XFS_GQUOTA_ACCT) { + (mp->m_qflags & XFS_GQUOTA_ENFD) ? + seq_puts(m, "," MNTOPT_GRPQUOTA) : + seq_puts(m, "," MNTOPT_GQUOTANOENF); + } + + if (!(mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) + seq_puts(m, "," MNTOPT_NOQUOTA); + + PVFS_SHOWARGS(BHV_NEXT(bhv), m, error); + return error; +} + +STATIC int +xfs_qm_mount( + struct bhv_desc *bhv, + struct xfs_mount_args *args, + struct cred *cr) +{ + struct vfs *vfsp = bhvtovfs(bhv); + struct xfs_mount *mp = XFS_VFSTOM(vfsp); + int error; + + if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA)) + xfs_qm_mount_quotainit(mp, args->flags); + PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error); + return error; +} + +STATIC int +xfs_qm_syncall( + struct bhv_desc *bhv, + int flags, + cred_t *credp) +{ + struct vfs *vfsp = bhvtovfs(bhv); + struct xfs_mount *mp = XFS_VFSTOM(vfsp); + int error; + + /* + * Get the Quota Manager to flush the dquots. + */ + if (XFS_IS_QUOTA_ON(mp)) { + if ((error = xfs_qm_sync(mp, flags))) { + /* + * If we got an IO error, we will be shutting down. + * So, there's nothing more for us to do here. + */ + ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp)); + if (XFS_FORCED_SHUTDOWN(mp)) { + return XFS_ERROR(error); + } + } + } + PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error); + return error; +} + +/* + * Clear the quotaflags in memory and in the superblock. + */ +void +xfs_mount_reset_sbqflags( + xfs_mount_t *mp) +{ + xfs_trans_t *tp; + unsigned long s; + + mp->m_qflags = 0; + /* + * It is OK to look at sb_qflags here in mount path, + * without SB_LOCK. + */ + if (mp->m_sb.sb_qflags == 0) + return; + s = XFS_SB_LOCK(mp); + mp->m_sb.sb_qflags = 0; + XFS_SB_UNLOCK(mp, s); + + /* + * if the fs is readonly, let the incore superblock run + * with quotas off but don't flush the update out to disk + */ + if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + return; +#ifdef QUOTADEBUG + xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); +#endif + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT)) { + xfs_trans_cancel(tp, 0); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_mount_reset_sbqflags: Superblock update failed!"); + return; + } + xfs_mod_sb(tp, XFS_SB_QFLAGS); + xfs_trans_commit(tp, 0, NULL); +} + +STATIC int +xfs_qm_newmount( + xfs_mount_t *mp, + uint *needquotamount, + uint *quotaflags) +{ + uint quotaondisk; + uint uquotaondisk = 0, gquotaondisk = 0; + + *quotaflags = 0; + *needquotamount = B_FALSE; + + quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && + mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT); + + if (quotaondisk) { + uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; + } + + /* + * If the device itself is read-only, we can't allow + * the user to change the state of quota on the mount - + * this would generate a transaction on the ro device, + * which would lead to an I/O error and shutdown + */ + + if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || + (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || + (!gquotaondisk && XFS_IS_GQUOTA_ON(mp))) && + xfs_dev_is_read_only(mp, "changing quota state")) { + cmn_err(CE_WARN, + "XFS: please mount with%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : "")); + return XFS_ERROR(EPERM); + } + + if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { + /* + * Call mount_quotas at this point only if we won't have to do + * a quotacheck. + */ + if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { + /* + * If an error occured, qm_mount_quotas code + * has already disabled quotas. So, just finish + * mounting, and get on with the boring life + * without disk quotas. + */ + xfs_qm_mount_quotas(mp, 0); + } else { + /* + * Clear the quota flags, but remember them. This + * is so that the quota code doesn't get invoked + * before we're ready. This can happen when an + * inode goes inactive and wants to free blocks, + * or via xfs_log_mount_finish. + */ + *needquotamount = B_TRUE; + *quotaflags = mp->m_qflags; + mp->m_qflags = 0; + } + } + + return 0; +} + +STATIC int +xfs_qm_endmount( + xfs_mount_t *mp, + uint needquotamount, + uint quotaflags, + int mfsi_flags) +{ + if (needquotamount) { + ASSERT(mp->m_qflags == 0); + mp->m_qflags = quotaflags; + xfs_qm_mount_quotas(mp, mfsi_flags); + } + +#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) + if (! (XFS_IS_QUOTA_ON(mp))) + xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on"); + else + xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on"); +#endif + +#ifdef QUOTADEBUG + if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp)) + cmn_err(CE_WARN, "XFS: mount internalqcheck failed"); +#endif + + return 0; +} + +STATIC void +xfs_qm_dqrele_null( + xfs_dquot_t *dq) +{ + /* + * Called from XFS, where we always check first for a NULL dquot. + */ + if (!dq) + return; + xfs_qm_dqrele(dq); +} + + +struct xfs_qmops xfs_qmcore_xfs = { + .xfs_qminit = xfs_qm_newmount, + .xfs_qmdone = xfs_qm_unmount_quotadestroy, + .xfs_qmmount = xfs_qm_endmount, + .xfs_qmunmount = xfs_qm_unmount_quotas, + .xfs_dqrele = xfs_qm_dqrele_null, + .xfs_dqattach = xfs_qm_dqattach, + .xfs_dqdetach = xfs_qm_dqdetach, + .xfs_dqpurgeall = xfs_qm_dqpurge_all, + .xfs_dqvopalloc = xfs_qm_vop_dqalloc, + .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode, + .xfs_dqvoprename = xfs_qm_vop_rename_dqattach, + .xfs_dqvopchown = xfs_qm_vop_chown, + .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, + .xfs_dqtrxops = &xfs_trans_dquot_ops, +}; + +struct bhv_vfsops xfs_qmops = { { + BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM), + .vfs_parseargs = xfs_qm_parseargs, + .vfs_showargs = xfs_qm_showargs, + .vfs_mount = xfs_qm_mount, + .vfs_sync = xfs_qm_syncall, + .vfs_quotactl = xfs_qm_quotactl, }, +}; + + +void __init +xfs_qm_init(void) +{ + static char message[] __initdata = + KERN_INFO "SGI XFS Quota Management subsystem\n"; + + printk(message); + mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock"); + vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs); + xfs_qm_init_procfs(); +} + +void __exit +xfs_qm_exit(void) +{ + vfs_bhv_clr_custom(&xfs_qmops); + xfs_qm_cleanup_procfs(); + if (qm_dqzone) + kmem_cache_destroy(qm_dqzone); + if (qm_dqtrxzone) + kmem_cache_destroy(qm_dqtrxzone); +} diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c new file mode 100644 index 00000000000..29978e037fe --- /dev/null +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" + +#include "xfs_qm.h" + +struct xqmstats xqmstats; + +STATIC int +xfs_qm_read_xfsquota( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + int len; + + /* maximum; incore; ratio free to inuse; freelist */ + len = sprintf(buffer, "%d\t%d\t%d\t%u\n", + ndquot, + xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, + xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, + xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} + +STATIC int +xfs_qm_read_stats( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + int len; + + /* quota performance statistics */ + len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", + xqmstats.xs_qm_dqreclaims, + xqmstats.xs_qm_dqreclaim_misses, + xqmstats.xs_qm_dquot_dups, + xqmstats.xs_qm_dqcachemisses, + xqmstats.xs_qm_dqcachehits, + xqmstats.xs_qm_dqwants, + xqmstats.xs_qm_dqshake_reclaims, + xqmstats.xs_qm_dqinact_reclaims); + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} + +void +xfs_qm_init_procfs(void) +{ + create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); + create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); +} + +void +xfs_qm_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +} diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h new file mode 100644 index 00000000000..8093c5c284e --- /dev/null +++ b/fs/xfs/quota/xfs_qm_stats.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_QM_STATS_H__ +#define __XFS_QM_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +/* + * XQM global statistics + */ +struct xqmstats { + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; + __uint32_t xs_qm_dqshake_reclaims; + __uint32_t xs_qm_dqinact_reclaims; +}; + +extern struct xqmstats xqmstats; + +# define XQM_STATS_INC(count) ( (count)++ ) + +extern void xfs_qm_init_procfs(void); +extern void xfs_qm_cleanup_procfs(void); + +#else + +# define XQM_STATS_INC(count) do { } while (0) + +static __inline void xfs_qm_init_procfs(void) { }; +static __inline void xfs_qm_cleanup_procfs(void) { }; + +#endif + +#endif /* __XFS_QM_STATS_H__ */ diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c new file mode 100644 index 00000000000..229f5b5a2d2 --- /dev/null +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -0,0 +1,1458 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" + +#include "xfs_qm.h" + +#ifdef DEBUG +# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args) +#else +# define qdprintk(s, args...) do { } while (0) +#endif + +STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); +STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); +STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint); +STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t); +STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); +STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, + uint); +STATIC uint xfs_qm_import_flags(uint); +STATIC uint xfs_qm_export_flags(uint); +STATIC uint xfs_qm_import_qtype_flags(uint); +STATIC uint xfs_qm_export_qtype_flags(uint); +STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, + fs_disk_quota_t *); + + +/* + * The main distribution switch of all XFS quotactl system calls. + */ +int +xfs_qm_quotactl( + struct bhv_desc *bdp, + int cmd, + int id, + xfs_caddr_t addr) +{ + xfs_mount_t *mp; + int error; + struct vfs *vfsp; + + vfsp = bhvtovfs(bdp); + mp = XFS_VFSTOM(vfsp); + + if (addr == NULL && cmd != Q_SYNC) + return XFS_ERROR(EINVAL); + if (id < 0 && cmd != Q_SYNC) + return XFS_ERROR(EINVAL); + + /* + * The following commands are valid even when quotaoff. + */ + switch (cmd) { + /* + * truncate quota files. quota must be off. + */ + case Q_XQUOTARM: + if (XFS_IS_QUOTA_ON(mp) || addr == NULL) + return XFS_ERROR(EINVAL); + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + return (xfs_qm_scall_trunc_qfiles(mp, + xfs_qm_import_qtype_flags(*(uint *)addr))); + /* + * Get quota status information. + */ + case Q_XGETQSTAT: + return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr)); + + /* + * QUOTAON for root f/s and quota enforcement on others.. + * Quota accounting for non-root f/s's must be turned on + * at mount time. + */ + case Q_XQUOTAON: + if (addr == NULL) + return XFS_ERROR(EINVAL); + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + return (xfs_qm_scall_quotaon(mp, + xfs_qm_import_flags(*(uint *)addr))); + case Q_XQUOTAOFF: + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + break; + + default: + break; + } + + if (! XFS_IS_QUOTA_ON(mp)) + return XFS_ERROR(ESRCH); + + switch (cmd) { + case Q_XQUOTAOFF: + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + error = xfs_qm_scall_quotaoff(mp, + xfs_qm_import_flags(*(uint *)addr), + B_FALSE); + break; + + /* + * Defaults to XFS_GETUQUOTA. + */ + case Q_XGETQUOTA: + error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER, + (fs_disk_quota_t *)addr); + break; + /* + * Set limits, both hard and soft. Defaults to Q_SETUQLIM. + */ + case Q_XSETQLIM: + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER, + (fs_disk_quota_t *)addr); + break; + + case Q_XSETGQLIM: + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, + (fs_disk_quota_t *)addr); + break; + + + case Q_XGETGQUOTA: + error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, + (fs_disk_quota_t *)addr); + break; + + /* + * Quotas are entirely undefined after quotaoff in XFS quotas. + * For instance, there's no way to set limits when quotaoff. + */ + + default: + error = XFS_ERROR(EINVAL); + break; + } + + return (error); +} + +/* + * Turn off quota accounting and/or enforcement for all udquots and/or + * gdquots. Called only at unmount time. + * + * This assumes that there are no dquots of this file system cached + * incore, and modifies the ondisk dquot directly. Therefore, for example, + * it is an error to call this twice, without purging the cache. + */ +STATIC int +xfs_qm_scall_quotaoff( + xfs_mount_t *mp, + uint flags, + boolean_t force) +{ + uint dqtype; + unsigned long s; + int error; + uint inactivate_flags; + xfs_qoff_logitem_t *qoffstart; + int nculprits; + + if (!force && !capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + /* + * No file system can have quotas enabled on disk but not in core. + * Note that quota utilities (like quotaoff) _expect_ + * errno == EEXIST here. + */ + if ((mp->m_qflags & flags) == 0) + return XFS_ERROR(EEXIST); + error = 0; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + + /* + * We don't want to deal with two quotaoffs messing up each other, + * so we're going to serialize it. quotaoff isn't exactly a performance + * critical thing. + * If quotaoff, then we must be dealing with the root filesystem. + */ + ASSERT(mp->m_quotainfo); + if (mp->m_quotainfo) + mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + + ASSERT(mp->m_quotainfo); + + /* + * If we're just turning off quota enforcement, change mp and go. + */ + if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { + mp->m_qflags &= ~(flags); + + s = XFS_SB_LOCK(mp); + mp->m_sb.sb_qflags = mp->m_qflags; + XFS_SB_UNLOCK(mp, s); + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + + /* XXX what to do if error ? Revert back to old vals incore ? */ + error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); + return (error); + } + + dqtype = 0; + inactivate_flags = 0; + /* + * If accounting is off, we must turn enforcement off, clear the + * quota 'CHKD' certificate to make it known that we have to + * do a quotacheck the next time this quota is turned on. + */ + if (flags & XFS_UQUOTA_ACCT) { + dqtype |= XFS_QMOPT_UQUOTA; + flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); + inactivate_flags |= XFS_UQUOTA_ACTIVE; + } + if (flags & XFS_GQUOTA_ACCT) { + dqtype |= XFS_QMOPT_GQUOTA; + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); + inactivate_flags |= XFS_GQUOTA_ACTIVE; + } + + /* + * Nothing to do? Don't complain. This happens when we're just + * turning off quota enforcement. + */ + if ((mp->m_qflags & flags) == 0) { + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + return (0); + } + + /* + * Write the LI_QUOTAOFF log record, and do SB changes atomically, + * and synchronously. + */ + xfs_qm_log_quotaoff(mp, &qoffstart, flags); + + /* + * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct + * to take care of the race between dqget and quotaoff. We don't take + * any special locks to reset these bits. All processes need to check + * these bits *after* taking inode lock(s) to see if the particular + * quota type is in the process of being turned off. If *ACTIVE, it is + * guaranteed that all dquot structures and all quotainode ptrs will all + * stay valid as long as that inode is kept locked. + * + * There is no turning back after this. + */ + mp->m_qflags &= ~inactivate_flags; + + /* + * Give back all the dquot reference(s) held by inodes. + * Here we go thru every single incore inode in this file system, and + * do a dqrele on the i_udquot/i_gdquot that it may have. + * Essentially, as long as somebody has an inode locked, this guarantees + * that quotas will not be turned off. This is handy because in a + * transaction once we lock the inode(s) and check for quotaon, we can + * depend on the quota inodes (and other things) being valid as long as + * we keep the lock(s). + */ + xfs_qm_dqrele_all_inodes(mp, flags); + + /* + * Next we make the changes in the quota flag in the mount struct. + * This isn't protected by a particular lock directly, because we + * don't want to take a mrlock everytime we depend on quotas being on. + */ + mp->m_qflags &= ~(flags); + + /* + * Go through all the dquots of this file system and purge them, + * according to what was turned off. We may not be able to get rid + * of all dquots, because dquots can have temporary references that + * are not attached to inodes. eg. xfs_setattr, xfs_create. + * So, if we couldn't purge all the dquots from the filesystem, + * we can't get rid of the incore data structures. + */ + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) + delay(10 * nculprits); + + /* + * Transactions that had started before ACTIVE state bit was cleared + * could have logged many dquots, so they'd have higher LSNs than + * the first QUOTAOFF log record does. If we happen to crash when + * the tail of the log has gone past the QUOTAOFF record, but + * before the last dquot modification, those dquots __will__ + * recover, and that's not good. + * + * So, we have QUOTAOFF start and end logitems; the start + * logitem won't get overwritten until the end logitem appears... + */ + xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + + /* + * If quotas is completely disabled, close shop. + */ + if ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_ALL) { + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + xfs_qm_destroy_quotainfo(mp); + return (0); + } + + /* + * Release our quotainode references, and vn_purge them, + * if we don't need them anymore. + */ + if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { + XFS_PURGE_INODE(XFS_QI_UQIP(mp)); + XFS_QI_UQIP(mp) = NULL; + } + if ((dqtype & XFS_QMOPT_GQUOTA) && XFS_QI_GQIP(mp)) { + XFS_PURGE_INODE(XFS_QI_GQIP(mp)); + XFS_QI_GQIP(mp) = NULL; + } + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + + return (error); +} + +STATIC int +xfs_qm_scall_trunc_qfiles( + xfs_mount_t *mp, + uint flags) +{ + int error; + xfs_inode_t *qip; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + error = 0; + if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) { + qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); + if (! error) { + (void) xfs_truncate_file(mp, qip); + VN_RELE(XFS_ITOV(qip)); + } + } + + if ((flags & XFS_DQ_GROUP) && mp->m_sb.sb_gquotino != NULLFSINO) { + error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); + if (! error) { + (void) xfs_truncate_file(mp, qip); + VN_RELE(XFS_ITOV(qip)); + } + } + + return (error); +} + + +/* + * Switch on (a given) quota enforcement for a filesystem. This takes + * effect immediately. + * (Switching on quota accounting must be done at mount time.) + */ +STATIC int +xfs_qm_scall_quotaon( + xfs_mount_t *mp, + uint flags) +{ + int error; + unsigned long s; + uint qf; + uint accflags; + __int64_t sbflags; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + /* + * Switching on quota accounting must be done at mount time. + */ + accflags = flags & XFS_ALL_QUOTA_ACCT; + flags &= ~(XFS_ALL_QUOTA_ACCT); + + sbflags = 0; + + if (flags == 0) { + qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + /* No fs can turn on quotas with a delayed effect */ + ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0); + + /* + * Can't enforce without accounting. We check the superblock + * qflags here instead of m_qflags because rootfs can have + * quota acct on ondisk without m_qflags' knowing. + */ + if (((flags & XFS_UQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) + || + ((flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD))) { + qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", + flags, mp->m_sb.sb_qflags); + return XFS_ERROR(EINVAL); + } + /* + * If everything's upto-date incore, then don't waste time. + */ + if ((mp->m_qflags & flags) == flags) + return XFS_ERROR(EEXIST); + + /* + * Change sb_qflags on disk but not incore mp->qflags + * if this is the root filesystem. + */ + s = XFS_SB_LOCK(mp); + qf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = qf | flags; + XFS_SB_UNLOCK(mp, s); + + /* + * There's nothing to change if it's the same. + */ + if ((qf & flags) == flags && sbflags == 0) + return XFS_ERROR(EEXIST); + sbflags |= XFS_SB_QFLAGS; + + if ((error = xfs_qm_write_sb_changes(mp, sbflags))) + return (error); + /* + * If we aren't trying to switch on quota enforcement, we are done. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != + (mp->m_qflags & XFS_UQUOTA_ACCT)) || + (flags & XFS_ALL_QUOTA_ENFD) == 0) + return (0); + + if (! XFS_IS_QUOTA_RUNNING(mp)) + return XFS_ERROR(ESRCH); + + /* + * Switch on quota enforcement in core. + */ + mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + + return (0); +} + + + +/* + * Return quota status information, such as uquota-off, enforcements, etc. + */ +STATIC int +xfs_qm_scall_getqstat( + xfs_mount_t *mp, + fs_quota_stat_t *out) +{ + xfs_inode_t *uip, *gip; + boolean_t tempuqip, tempgqip; + + uip = gip = NULL; + tempuqip = tempgqip = B_FALSE; + memset(out, 0, sizeof(fs_quota_stat_t)); + + out->qs_version = FS_QSTAT_VERSION; + if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + return (0); + } + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + out->qs_pad = 0; + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + + if (mp->m_quotainfo) { + uip = mp->m_quotainfo->qi_uquotaip; + gip = mp->m_quotainfo->qi_gquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip, 0) == 0) + tempuqip = B_TRUE; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip, 0) == 0) + tempgqip = B_TRUE; + } + if (uip) { + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + VN_RELE(XFS_ITOV(uip)); + } + if (gip) { + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + VN_RELE(XFS_ITOV(gip)); + } + if (mp->m_quotainfo) { + out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); + out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); + out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); + out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); + out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); + out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); + } + return (0); +} + +/* + * Adjust quota limits, and start/stop timers accordingly. + */ +STATIC int +xfs_qm_scall_setqlim( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *newlim) +{ + xfs_disk_dquot_t *ddq; + xfs_dquot_t *dqp; + xfs_trans_t *tp; + int error; + xfs_qcnt_t hard, soft; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK)) == 0) + return (0); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, + 0, 0, XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + /* + * We don't want to race with a quotaoff so take the quotaoff lock. + * (We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening). (XXXThis doesn't currently happen + * because we take the vfslock before calling xfs_qm_sysent). + */ + mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + + /* + * Get the dquot (locked), and join it to the transaction. + * Allocate the dquot if this doesn't exist. + */ + if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + ASSERT(error != ENOENT); + return (error); + } + xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET"); + xfs_trans_dqjoin(tp, dqp); + ddq = &dqp->q_core; + + /* + * Make sure that hardlimits are >= soft limits before changing. + */ + hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + INT_GET(ddq->d_blk_hardlimit, ARCH_CONVERT); + soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT); + if (hard == 0 || hard >= soft) { + INT_SET(ddq->d_blk_hardlimit, ARCH_CONVERT, hard); + INT_SET(ddq->d_blk_softlimit, ARCH_CONVERT, soft); + if (id == 0) { + mp->m_quotainfo->qi_bhardlimit = hard; + mp->m_quotainfo->qi_bsoftlimit = soft; + } + } else { + qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); + } + hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + INT_GET(ddq->d_rtb_hardlimit, ARCH_CONVERT); + soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT); + if (hard == 0 || hard >= soft) { + INT_SET(ddq->d_rtb_hardlimit, ARCH_CONVERT, hard); + INT_SET(ddq->d_rtb_softlimit, ARCH_CONVERT, soft); + if (id == 0) { + mp->m_quotainfo->qi_rtbhardlimit = hard; + mp->m_quotainfo->qi_rtbsoftlimit = soft; + } + } else { + qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); + } + + hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + INT_GET(ddq->d_ino_hardlimit, ARCH_CONVERT); + soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + (xfs_qcnt_t) newlim->d_ino_softlimit : + INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT); + if (hard == 0 || hard >= soft) { + INT_SET(ddq->d_ino_hardlimit, ARCH_CONVERT, hard); + INT_SET(ddq->d_ino_softlimit, ARCH_CONVERT, soft); + if (id == 0) { + mp->m_quotainfo->qi_ihardlimit = hard; + mp->m_quotainfo->qi_isoftlimit = soft; + } + } else { + qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); + } + + if (id == 0) { + /* + * Timelimits for the super user set the relative time + * the other users can be over quota for this file system. + * If it is zero a default is used. Ditto for the default + * soft and hard limit values (already done, above). + */ + if (newlim->d_fieldmask & FS_DQ_BTIMER) { + mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; + INT_SET(ddq->d_btimer, ARCH_CONVERT, newlim->d_btimer); + } + if (newlim->d_fieldmask & FS_DQ_ITIMER) { + mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; + INT_SET(ddq->d_itimer, ARCH_CONVERT, newlim->d_itimer); + } + if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { + mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; + INT_SET(ddq->d_rtbtimer, ARCH_CONVERT, newlim->d_rtbtimer); + } + } else /* if (XFS_IS_QUOTA_ENFORCED(mp)) */ { + /* + * If the user is now over quota, start the timelimit. + * The user will not be 'warned'. + * Note that we keep the timers ticking, whether enforcement + * is on or off. We don't really want to bother with iterating + * over all ondisk dquots and turning the timers on/off. + */ + xfs_qm_adjust_dqtimers(mp, ddq); + } + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_trans_log_dquot(tp, dqp); + + xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); + xfs_trans_commit(tp, 0, NULL); + xfs_qm_dqprint(dqp); + xfs_qm_dqrele(dqp); + mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + + return (0); +} + +STATIC int +xfs_qm_scall_getquota( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *out) +{ + xfs_dquot_t *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) { + return (error); + } + + xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS"); + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + xfs_qm_dqput(dqp); + return XFS_ERROR(ENOENT); + } + /* xfs_qm_dqprint(dqp); */ + /* + * Convert the disk dquot to the exportable format + */ + xfs_qm_export_dquot(mp, &dqp->q_core, out); + xfs_qm_dqput(dqp); + return (error ? XFS_ERROR(EFAULT) : 0); +} + + +STATIC int +xfs_qm_log_quotaoff_end( + xfs_mount_t *mp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); + + if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, + 0, 0, XFS_DEFAULT_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + return (error); + } + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + return (error); +} + + +STATIC int +xfs_qm_log_quotaoff( + xfs_mount_t *mp, + xfs_qoff_logitem_t **qoffstartp, + uint flags) +{ + xfs_trans_t *tp; + int error; + unsigned long s; + xfs_qoff_logitem_t *qoffi=NULL; + uint oldsbqflag=0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); + if ((error = xfs_trans_reserve(tp, 0, + sizeof(xfs_qoff_logitem_t) * 2 + + mp->m_sb.sb_sectsize + 128, + 0, + 0, + XFS_DEFAULT_LOG_COUNT))) { + goto error0; + } + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + s = XFS_SB_LOCK(mp); + oldsbqflag = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + XFS_SB_UNLOCK(mp, s); + + xfs_mod_sb(tp, XFS_SB_QFLAGS); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + +error0: + if (error) { + xfs_trans_cancel(tp, 0); + /* + * No one else is modifying sb_qflags, so this is OK. + * We still hold the quotaofflock. + */ + s = XFS_SB_LOCK(mp); + mp->m_sb.sb_qflags = oldsbqflag; + XFS_SB_UNLOCK(mp, s); + } + *qoffstartp = qoffi; + return (error); +} + + +/* + * Translate an internal style on-disk-dquot to the exportable format. + * The main differences are that the counters/limits are all in Basic + * Blocks (BBs) instead of the internal FSBs, and all on-disk data has + * to be converted to the native endianness. + */ +STATIC void +xfs_qm_export_dquot( + xfs_mount_t *mp, + xfs_disk_dquot_t *src, + struct fs_disk_quota *dst) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ + dst->d_flags = + xfs_qm_export_qtype_flags(INT_GET(src->d_flags, ARCH_CONVERT)); + dst->d_id = INT_GET(src->d_id, ARCH_CONVERT); + dst->d_blk_hardlimit = (__uint64_t) + XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_hardlimit, ARCH_CONVERT)); + dst->d_blk_softlimit = (__uint64_t) + XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_softlimit, ARCH_CONVERT)); + dst->d_ino_hardlimit = (__uint64_t) + INT_GET(src->d_ino_hardlimit, ARCH_CONVERT); + dst->d_ino_softlimit = (__uint64_t) + INT_GET(src->d_ino_softlimit, ARCH_CONVERT); + dst->d_bcount = (__uint64_t) + XFS_FSB_TO_BB(mp, INT_GET(src->d_bcount, ARCH_CONVERT)); + dst->d_icount = (__uint64_t) INT_GET(src->d_icount, ARCH_CONVERT); + dst->d_btimer = (__uint32_t) INT_GET(src->d_btimer, ARCH_CONVERT); + dst->d_itimer = (__uint32_t) INT_GET(src->d_itimer, ARCH_CONVERT); + dst->d_iwarns = INT_GET(src->d_iwarns, ARCH_CONVERT); + dst->d_bwarns = INT_GET(src->d_bwarns, ARCH_CONVERT); + + dst->d_rtb_hardlimit = (__uint64_t) + XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_hardlimit, ARCH_CONVERT)); + dst->d_rtb_softlimit = (__uint64_t) + XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_softlimit, ARCH_CONVERT)); + dst->d_rtbcount = (__uint64_t) + XFS_FSB_TO_BB(mp, INT_GET(src->d_rtbcount, ARCH_CONVERT)); + dst->d_rtbtimer = (__uint32_t) INT_GET(src->d_rtbtimer, ARCH_CONVERT); + dst->d_rtbwarns = INT_GET(src->d_rtbwarns, ARCH_CONVERT); + + /* + * Internally, we don't reset all the timers when quota enforcement + * gets turned off. No need to confuse the userlevel code, + * so return zeroes in that case. + */ + if (! XFS_IS_QUOTA_ENFORCED(mp)) { + dst->d_btimer = 0; + dst->d_itimer = 0; + dst->d_rtbtimer = 0; + } + +#ifdef DEBUG + if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) { + if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + (dst->d_blk_softlimit > 0)) { + ASSERT(dst->d_btimer != 0); + } + if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + (dst->d_ino_softlimit > 0)) { + ASSERT(dst->d_itimer != 0); + } + } +#endif +} + +STATIC uint +xfs_qm_import_qtype_flags( + uint uflags) +{ + /* + * Can't be both at the same time. + */ + if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == + (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) || + ((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == 0)) + return (0); + + return (uflags & XFS_USER_QUOTA) ? + XFS_DQ_USER : XFS_DQ_GROUP; +} + +STATIC uint +xfs_qm_export_qtype_flags( + uint flags) +{ + /* + * Can't be both at the same time. + */ + ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != + (XFS_GROUP_QUOTA | XFS_USER_QUOTA)); + ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != 0); + + return (flags & XFS_DQ_USER) ? + XFS_USER_QUOTA : XFS_GROUP_QUOTA; +} + +STATIC uint +xfs_qm_import_flags( + uint uflags) +{ + uint flags = 0; + + if (uflags & XFS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & XFS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & XFS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & XFS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + return (flags); +} + + +STATIC uint +xfs_qm_export_flags( + uint flags) +{ + uint uflags; + + uflags = 0; + if (flags & XFS_UQUOTA_ACCT) + uflags |= XFS_QUOTA_UDQ_ACCT; + if (flags & XFS_GQUOTA_ACCT) + uflags |= XFS_QUOTA_GDQ_ACCT; + if (flags & XFS_UQUOTA_ENFD) + uflags |= XFS_QUOTA_UDQ_ENFD; + if (flags & XFS_GQUOTA_ENFD) + uflags |= XFS_QUOTA_GDQ_ENFD; + return (uflags); +} + + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. This also gets called from + * xfs_rootumount. + */ +void +xfs_qm_dqrele_all_inodes( + struct xfs_mount *mp, + uint flags) +{ + vmap_t vmap; + xfs_inode_t *ip, *topino; + uint ireclaims; + vnode_t *vp; + boolean_t vnode_refd; + + ASSERT(mp->m_quotainfo); + +again: + XFS_MOUNT_ILOCK(mp); + ip = mp->m_inodes; + if (ip == NULL) { + XFS_MOUNT_IUNLOCK(mp); + return; + } + do { + /* Skip markers inserted by xfs_sync */ + if (ip->i_mount == NULL) { + ip = ip->i_mnext; + continue; + } + /* Root inode, rbmip and rsumip have associated blocks */ + if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + ip = ip->i_mnext; + continue; + } + vp = XFS_ITOV_NULL(ip); + if (!vp) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + ip = ip->i_mnext; + continue; + } + vnode_refd = B_FALSE; + if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { + /* + * Sample vp mapping while holding the mplock, lest + * we come across a non-existent vnode. + */ + VMAP(vp, vmap); + ireclaims = mp->m_ireclaims; + topino = mp->m_inodes; + XFS_MOUNT_IUNLOCK(mp); + + /* XXX restart limit ? */ + if ( ! (vp = vn_get(vp, &vmap))) + goto again; + xfs_ilock(ip, XFS_ILOCK_EXCL); + vnode_refd = B_TRUE; + } else { + ireclaims = mp->m_ireclaims; + topino = mp->m_inodes; + XFS_MOUNT_IUNLOCK(mp); + } + + /* + * We don't keep the mountlock across the dqrele() call, + * since it can take a while.. + */ + if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * Wait until we've dropped the ilock and mountlock to + * do the vn_rele. Or be condemned to an eternity in the + * inactive code in hell. + */ + if (vnode_refd) + VN_RELE(vp); + XFS_MOUNT_ILOCK(mp); + /* + * If an inode was inserted or removed, we gotta + * start over again. + */ + if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { + /* XXX use a sentinel */ + XFS_MOUNT_IUNLOCK(mp); + goto again; + } + ip = ip->i_mnext; + } while (ip != mp->m_inodes); + + XFS_MOUNT_IUNLOCK(mp); +} + +/*------------------------------------------------------------------------*/ +#ifdef DEBUG +/* + * This contains all the test functions for XFS disk quotas. + * Currently it does a quota accounting check. ie. it walks through + * all inodes in the file system, calculating the dquot accounting fields, + * and prints out any inconsistencies. + */ +xfs_dqhash_t *qmtest_udqtab; +xfs_dqhash_t *qmtest_gdqtab; +int qmtest_hashmask; +int qmtest_nfails; +mutex_t qcheck_lock; + +#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ + (__psunsigned_t)(id)) & \ + (qmtest_hashmask - 1)) + +#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \ + (qmtest_udqtab + \ + DQTEST_HASHVAL(mp, id)) : \ + (qmtest_gdqtab + \ + DQTEST_HASHVAL(mp, id))) + +#define DQTEST_LIST_PRINT(l, NXT, title) \ +{ \ + xfs_dqtest_t *dqp; int i = 0;\ + cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ + for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ + dqp = (xfs_dqtest_t *)dqp->NXT) { \ + cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ + ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ + dqp->d_bcount, dqp->d_icount); } \ +} + +typedef struct dqtest { + xfs_dqmarker_t q_lists; + xfs_dqhash_t *q_hash; /* the hashchain header */ + xfs_mount_t *q_mount; /* filesystem this relates to */ + xfs_dqid_t d_id; /* user id or group id */ + xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */ + xfs_qcnt_t d_icount; /* # inodes owned by the user */ +} xfs_dqtest_t; + +STATIC void +xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) +{ + xfs_dquot_t *d; + if (((d) = (h)->qh_next)) + (d)->HL_PREVP = &((dqp)->HL_NEXT); + (dqp)->HL_NEXT = d; + (dqp)->HL_PREVP = &((h)->qh_next); + (h)->qh_next = (xfs_dquot_t *)dqp; + (h)->qh_version++; + (h)->qh_nelems++; +} +STATIC void +xfs_qm_dqtest_print( + xfs_dqtest_t *d) +{ + cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); + cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); + cmn_err(CE_DEBUG, "---- type = %s", XFS_QM_ISUDQ(d)? "USR" : "GRP"); + cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); + cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", + d->d_bcount, (int)d->d_bcount); + cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", + d->d_icount, (int)d->d_icount); + cmn_err(CE_DEBUG, "---------------------------"); +} + +STATIC void +xfs_qm_dqtest_failed( + xfs_dqtest_t *d, + xfs_dquot_t *dqp, + char *reason, + xfs_qcnt_t a, + xfs_qcnt_t b, + int error) +{ + qmtest_nfails++; + if (error) + cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", + INT_GET(d->d_id, ARCH_CONVERT), error, reason); + else + cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", + INT_GET(d->d_id, ARCH_CONVERT), reason, (int)a, (int)b); + xfs_qm_dqtest_print(d); + if (dqp) + xfs_qm_dqprint(dqp); +} + +STATIC int +xfs_dqtest_cmp2( + xfs_dqtest_t *d, + xfs_dquot_t *dqp) +{ + int err = 0; + if (INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) != d->d_icount) { + xfs_qm_dqtest_failed(d, dqp, "icount mismatch", + INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), + d->d_icount, 0); + err++; + } + if (INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) != d->d_bcount) { + xfs_qm_dqtest_failed(d, dqp, "bcount mismatch", + INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), + d->d_bcount, 0); + err++; + } + if (INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) && + INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) >= + INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT)) { + if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { + cmn_err(CE_DEBUG, + "%d [%s] [0x%p] BLK TIMER NOT STARTED", + d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); + err++; + } + } + if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) && + INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= + INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) { + if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { + cmn_err(CE_DEBUG, + "%d [%s] [0x%p] INO TIMER NOT STARTED", + d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); + err++; + } + } +#ifdef QUOTADEBUG + if (!err) { + cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", + d->d_id, XFS_QM_ISUDQ(d) ? "USR" : "GRP", d->q_mount); + } +#endif + return (err); +} + +STATIC void +xfs_dqtest_cmp( + xfs_dqtest_t *d) +{ + xfs_dquot_t *dqp; + int error; + + /* xfs_qm_dqtest_print(d); */ + if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0, + &dqp))) { + xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error); + return; + } + xfs_dqtest_cmp2(d, dqp); + xfs_qm_dqput(dqp); +} + +STATIC int +xfs_qm_internalqcheck_dqget( + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_dqtest_t **O_dq) +{ + xfs_dqtest_t *d; + xfs_dqhash_t *h; + + h = DQTEST_HASH(mp, id, type); + for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; + d = (xfs_dqtest_t *) d->HL_NEXT) { + /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */ + if (d->d_id == id && mp == d->q_mount) { + *O_dq = d; + return (0); + } + } + d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP); + d->dq_flags = type; + d->d_id = id; + d->q_mount = mp; + d->q_hash = h; + xfs_qm_hashinsert(h, d); + *O_dq = d; + return (0); +} + +STATIC void +xfs_qm_internalqcheck_get_dquots( + xfs_mount_t *mp, + xfs_dqid_t uid, + xfs_dqid_t gid, + xfs_dqtest_t **ud, + xfs_dqtest_t **gd) +{ + if (XFS_IS_UQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud); + if (XFS_IS_GQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd); +} + + +STATIC void +xfs_qm_internalqcheck_dqadjust( + xfs_inode_t *ip, + xfs_dqtest_t *d) +{ + d->d_icount++; + d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks; +} + +STATIC int +xfs_qm_internalqcheck_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + void *private_data, /* not used */ + xfs_daddr_t bno, /* starting block of inode cluster */ + int *ubused, /* not used */ + void *dip, /* not used */ + int *res) /* bulkstat result code */ +{ + xfs_inode_t *ip; + xfs_dqtest_t *ud, *gd; + uint lock_flags; + boolean_t ipreleased; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + *res = BULKSTAT_RV_NOTHING; + qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", + (unsigned long long) ino, + (unsigned long long) mp->m_sb.sb_uquotino, + (unsigned long long) mp->m_sb.sb_gquotino); + return XFS_ERROR(EINVAL); + } + ipreleased = B_FALSE; + again: + lock_flags = XFS_ILOCK_SHARED; + if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) { + *res = BULKSTAT_RV_NOTHING; + return (error); + } + + if (ip->i_d.di_mode == 0) { + xfs_iput_new(ip, lock_flags); + *res = BULKSTAT_RV_NOTHING; + return XFS_ERROR(ENOENT); + } + + /* + * This inode can have blocks after eof which can get released + * when we send it to inactive. Since we don't check the dquot + * until the after all our calculations are done, we must get rid + * of those now. + */ + if (! ipreleased) { + xfs_iput(ip, lock_flags); + ipreleased = B_TRUE; + goto again; + } + xfs_qm_internalqcheck_get_dquots(mp, + (xfs_dqid_t) ip->i_d.di_uid, + (xfs_dqid_t) ip->i_d.di_gid, + &ud, &gd); + if (XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ud); + xfs_qm_internalqcheck_dqadjust(ip, ud); + } + if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(gd); + xfs_qm_internalqcheck_dqadjust(ip, gd); + } + xfs_iput(ip, lock_flags); + *res = BULKSTAT_RV_DIDONE; + return (0); +} + + +/* PRIVATE, debugging */ +int +xfs_qm_internalqcheck( + xfs_mount_t *mp) +{ + xfs_ino_t lastino; + int done, count; + int i; + xfs_dqtest_t *d, *e; + xfs_dqhash_t *h1; + int error; + + lastino = 0; + qmtest_hashmask = 32; + count = 5; + done = 0; + qmtest_nfails = 0; + + if (! XFS_IS_QUOTA_ON(mp)) + return XFS_ERROR(ESRCH); + + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + XFS_bflush(mp->m_ddev_targp); + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + XFS_bflush(mp->m_ddev_targp); + + mutex_lock(&qcheck_lock, PINOD); + /* There should be absolutely no quota activity while this + is going on. */ + qmtest_udqtab = kmem_zalloc(qmtest_hashmask * + sizeof(xfs_dqhash_t), KM_SLEEP); + qmtest_gdqtab = kmem_zalloc(qmtest_hashmask * + sizeof(xfs_dqhash_t), KM_SLEEP); + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters + */ + if ((error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_internalqcheck_adjust, NULL, + 0, NULL, BULKSTAT_FG_IGET, &done))) { + break; + } + } while (! done); + if (error) { + cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); + } + cmn_err(CE_DEBUG, "Checking results against system dquots"); + for (i = 0; i < qmtest_hashmask; i++) { + h1 = &qmtest_udqtab[i]; + for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + xfs_dqtest_cmp(d); + e = (xfs_dqtest_t *) d->HL_NEXT; + kmem_free(d, sizeof(xfs_dqtest_t)); + d = e; + } + h1 = &qmtest_gdqtab[i]; + for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + xfs_dqtest_cmp(d); + e = (xfs_dqtest_t *) d->HL_NEXT; + kmem_free(d, sizeof(xfs_dqtest_t)); + d = e; + } + } + + if (qmtest_nfails) { + cmn_err(CE_DEBUG, "******** quotacheck failed ********"); + cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); + } else { + cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); + } + kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); + kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); + mutex_unlock(&qcheck_lock); + return (qmtest_nfails); +} + +#endif /* DEBUG */ diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h new file mode 100644 index 00000000000..414b6004af2 --- /dev/null +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_QUOTA_PRIV_H__ +#define __XFS_QUOTA_PRIV_H__ + +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +/* Number of dquots that fit in to a dquot block */ +#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) + +#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \ + MR_UPDATE | MR_ACCESS) != 0) +#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \ + MR_UPDATE) != 0) + +#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) + +#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) +#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip) +#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip) +#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen) +#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit) +#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) +#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) +#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) +#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) +#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) + +#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) +#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock) +#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) +#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) + +#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD)) +#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) +#ifdef DEBUG +struct xfs_dqhash; +static inline int XQMISLCKD(struct xfs_dqhash *h) +{ + if (mutex_trylock(&h->qh_lock)) { + mutex_unlock(&h->qh_lock); + return 0; + } + return 1; +} +#endif + +#define XFS_DQ_HASH_LOCK(h) XQMLCK(h) +#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h) +#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h) + +#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp))) +#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp))) +#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp))) + +#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) +#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) +#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist)) + +/* + * Hash into a bucket in the dquot hash table, based on . + */ +#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ + (__psunsigned_t)(id)) & \ + (xfs_Gqm->qm_dqhashmask - 1)) +#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \ + (xfs_Gqm->qm_usr_dqhtable + \ + XFS_DQ_HASHVAL(mp, id)) : \ + (xfs_Gqm->qm_grp_dqhtable + \ + XFS_DQ_HASHVAL(mp, id))) +#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \ + XFS_IS_UQUOTA_ON(mp):XFS_IS_GQUOTA_ON(mp)) +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +#define HL_PREVP dq_hashlist.ql_prevp +#define HL_NEXT dq_hashlist.ql_next +#define MPL_PREVP dq_mplist.ql_prevp +#define MPL_NEXT dq_mplist.ql_next + + +#define _LIST_REMOVE(h, dqp, PVP, NXT) \ + { \ + xfs_dquot_t *d; \ + if (((d) = (dqp)->NXT)) \ + (d)->PVP = (dqp)->PVP; \ + *((dqp)->PVP) = d; \ + (dqp)->NXT = NULL; \ + (dqp)->PVP = NULL; \ + (h)->qh_version++; \ + (h)->qh_nelems--; \ + } + +#define _LIST_INSERT(h, dqp, PVP, NXT) \ + { \ + xfs_dquot_t *d; \ + if (((d) = (h)->qh_next)) \ + (d)->PVP = &((dqp)->NXT); \ + (dqp)->NXT = d; \ + (dqp)->PVP = &((h)->qh_next); \ + (h)->qh_next = dqp; \ + (h)->qh_version++; \ + (h)->qh_nelems++; \ + } + +#define FOREACH_DQUOT_IN_MP(dqp, mp) \ + for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT) + +#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \ +for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ + (dqp) = (dqp)->dq_flnext) + +#define XQM_HASHLIST_INSERT(h, dqp) \ + _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT) + +#define XQM_FREELIST_INSERT(h, dqp) \ + xfs_qm_freelist_append(h, dqp) + +#define XQM_MPLIST_INSERT(h, dqp) \ + _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT) + +#define XQM_HASHLIST_REMOVE(h, dqp) \ + _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT) +#define XQM_FREELIST_REMOVE(dqp) \ + xfs_qm_freelist_unlink(dqp) +#define XQM_MPLIST_REMOVE(h, dqp) \ + { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \ + XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; } + +#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp)) + +#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \ + (tp)->t_dqinfo->dqa_usrdquots : \ + (tp)->t_dqinfo->dqa_grpdquots) +#define XFS_IS_SUSER_DQUOT(dqp) \ + (!((dqp)->q_core.d_id)) + +#define XFS_PURGE_INODE(ip) \ + { \ + vmap_t dqvmap; \ + vnode_t *dqvp; \ + dqvp = XFS_ITOV(ip); \ + VMAP(dqvp, dqvmap); \ + VN_RELE(dqvp); \ + } + +#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ + (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : "???")) +#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY") + +#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c new file mode 100644 index 00000000000..149b2a1fd94 --- /dev/null +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -0,0 +1,941 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" + +#include "xfs_qm.h" + +STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); + +/* + * Add the locked dquot to the transaction. + * The dquot must be locked, and it cannot be associated with any + * transaction. + */ +void +xfs_trans_dqjoin( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + xfs_dq_logitem_t *lp; + + ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); + lp = &dqp->q_logitem; + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); + + /* + * Initialize i_transp so we can later determine if this dquot is + * associated with this transaction. + */ + dqp->q_transp = tp; +} + + +/* + * This is called to mark the dquot as needing + * to be logged when the transaction is committed. The dquot must + * already be associated with the given transaction. + * Note that it marks the entire transaction as dirty. In the ordinary + * case, this gets called via xfs_trans_commit, after the transaction + * is already dirty. However, there's nothing stop this from getting + * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY + * flag. + */ +void +xfs_trans_log_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + xfs_log_item_desc_t *lidp; + + ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); + ASSERT(lidp != NULL); + + tp->t_flags |= XFS_TRANS_DIRTY; + lidp->lid_flags |= XFS_LID_DIRTY; +} + +/* + * Carry forward whatever is left of the quota blk reservation to + * the spanky new transaction + */ +STATIC void +xfs_trans_dup_dqinfo( + xfs_trans_t *otp, + xfs_trans_t *ntp) +{ + xfs_dqtrx_t *oq, *nq; + int i,j; + xfs_dqtrx_t *oqa, *nqa; + + if (!otp->t_dqinfo) + return; + + xfs_trans_alloc_dqinfo(ntp); + oqa = otp->t_dqinfo->dqa_usrdquots; + nqa = ntp->t_dqinfo->dqa_usrdquots; + + /* + * Because the quota blk reservation is carried forward, + * it is also necessary to carry forward the DQ_DIRTY flag. + */ + if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + ntp->t_flags |= XFS_TRANS_DQ_DIRTY; + + for (j = 0; j < 2; j++) { + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (oqa[i].qt_dquot == NULL) + break; + oq = &oqa[i]; + nq = &nqa[i]; + + nq->qt_dquot = oq->qt_dquot; + nq->qt_bcount_delta = nq->qt_icount_delta = 0; + nq->qt_rtbcount_delta = 0; + + /* + * Transfer whatever is left of the reservations. + */ + nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; + oq->qt_blk_res = oq->qt_blk_res_used; + + nq->qt_rtblk_res = oq->qt_rtblk_res - + oq->qt_rtblk_res_used; + oq->qt_rtblk_res = oq->qt_rtblk_res_used; + + nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used; + oq->qt_ino_res = oq->qt_ino_res_used; + + } + oqa = otp->t_dqinfo->dqa_grpdquots; + nqa = ntp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * Wrap around mod_dquot to account for both user and group quotas. + */ +void +xfs_trans_mod_dquot_byino( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint field, + long delta) +{ + xfs_mount_t *mp; + + ASSERT(tp); + mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_ON(mp) || + ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) { + (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); + } + if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) { + (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); + } +} + +STATIC xfs_dqtrx_t * +xfs_trans_get_dqtrx( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + int i; + xfs_dqtrx_t *qa; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); + + if (qa[i].qt_dquot == NULL || + qa[i].qt_dquot == dqp) { + return (&qa[i]); + } + } + + return (NULL); +} + +/* + * Make the changes in the transaction structure. + * The moral equivalent to xfs_trans_mod_sb(). + * We don't touch any fields in the dquot, so we don't care + * if it's locked or not (most of the time it won't be). + */ +void +xfs_trans_mod_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp, + uint field, + long delta) +{ + xfs_dqtrx_t *qtrx; + + ASSERT(tp); + qtrx = NULL; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + /* + * Find either the first free slot or the slot that belongs + * to this dquot. + */ + qtrx = xfs_trans_get_dqtrx(tp, dqp); + ASSERT(qtrx); + if (qtrx->qt_dquot == NULL) + qtrx->qt_dquot = dqp; + + switch (field) { + + /* + * regular disk blk reservation + */ + case XFS_TRANS_DQ_RES_BLKS: + qtrx->qt_blk_res += (ulong)delta; + break; + + /* + * inode reservation + */ + case XFS_TRANS_DQ_RES_INOS: + qtrx->qt_ino_res += (ulong)delta; + break; + + /* + * disk blocks used. + */ + case XFS_TRANS_DQ_BCOUNT: + if (qtrx->qt_blk_res && delta > 0) { + qtrx->qt_blk_res_used += (ulong)delta; + ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); + } + qtrx->qt_bcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELBCOUNT: + qtrx->qt_delbcnt_delta += delta; + break; + + /* + * Inode Count + */ + case XFS_TRANS_DQ_ICOUNT: + if (qtrx->qt_ino_res && delta > 0) { + qtrx->qt_ino_res_used += (ulong)delta; + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + } + qtrx->qt_icount_delta += delta; + break; + + /* + * rtblk reservation + */ + case XFS_TRANS_DQ_RES_RTBLKS: + qtrx->qt_rtblk_res += (ulong)delta; + break; + + /* + * rtblk count + */ + case XFS_TRANS_DQ_RTBCOUNT: + if (qtrx->qt_rtblk_res && delta > 0) { + qtrx->qt_rtblk_res_used += (ulong)delta; + ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); + } + qtrx->qt_rtbcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELRTBCOUNT: + qtrx->qt_delrtb_delta += delta; + break; + + default: + ASSERT(0); + } + tp->t_flags |= XFS_TRANS_DQ_DIRTY; +} + + +/* + * Given an array of dqtrx structures, lock all the dquots associated + * and join them to the transaction, provided they have been modified. + * We know that the highest number of dquots (of one type - usr OR grp), + * involved in a transaction is 2 and that both usr and grp combined - 3. + * So, we don't attempt to make this very generic. + */ +STATIC void +xfs_trans_dqlockedjoin( + xfs_trans_t *tp, + xfs_dqtrx_t *q) +{ + ASSERT(q[0].qt_dquot != NULL); + if (q[1].qt_dquot == NULL) { + xfs_dqlock(q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + } else { + ASSERT(XFS_QM_TRANS_MAXDQS == 2); + xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[1].qt_dquot); + } +} + + +/* + * Called by xfs_trans_commit() and similar in spirit to + * xfs_trans_apply_sb_deltas(). + * Go thru all the dquots belonging to this transaction and modify the + * INCORE dquot to reflect the actual usages. + * Unreserve just the reservations done by this transaction. + * dquot is still left locked at exit. + */ +void +xfs_trans_apply_dquot_deltas( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + xfs_disk_dquot_t *d; + long totalbdelta; + long totalrtbdelta; + + if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + ASSERT(tp->t_dqinfo); + qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < 2; j++) { + if (qa[0].qt_dquot == NULL) { + qa = tp->t_dqinfo->dqa_grpdquots; + continue; + } + + /* + * Lock all of the dquots and join them to the transaction. + */ + xfs_trans_dqlockedjoin(tp, qa); + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * The array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + + /* + * adjust the actual number of blocks used + */ + d = &dqp->q_core; + + /* + * The issue here is - sometimes we don't make a blkquota + * reservation intentionally to be fair to users + * (when the amount is small). On the other hand, + * delayed allocs do make reservations, but that's + * outside of a transaction, so we have no + * idea how much was really reserved. + * So, here we've accumulated delayed allocation blks and + * non-delay blks. The assumption is that the + * delayed ones are always reserved (outside of a + * transaction), and the others may or may not have + * quota reservations. + */ + totalbdelta = qtrx->qt_bcount_delta + + qtrx->qt_delbcnt_delta; + totalrtbdelta = qtrx->qt_rtbcount_delta + + qtrx->qt_delrtb_delta; +#ifdef QUOTADEBUG + if (totalbdelta < 0) + ASSERT(INT_GET(d->d_bcount, ARCH_CONVERT) >= + (xfs_qcnt_t) -totalbdelta); + + if (totalrtbdelta < 0) + ASSERT(INT_GET(d->d_rtbcount, ARCH_CONVERT) >= + (xfs_qcnt_t) -totalrtbdelta); + + if (qtrx->qt_icount_delta < 0) + ASSERT(INT_GET(d->d_icount, ARCH_CONVERT) >= + (xfs_qcnt_t) -qtrx->qt_icount_delta); +#endif + if (totalbdelta) + INT_MOD(d->d_bcount, ARCH_CONVERT, (xfs_qcnt_t)totalbdelta); + + if (qtrx->qt_icount_delta) + INT_MOD(d->d_icount, ARCH_CONVERT, (xfs_qcnt_t)qtrx->qt_icount_delta); + + if (totalrtbdelta) + INT_MOD(d->d_rtbcount, ARCH_CONVERT, (xfs_qcnt_t)totalrtbdelta); + + /* + * Get any default limits in use. + * Start/reset the timer(s) if needed. + */ + if (d->d_id) { + xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqtimers(tp->t_mountp, d); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + /* + * add this to the list of items to get logged + */ + xfs_trans_log_dquot(tp, dqp); + /* + * Take off what's left of the original reservation. + * In case of delayed allocations, there's no + * reservation that a transaction structure knows of. + */ + if (qtrx->qt_blk_res != 0) { + if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_blk_res > + qtrx->qt_blk_res_used) + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res - + qtrx->qt_blk_res_used); + else + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res_used - + qtrx->qt_blk_res); + } + } else { + /* + * These blks were never reserved, either inside + * a transaction or outside one (in a delayed + * allocation). Also, this isn't always a + * negative number since we sometimes + * deliberately skip quota reservations. + */ + if (qtrx->qt_bcount_delta) { + dqp->q_res_bcount += + (xfs_qcnt_t)qtrx->qt_bcount_delta; + } + } + /* + * Adjust the RT reservation. + */ + if (qtrx->qt_rtblk_res != 0) { + if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_rtblk_res > + qtrx->qt_rtblk_res_used) + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res - + qtrx->qt_rtblk_res_used); + else + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res_used - + qtrx->qt_rtblk_res); + } + } else { + if (qtrx->qt_rtbcount_delta) + dqp->q_res_rtbcount += + (xfs_qcnt_t)qtrx->qt_rtbcount_delta; + } + + /* + * Adjust the inode reservation. + */ + if (qtrx->qt_ino_res != 0) { + ASSERT(qtrx->qt_ino_res >= + qtrx->qt_ino_res_used); + if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) + dqp->q_res_icount -= (xfs_qcnt_t) + (qtrx->qt_ino_res - + qtrx->qt_ino_res_used); + } else { + if (qtrx->qt_icount_delta) + dqp->q_res_icount += + (xfs_qcnt_t)qtrx->qt_icount_delta; + } + + +#ifdef QUOTADEBUG + if (qtrx->qt_rtblk_res != 0) + cmn_err(CE_DEBUG, "RT res %d for 0x%p\n", + (int) qtrx->qt_rtblk_res, dqp); +#endif + ASSERT(dqp->q_res_bcount >= + INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT)); + ASSERT(dqp->q_res_icount >= + INT_GET(dqp->q_core.d_icount, ARCH_CONVERT)); + ASSERT(dqp->q_res_rtbcount >= + INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT)); + } + /* + * Do the group quotas next + */ + qa = tp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * Release the reservations, and adjust the dquots accordingly. + * This is called only when the transaction is being aborted. If by + * any chance we have done dquot modifications incore (ie. deltas) already, + * we simply throw those away, since that's the expected behavior + * when a transaction is curtailed without a commit. + */ +STATIC void +xfs_trans_unreserve_and_mod_dquots( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + boolean_t locked; + + if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + qa = tp->t_dqinfo->dqa_usrdquots; + + for (j = 0; j < 2; j++) { + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * We assume that the array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + /* + * Unreserve the original reservation. We don't care + * about the number of blocks used field, or deltas. + * Also we don't bother to zero the fields. + */ + locked = B_FALSE; + if (qtrx->qt_blk_res) { + xfs_dqlock(dqp); + locked = B_TRUE; + dqp->q_res_bcount -= + (xfs_qcnt_t)qtrx->qt_blk_res; + } + if (qtrx->qt_ino_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = B_TRUE; + } + dqp->q_res_icount -= + (xfs_qcnt_t)qtrx->qt_ino_res; + } + + if (qtrx->qt_rtblk_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = B_TRUE; + } + dqp->q_res_rtbcount -= + (xfs_qcnt_t)qtrx->qt_rtblk_res; + } + if (locked) + xfs_dqunlock(dqp); + + } + qa = tp->t_dqinfo->dqa_grpdquots; + } +} + +/* + * This reserves disk blocks and inodes against a dquot. + * Flags indicate if the dquot is to be locked here and also + * if the blk reservation is for RT or regular blocks. + * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. + * Returns EDQUOT if quota is exceeded. + */ +STATIC int +xfs_trans_dqresv( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + long nblks, + long ninos, + uint flags) +{ + int error; + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time_t btimer; + xfs_qcnt_t *resbcountp; + xfs_quotainfo_t *q = mp->m_quotainfo; + + if (! (flags & XFS_QMOPT_DQLOCK)) { + xfs_dqlock(dqp); + } + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (flags & XFS_TRANS_DQ_RES_BLKS) { + hardlimit = INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT); + if (!hardlimit) + hardlimit = q->qi_bhardlimit; + softlimit = INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT); + if (!softlimit) + softlimit = q->qi_bsoftlimit; + btimer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT); + resbcountp = &dqp->q_res_bcount; + } else { + ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); + hardlimit = INT_GET(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT); + if (!hardlimit) + hardlimit = q->qi_rtbhardlimit; + softlimit = INT_GET(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT); + if (!softlimit) + softlimit = q->qi_rtbsoftlimit; + btimer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT); + resbcountp = &dqp->q_res_rtbcount; + } + error = 0; + + if ((flags & XFS_QMOPT_FORCE_RES) == 0 && + dqp->q_core.d_id && + XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) { +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" + " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); +#endif + if (nblks > 0) { + /* + * dquot is locked already. See if we'd go over the + * hardlimit or exceed the timelimit if we allocate + * nblks. + */ + if (hardlimit > 0ULL && + (hardlimit <= nblks + *resbcountp)) { + error = EDQUOT; + goto error_return; + } + + if (softlimit > 0ULL && + (softlimit <= nblks + *resbcountp)) { + /* + * If timer or warnings has expired, + * return EDQUOT + */ + if ((btimer != 0 && get_seconds() > btimer) || + (dqp->q_core.d_bwarns && + INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) >= + XFS_QI_BWARNLIMIT(dqp->q_mount))) { + error = EDQUOT; + goto error_return; + } + } + } + if (ninos > 0) { + hardlimit = INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT); + if (!hardlimit) + hardlimit = q->qi_ihardlimit; + softlimit = INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT); + if (!softlimit) + softlimit = q->qi_isoftlimit; + if (hardlimit > 0ULL && + INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= hardlimit) { + error = EDQUOT; + goto error_return; + } else if (softlimit > 0ULL && + INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= softlimit) { + /* + * If timer or warnings has expired, + * return EDQUOT + */ + if ((dqp->q_core.d_itimer && + get_seconds() > INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)) || + (dqp->q_core.d_iwarns && + INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) >= + XFS_QI_IWARNLIMIT(dqp->q_mount))) { + error = EDQUOT; + goto error_return; + } + } + } + } + + /* + * Change the reservation, but not the actual usage. + * Note that q_res_bcount = q_core.d_bcount + resv + */ + (*resbcountp) += (xfs_qcnt_t)nblks; + if (ninos != 0) + dqp->q_res_icount += (xfs_qcnt_t)ninos; + + /* + * note the reservation amt in the trans struct too, + * so that the transaction knows how much was reserved by + * it against this particular dquot. + * We don't do this when we are reserving for a delayed allocation, + * because we don't have the luxury of a transaction envelope then. + */ + if (tp) { + ASSERT(tp->t_dqinfo); + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + if (nblks != 0) + xfs_trans_mod_dquot(tp, dqp, + flags & XFS_QMOPT_RESBLK_MASK, + nblks); + if (ninos != 0) + xfs_trans_mod_dquot(tp, dqp, + XFS_TRANS_DQ_RES_INOS, + ninos); + } + ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT)); + ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT)); + ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT)); + +error_return: + if (! (flags & XFS_QMOPT_DQLOCK)) { + xfs_dqunlock(dqp); + } + return (error); +} + + +/* + * Given a dquot(s), make disk block and/or inode reservations against them. + * The fact that this does the reservation against both the usr and + * grp quotas is important, because this follows a both-or-nothing + * approach. + * + * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. + * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks + * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks + * dquots are unlocked on return, if they were not locked by caller. + */ +int +xfs_trans_reserve_quota_bydquots( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *udqp, + xfs_dquot_t *gdqp, + long nblks, + long ninos, + uint flags) +{ + int resvd; + + if (! XFS_IS_QUOTA_ON(mp)) + return (0); + + if (tp && tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + ASSERT(flags & XFS_QMOPT_RESBLK_MASK); + resvd = 0; + + if (udqp) { + if (xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags)) + return (EDQUOT); + resvd = 1; + } + + if (gdqp) { + if (xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags)) { + /* + * can't do it, so backout previous reservation + */ + if (resvd) { + flags |= XFS_QMOPT_FORCE_RES; + xfs_trans_dqresv(tp, mp, udqp, + -nblks, -ninos, flags); + } + return (EDQUOT); + } + } + + /* + * Didnt change anything critical, so, no need to log + */ + return (0); +} + + +/* + * Lock the dquot and change the reservation if we can. + * This doesn't change the actual usage, just the reservation. + * The inode sent in is locked. + * + * Returns 0 on success, EDQUOT or other errors otherwise + */ +STATIC int +xfs_trans_reserve_quota_nblks( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_inode_t *ip, + long nblks, + long ninos, + uint type) +{ + int error; + + if (!XFS_IS_QUOTA_ON(mp)) + return (0); + + ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); + ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS || + (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS); + + /* + * Reserve nblks against these dquots, with trans as the mediator. + */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, + ip->i_udquot, ip->i_gdquot, + nblks, ninos, + type); + return (error); +} + +/* + * This routine is called to allocate a quotaoff log item. + */ +xfs_qoff_logitem_t * +xfs_trans_get_qoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_qoff_logitem_t *q; + + ASSERT(tp != NULL); + + q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); + ASSERT(q != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); + + return (q); +} + + +/* + * This is called to mark the quotaoff logitem as needing + * to be logged when the transaction is committed. The logitem must + * already be associated with the given transaction. + */ +void +xfs_trans_log_quotaoff_item( + xfs_trans_t *tp, + xfs_qoff_logitem_t *qlp) +{ + xfs_log_item_desc_t *lidp; + + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp); + ASSERT(lidp != NULL); + + tp->t_flags |= XFS_TRANS_DIRTY; + lidp->lid_flags |= XFS_LID_DIRTY; +} + +STATIC void +xfs_trans_alloc_dqinfo( + xfs_trans_t *tp) +{ + (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); +} + +STATIC void +xfs_trans_free_dqinfo( + xfs_trans_t *tp) +{ + if (!tp->t_dqinfo) + return; + kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo); + (tp)->t_dqinfo = NULL; +} + +xfs_dqtrxops_t xfs_trans_dquot_ops = { + .qo_dup_dqinfo = xfs_trans_dup_dqinfo, + .qo_free_dqinfo = xfs_trans_free_dqinfo, + .qo_mod_dquot_byino = xfs_trans_mod_dquot_byino, + .qo_apply_dquot_deltas = xfs_trans_apply_dquot_deltas, + .qo_reserve_quota_nblks = xfs_trans_reserve_quota_nblks, + .qo_reserve_quota_bydquots = xfs_trans_reserve_quota_bydquots, + .qo_unreserve_and_mod_dquots = xfs_trans_unreserve_and_mod_dquots, +}; diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c new file mode 100644 index 00000000000..7d6e1f37df1 --- /dev/null +++ b/fs/xfs/support/debug.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "debug.h" + +#include +#include +#include + +int doass = 1; +static char message[256]; /* keep it off the stack */ +static DEFINE_SPINLOCK(xfs_err_lock); + +/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ +#define XFS_MAX_ERR_LEVEL 7 +#define XFS_ERR_MASK ((1 << 3) - 1) +static char *err_level[XFS_MAX_ERR_LEVEL+1] = + {KERN_EMERG, KERN_ALERT, KERN_CRIT, + KERN_ERR, KERN_WARNING, KERN_NOTICE, + KERN_INFO, KERN_DEBUG}; + +void +assfail(char *a, char *f, int l) +{ + printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l); + BUG(); +} + +#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) + +unsigned long +random(void) +{ + static unsigned long RandomValue = 1; + /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ + register long rv = RandomValue; + register long lo; + register long hi; + + hi = rv / 127773; + lo = rv % 127773; + rv = 16807 * lo - 2836 * hi; + if( rv <= 0 ) rv += 2147483647; + return( RandomValue = rv ); +} + +int +get_thread_id(void) +{ + return current->pid; +} + +#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ + +void +cmn_err(register int level, char *fmt, ...) +{ + char *fp = fmt; + int len; + ulong flags; + va_list ap; + + level &= XFS_ERR_MASK; + if (level > XFS_MAX_ERR_LEVEL) + level = XFS_MAX_ERR_LEVEL; + spin_lock_irqsave(&xfs_err_lock,flags); + va_start(ap, fmt); + if (*fmt == '!') fp++; + len = vsprintf(message, fp, ap); + if (message[len-1] != '\n') + strcat(message, "\n"); + printk("%s%s", err_level[level], message); + va_end(ap); + spin_unlock_irqrestore(&xfs_err_lock,flags); + + if (level == CE_PANIC) + BUG(); +} + + +void +icmn_err(register int level, char *fmt, va_list ap) +{ + ulong flags; + int len; + + level &= XFS_ERR_MASK; + if(level > XFS_MAX_ERR_LEVEL) + level = XFS_MAX_ERR_LEVEL; + spin_lock_irqsave(&xfs_err_lock,flags); + len = vsprintf(message, fmt, ap); + if (message[len-1] != '\n') + strcat(message, "\n"); + spin_unlock_irqrestore(&xfs_err_lock,flags); + printk("%s%s", err_level[level], message); + if (level == CE_PANIC) + BUG(); +} diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h new file mode 100644 index 00000000000..40b0f4c54d9 --- /dev/null +++ b/fs/xfs/support/debug.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_DEBUG_H__ +#define __XFS_SUPPORT_DEBUG_H__ + +#include + +#define CE_DEBUG 7 /* debug */ +#define CE_CONT 6 /* continuation */ +#define CE_NOTE 5 /* notice */ +#define CE_WARN 4 /* warning */ +#define CE_ALERT 1 /* alert */ +#define CE_PANIC 0 /* panic */ + +extern void icmn_err(int, char *, va_list); +/* PRINTFLIKE2 */ +extern void cmn_err(int, char *, ...); + +#ifndef STATIC +# define STATIC static +#endif + +#ifdef DEBUG +# ifdef lint +# define ASSERT(EX) ((void)0) /* avoid "constant in conditional" babble */ +# else +# define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__)) +# endif /* lint */ +#else +# define ASSERT(x) ((void)0) +#endif + +extern int doass; /* dynamically turn off asserts */ +extern void assfail(char *, char *, int); +#ifdef DEBUG +extern unsigned long random(void); +extern int get_thread_id(void); +#endif + +#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__)) +#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */ + +#endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c new file mode 100644 index 00000000000..3dae14c8c55 --- /dev/null +++ b/fs/xfs/support/ktrace.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +static kmem_zone_t *ktrace_hdr_zone; +static kmem_zone_t *ktrace_ent_zone; +static int ktrace_zentries; + +void +ktrace_init(int zentries) +{ + ktrace_zentries = zentries; + + ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), + "ktrace_hdr"); + ASSERT(ktrace_hdr_zone); + + ktrace_ent_zone = kmem_zone_init(ktrace_zentries + * sizeof(ktrace_entry_t), + "ktrace_ent"); + ASSERT(ktrace_ent_zone); +} + +void +ktrace_uninit(void) +{ + kmem_cache_destroy(ktrace_hdr_zone); + kmem_cache_destroy(ktrace_ent_zone); +} + +/* + * ktrace_alloc() + * + * Allocate a ktrace header and enough buffering for the given + * number of entries. + */ +ktrace_t * +ktrace_alloc(int nentries, int sleep) +{ + ktrace_t *ktp; + ktrace_entry_t *ktep; + + ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); + + if (ktp == (ktrace_t*)NULL) { + /* + * KM_SLEEP callers don't expect failure. + */ + if (sleep & KM_SLEEP) + panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); + + return NULL; + } + + /* + * Special treatment for buffers with the ktrace_zentries entries + */ + if (nentries == ktrace_zentries) { + ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, + sleep); + } else { + ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), + sleep); + } + + if (ktep == NULL) { + /* + * KM_SLEEP callers don't expect failure. + */ + if (sleep & KM_SLEEP) + panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); + + kmem_free(ktp, sizeof(*ktp)); + + return NULL; + } + + spinlock_init(&(ktp->kt_lock), "kt_lock"); + + ktp->kt_entries = ktep; + ktp->kt_nentries = nentries; + ktp->kt_index = 0; + ktp->kt_rollover = 0; + return ktp; +} + + +/* + * ktrace_free() + * + * Free up the ktrace header and buffer. It is up to the caller + * to ensure that no-one is referencing it. + */ +void +ktrace_free(ktrace_t *ktp) +{ + int entries_size; + + if (ktp == (ktrace_t *)NULL) + return; + + spinlock_destroy(&ktp->kt_lock); + + /* + * Special treatment for the Vnode trace buffer. + */ + if (ktp->kt_nentries == ktrace_zentries) { + kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); + } else { + entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); + + kmem_free(ktp->kt_entries, entries_size); + } + + kmem_zone_free(ktrace_hdr_zone, ktp); +} + + +/* + * Enter the given values into the "next" entry in the trace buffer. + * kt_index is always the index of the next entry to be filled. + */ +void +ktrace_enter( + ktrace_t *ktp, + void *val0, + void *val1, + void *val2, + void *val3, + void *val4, + void *val5, + void *val6, + void *val7, + void *val8, + void *val9, + void *val10, + void *val11, + void *val12, + void *val13, + void *val14, + void *val15) +{ + static lock_t wrap_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + int index; + ktrace_entry_t *ktep; + + ASSERT(ktp != NULL); + + /* + * Grab an entry by pushing the index up to the next one. + */ + spin_lock_irqsave(&wrap_lock, flags); + index = ktp->kt_index; + if (++ktp->kt_index == ktp->kt_nentries) + ktp->kt_index = 0; + spin_unlock_irqrestore(&wrap_lock, flags); + + if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) + ktp->kt_rollover = 1; + + ASSERT((index >= 0) && (index < ktp->kt_nentries)); + + ktep = &(ktp->kt_entries[index]); + + ktep->val[0] = val0; + ktep->val[1] = val1; + ktep->val[2] = val2; + ktep->val[3] = val3; + ktep->val[4] = val4; + ktep->val[5] = val5; + ktep->val[6] = val6; + ktep->val[7] = val7; + ktep->val[8] = val8; + ktep->val[9] = val9; + ktep->val[10] = val10; + ktep->val[11] = val11; + ktep->val[12] = val12; + ktep->val[13] = val13; + ktep->val[14] = val14; + ktep->val[15] = val15; +} + +/* + * Return the number of entries in the trace buffer. + */ +int +ktrace_nentries( + ktrace_t *ktp) +{ + if (ktp == NULL) { + return 0; + } + + return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); +} + +/* + * ktrace_first() + * + * This is used to find the start of the trace buffer. + * In conjunction with ktrace_next() it can be used to + * iterate through the entire trace buffer. This code does + * not do any locking because it is assumed that it is called + * from the debugger. + * + * The caller must pass in a pointer to a ktrace_snap + * structure in which we will keep some state used to + * iterate through the buffer. This state must not touched + * by any code outside of this module. + */ +ktrace_entry_t * +ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) +{ + ktrace_entry_t *ktep; + int index; + int nentries; + + if (ktp->kt_rollover) + index = ktp->kt_index; + else + index = 0; + + ktsp->ks_start = index; + ktep = &(ktp->kt_entries[index]); + + nentries = ktrace_nentries(ktp); + index++; + if (index < nentries) { + ktsp->ks_index = index; + } else { + ktsp->ks_index = 0; + if (index > nentries) + ktep = NULL; + } + return ktep; +} + +/* + * ktrace_next() + * + * This is used to iterate through the entries of the given + * trace buffer. The caller must pass in the ktrace_snap_t + * structure initialized by ktrace_first(). The return value + * will be either a pointer to the next ktrace_entry or NULL + * if all of the entries have been traversed. + */ +ktrace_entry_t * +ktrace_next( + ktrace_t *ktp, + ktrace_snap_t *ktsp) +{ + int index; + ktrace_entry_t *ktep; + + index = ktsp->ks_index; + if (index == ktsp->ks_start) { + ktep = NULL; + } else { + ktep = &ktp->kt_entries[index]; + } + + index++; + if (index == ktrace_nentries(ktp)) { + ktsp->ks_index = 0; + } else { + ktsp->ks_index = index; + } + + return ktep; +} + +/* + * ktrace_skip() + * + * Skip the next "count" entries and return the entry after that. + * Return NULL if this causes us to iterate past the beginning again. + */ +ktrace_entry_t * +ktrace_skip( + ktrace_t *ktp, + int count, + ktrace_snap_t *ktsp) +{ + int index; + int new_index; + ktrace_entry_t *ktep; + int nentries = ktrace_nentries(ktp); + + index = ktsp->ks_index; + new_index = index + count; + while (new_index >= nentries) { + new_index -= nentries; + } + if (index == ktsp->ks_start) { + /* + * We've iterated around to the start, so we're done. + */ + ktep = NULL; + } else if ((new_index < index) && (index < ktsp->ks_index)) { + /* + * We've skipped past the start again, so we're done. + */ + ktep = NULL; + ktsp->ks_index = ktsp->ks_start; + } else { + ktep = &(ktp->kt_entries[new_index]); + new_index++; + if (new_index == nentries) { + ktsp->ks_index = 0; + } else { + ktsp->ks_index = new_index; + } + } + return ktep; +} diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h new file mode 100644 index 00000000000..92d1a1a5d04 --- /dev/null +++ b/fs/xfs/support/ktrace.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_KTRACE_H__ +#define __XFS_SUPPORT_KTRACE_H__ + +#include + +/* + * Trace buffer entry structure. + */ +typedef struct ktrace_entry { + void *val[16]; +} ktrace_entry_t; + +/* + * Trace buffer header structure. + */ +typedef struct ktrace { + lock_t kt_lock; /* mutex to guard counters */ + int kt_nentries; /* number of entries in trace buf */ + int kt_index; /* current index in entries */ + int kt_rollover; + ktrace_entry_t *kt_entries; /* buffer of entries */ +} ktrace_t; + +/* + * Trace buffer snapshot structure. + */ +typedef struct ktrace_snap { + int ks_start; /* kt_index at time of snap */ + int ks_index; /* current index */ +} ktrace_snap_t; + + +#ifdef CONFIG_XFS_TRACE + +extern void ktrace_init(int zentries); +extern void ktrace_uninit(void); + +extern ktrace_t *ktrace_alloc(int, int); +extern void ktrace_free(ktrace_t *); + +extern void ktrace_enter( + ktrace_t *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *, + void *); + +extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *); +extern int ktrace_nentries(ktrace_t *); +extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *); +extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *); + +#else +#define ktrace_init(x) do { } while (0) +#define ktrace_uninit() do { } while (0) +#endif /* CONFIG_XFS_TRACE */ + +#endif /* __XFS_SUPPORT_KTRACE_H__ */ diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c new file mode 100644 index 00000000000..15b5194f16b --- /dev/null +++ b/fs/xfs/support/move.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +/* Read from kernel buffer at src to user/kernel buffer defined + * by the uio structure. Advance the pointer in the uio struct + * as we go. + */ +int +uio_read(caddr_t src, size_t len, struct uio *uio) +{ + size_t count; + + if (!len || !uio->uio_resid) + return 0; + + count = uio->uio_iov->iov_len; + if (!count) + return 0; + if (count > len) + count = len; + + if (uio->uio_segflg == UIO_USERSPACE) { + if (copy_to_user(uio->uio_iov->iov_base, src, count)) + return EFAULT; + } else { + ASSERT(uio->uio_segflg == UIO_SYSSPACE); + memcpy(uio->uio_iov->iov_base, src, count); + } + + uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count); + uio->uio_iov->iov_len -= count; + uio->uio_offset += count; + uio->uio_resid -= count; + return 0; +} diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h new file mode 100644 index 00000000000..3d406dc1c89 --- /dev/null +++ b/fs/xfs/support/move.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + * Portions Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef __XFS_SUPPORT_MOVE_H__ +#define __XFS_SUPPORT_MOVE_H__ + +#include +#include + +/* Segment flag values. */ +enum uio_seg { + UIO_USERSPACE, /* from user data space */ + UIO_SYSSPACE, /* from system space */ +}; + +struct uio { + struct iovec *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs in array */ + xfs_off_t uio_offset; /* offset in file this uio corresponds to */ + int uio_resid; /* residual i/o count */ + enum uio_seg uio_segflg; /* see above */ +}; + +typedef struct uio uio_t; +typedef struct iovec iovec_t; + +extern int uio_read (caddr_t, size_t, uio_t *); + +#endif /* __XFS_SUPPORT_MOVE_H__ */ diff --git a/fs/xfs/support/qsort.c b/fs/xfs/support/qsort.c new file mode 100644 index 00000000000..1ec824140cf --- /dev/null +++ b/fs/xfs/support/qsort.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +/* + * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". + */ +#define swapcode(TYPE, parmi, parmj, n) { \ + long i = (n) / sizeof (TYPE); \ + register TYPE *pi = (TYPE *) (parmi); \ + register TYPE *pj = (TYPE *) (parmj); \ + do { \ + register TYPE t = *pi; \ + *pi++ = *pj; \ + *pj++ = t; \ + } while (--i > 0); \ +} + +#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ + es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1; + +static __inline void +swapfunc(char *a, char *b, int n, int swaptype) +{ + if (swaptype <= 1) + swapcode(long, a, b, n) + else + swapcode(char, a, b, n) +} + +#define swap(a, b) \ + if (swaptype == 0) { \ + long t = *(long *)(a); \ + *(long *)(a) = *(long *)(b); \ + *(long *)(b) = t; \ + } else \ + swapfunc(a, b, es, swaptype) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype) + +static __inline char * +med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *)) +{ + return cmp(a, b) < 0 ? + (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a )) + :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c )); +} + +void +qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *)) +{ + char *pa, *pb, *pc, *pd, *pl, *pm, *pn; + int d, r, swaptype, swap_cnt; + register char *a = aa; + +loop: SWAPINIT(a, es); + swap_cnt = 0; + if (n < 7) { + for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + pm = (char *)a + (n / 2) * es; + if (n > 7) { + pl = (char *)a; + pn = (char *)a + (n - 1) * es; + if (n > 40) { + d = (n / 8) * es; + pl = med3(pl, pl + d, pl + 2 * d, cmp); + pm = med3(pm - d, pm, pm + d, cmp); + pn = med3(pn - 2 * d, pn - d, pn, cmp); + } + pm = med3(pl, pm, pn, cmp); + } + swap(a, pm); + pa = pb = (char *)a + es; + + pc = pd = (char *)a + (n - 1) * es; + for (;;) { + while (pb <= pc && (r = cmp(pb, a)) <= 0) { + if (r == 0) { + swap_cnt = 1; + swap(pa, pb); + pa += es; + } + pb += es; + } + while (pb <= pc && (r = cmp(pc, a)) >= 0) { + if (r == 0) { + swap_cnt = 1; + swap(pc, pd); + pd -= es; + } + pc -= es; + } + if (pb > pc) + break; + swap(pb, pc); + swap_cnt = 1; + pb += es; + pc -= es; + } + if (swap_cnt == 0) { /* Switch to insertion sort */ + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + + pn = (char *)a + n * es; + r = min(pa - (char *)a, pb - pa); + vecswap(a, pb - r, r); + r = min((long)(pd - pc), (long)(pn - pd - es)); + vecswap(pb, pn - r, r); + if ((r = pb - pa) > es) + qsort(a, r / es, es, cmp); + if ((r = pd - pc) > es) { + /* Iterate rather than recurse to save stack space */ + a = pn - r; + n = r / es; + goto loop; + } +/* qsort(pn - r, r / es, es, cmp);*/ +} diff --git a/fs/xfs/support/qsort.h b/fs/xfs/support/qsort.h new file mode 100644 index 00000000000..94263106d71 --- /dev/null +++ b/fs/xfs/support/qsort.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef QSORT_H +#define QSORT_H + +extern void qsort (void *const pbase, + size_t total_elems, + size_t size, + int (*cmp)(const void *, const void *)); + +#endif diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c new file mode 100644 index 00000000000..81f40cfcb26 --- /dev/null +++ b/fs/xfs/support/uuid.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include + +static mutex_t uuid_monitor; +static int uuid_table_size; +static uuid_t *uuid_table; + +void +uuid_init(void) +{ + mutex_init(&uuid_monitor, MUTEX_DEFAULT, "uuid_monitor"); +} + +/* + * uuid_getnodeuniq - obtain the node unique fields of a UUID. + * + * This is not in any way a standard or condoned UUID function; + * it just something that's needed for user-level file handles. + */ +void +uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) +{ + char *uu = (char *)uuid; + + /* on IRIX, this function assumes big-endian fields within + * the uuid, so we use INT_GET to get the same result on + * little-endian systems + */ + + fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) + + INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT); + fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT); +} + +void +uuid_create_nil(uuid_t *uuid) +{ + memset(uuid, 0, sizeof(*uuid)); +} + +int +uuid_is_nil(uuid_t *uuid) +{ + int i; + char *cp = (char *)uuid; + + if (uuid == NULL) + return 0; + /* implied check of version number here... */ + for (i = 0; i < sizeof *uuid; i++) + if (*cp++) return 0; /* not nil */ + return 1; /* is nil */ +} + +int +uuid_equal(uuid_t *uuid1, uuid_t *uuid2) +{ + return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; +} + +/* + * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom + * 64-bit words. NOTE: This function can not be changed EVER. Although + * brain-dead, some applications depend on this 64-bit value remaining + * persistent. Specifically, DMI vendors store the value as a persistent + * filehandle. + */ +__uint64_t +uuid_hash64(uuid_t *uuid) +{ + __uint64_t *sp = (__uint64_t *)uuid; + + return sp[0] + sp[1]; +} + +int +uuid_table_insert(uuid_t *uuid) +{ + int i, hole; + + mutex_lock(&uuid_monitor, PVFS); + for (i = 0, hole = -1; i < uuid_table_size; i++) { + if (uuid_is_nil(&uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &uuid_table[i])) { + mutex_unlock(&uuid_monitor); + return 0; + } + } + if (hole < 0) { + uuid_table = kmem_realloc(uuid_table, + (uuid_table_size + 1) * sizeof(*uuid_table), + uuid_table_size * sizeof(*uuid_table), + KM_SLEEP); + hole = uuid_table_size++; + } + uuid_table[hole] = *uuid; + mutex_unlock(&uuid_monitor); + return 1; +} + +void +uuid_table_remove(uuid_t *uuid) +{ + int i; + + mutex_lock(&uuid_monitor, PVFS); + for (i = 0; i < uuid_table_size; i++) { + if (uuid_is_nil(&uuid_table[i])) + continue; + if (!uuid_equal(uuid, &uuid_table[i])) + continue; + uuid_create_nil(&uuid_table[i]); + break; + } + ASSERT(i < uuid_table_size); + mutex_unlock(&uuid_monitor); +} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h new file mode 100644 index 00000000000..5220ea58ba2 --- /dev/null +++ b/fs/xfs/support/uuid.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_UUID_H__ +#define __XFS_SUPPORT_UUID_H__ + +typedef struct { + unsigned char __u_bits[16]; +} uuid_t; + +void uuid_init(void); +void uuid_create_nil(uuid_t *uuid); +int uuid_is_nil(uuid_t *uuid); +int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); +void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +__uint64_t uuid_hash64(uuid_t *uuid); +int uuid_table_insert(uuid_t *uuid); +void uuid_table_remove(uuid_t *uuid); + +#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h new file mode 100644 index 00000000000..7e276dcaf4d --- /dev/null +++ b/fs/xfs/xfs.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_H__ +#define __XFS_H__ + +#include + +#include +#include + +#endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c new file mode 100644 index 00000000000..8d01dce8c53 --- /dev/null +++ b/fs/xfs/xfs_acl.c @@ -0,0 +1,937 @@ +/* + * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_inum.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_acl.h" +#include "xfs_mac.h" +#include "xfs_attr.h" + +#include + +STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *); +STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); +STATIC void xfs_acl_get_endian(xfs_acl_t *); +STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); +STATIC int xfs_acl_invalid(xfs_acl_t *); +STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); +STATIC void xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *); +STATIC void xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *); +STATIC int xfs_acl_allow_set(vnode_t *, int); + +kmem_zone_t *xfs_acl_zone; + + +/* + * Test for existence of access ACL attribute as efficiently as possible. + */ +int +xfs_acl_vhasacl_access( + vnode_t *vp) +{ + int error; + + xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error); + return (error == 0); +} + +/* + * Test for existence of default ACL attribute as efficiently as possible. + */ +int +xfs_acl_vhasacl_default( + vnode_t *vp) +{ + int error; + + if (vp->v_type != VDIR) + return 0; + xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error); + return (error == 0); +} + +/* + * Convert from extended attribute representation to in-memory for XFS. + */ +STATIC int +posix_acl_xattr_to_xfs( + posix_acl_xattr_header *src, + size_t size, + xfs_acl_t *dest) +{ + posix_acl_xattr_entry *src_entry; + xfs_acl_entry_t *dest_entry; + int n; + + if (!src || !dest) + return EINVAL; + + if (size < sizeof(posix_acl_xattr_header)) + return EINVAL; + + if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return EOPNOTSUPP; + + memset(dest, 0, sizeof(xfs_acl_t)); + dest->acl_cnt = posix_acl_xattr_count(size); + if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES) + return EINVAL; + + /* + * acl_set_file(3) may request that we set default ACLs with + * zero length -- defend (gracefully) against that here. + */ + if (!dest->acl_cnt) + return 0; + + src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src)); + dest_entry = &dest->acl_entry[0]; + + for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) { + dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm); + if (_ACL_PERM_INVALID(dest_entry->ae_perm)) + return EINVAL; + dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag); + switch(dest_entry->ae_tag) { + case ACL_USER: + case ACL_GROUP: + dest_entry->ae_id = le32_to_cpu(src_entry->e_id); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + dest_entry->ae_id = ACL_UNDEFINED_ID; + break; + default: + return EINVAL; + } + } + if (xfs_acl_invalid(dest)) + return EINVAL; + + return 0; +} + +/* + * Comparison function called from qsort(). + * Primary key is ae_tag, secondary key is ae_id. + */ +STATIC int +xfs_acl_entry_compare( + const void *va, + const void *vb) +{ + xfs_acl_entry_t *a = (xfs_acl_entry_t *)va, + *b = (xfs_acl_entry_t *)vb; + + if (a->ae_tag == b->ae_tag) + return (a->ae_id - b->ae_id); + return (a->ae_tag - b->ae_tag); +} + +/* + * Convert from in-memory XFS to extended attribute representation. + */ +STATIC int +posix_acl_xfs_to_xattr( + xfs_acl_t *src, + posix_acl_xattr_header *dest, + size_t size) +{ + int n; + size_t new_size = posix_acl_xattr_size(src->acl_cnt); + posix_acl_xattr_entry *dest_entry; + xfs_acl_entry_t *src_entry; + + if (size < new_size) + return -ERANGE; + + /* Need to sort src XFS ACL by */ + qsort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]), + xfs_acl_entry_compare); + + dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + dest_entry = &dest->a_entries[0]; + src_entry = &src->acl_entry[0]; + for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) { + dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm); + if (_ACL_PERM_INVALID(src_entry->ae_perm)) + return -EINVAL; + dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag); + switch (src_entry->ae_tag) { + case ACL_USER: + case ACL_GROUP: + dest_entry->e_id = cpu_to_le32(src_entry->ae_id); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + default: + return -EINVAL; + } + } + return new_size; +} + +int +xfs_acl_vget( + vnode_t *vp, + void *acl, + size_t size, + int kind) +{ + int error; + xfs_acl_t *xfs_acl = NULL; + posix_acl_xattr_header *ext_acl = acl; + int flags = 0; + + VN_HOLD(vp); + if(size) { + if (!(_ACL_ALLOC(xfs_acl))) { + error = ENOMEM; + goto out; + } + memset(xfs_acl, 0, sizeof(xfs_acl_t)); + } else + flags = ATTR_KERNOVAL; + + xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error); + if (error) + goto out; + + if (!size) { + error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES); + } else { + if (xfs_acl_invalid(xfs_acl)) { + error = EINVAL; + goto out; + } + if (kind == _ACL_TYPE_ACCESS) { + vattr_t va; + + va.va_mask = XFS_AT_MODE; + VOP_GETATTR(vp, &va, 0, sys_cred, error); + if (error) + goto out; + xfs_acl_sync_mode(va.va_mode, xfs_acl); + } + error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); + } +out: + VN_RELE(vp); + if(xfs_acl) + _ACL_FREE(xfs_acl); + return -error; +} + +int +xfs_acl_vremove( + vnode_t *vp, + int kind) +{ + int error; + + VN_HOLD(vp); + error = xfs_acl_allow_set(vp, kind); + if (!error) { + VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT? + SGI_ACL_DEFAULT: SGI_ACL_FILE, + ATTR_ROOT, sys_cred, error); + if (error == ENOATTR) + error = 0; /* 'scool */ + } + VN_RELE(vp); + return -error; +} + +int +xfs_acl_vset( + vnode_t *vp, + void *acl, + size_t size, + int kind) +{ + posix_acl_xattr_header *ext_acl = acl; + xfs_acl_t *xfs_acl; + int error; + int basicperms = 0; /* more than std unix perms? */ + + if (!acl) + return -EINVAL; + + if (!(_ACL_ALLOC(xfs_acl))) + return -ENOMEM; + + error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl); + if (error) { + _ACL_FREE(xfs_acl); + return -error; + } + if (!xfs_acl->acl_cnt) { + _ACL_FREE(xfs_acl); + return 0; + } + + VN_HOLD(vp); + error = xfs_acl_allow_set(vp, kind); + if (error) + goto out; + + /* Incoming ACL exists, set file mode based on its value */ + if (kind == _ACL_TYPE_ACCESS) + xfs_acl_setmode(vp, xfs_acl, &basicperms); + + /* + * If we have more than std unix permissions, set up the actual attr. + * Otherwise, delete any existing attr. This prevents us from + * having actual attrs for permissions that can be stored in the + * standard permission bits. + */ + if (!basicperms) { + xfs_acl_set_attr(vp, xfs_acl, kind, &error); + } else { + xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); + } + +out: + VN_RELE(vp); + _ACL_FREE(xfs_acl); + return -error; +} + +int +xfs_acl_iaccess( + xfs_inode_t *ip, + mode_t mode, + cred_t *cr) +{ + xfs_acl_t *acl; + int rval; + + if (!(_ACL_ALLOC(acl))) + return -1; + + /* If the file has no ACL return -1. */ + rval = sizeof(xfs_acl_t); + if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE, + (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) { + _ACL_FREE(acl); + return -1; + } + xfs_acl_get_endian(acl); + + /* If the file has an empty ACL return -1. */ + if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) { + _ACL_FREE(acl); + return -1; + } + + /* Synchronize ACL with mode bits */ + xfs_acl_sync_mode(ip->i_d.di_mode, acl); + + rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr); + _ACL_FREE(acl); + return rval; +} + +STATIC int +xfs_acl_allow_set( + vnode_t *vp, + int kind) +{ + vattr_t va; + int error; + + if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) + return EPERM; + if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR) + return ENOTDIR; + if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + return EROFS; + va.va_mask = XFS_AT_UID; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return error; + if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) + return EPERM; + return error; +} + +/* + * The access control process to determine the access permission: + * if uid == file owner id, use the file owner bits. + * if gid == file owner group id, use the file group bits. + * scan ACL for a maching user or group, and use matched entry + * permission. Use total permissions of all matching group entries, + * until all acl entries are exhausted. The final permission produced + * by matching acl entry or entries needs to be & with group permission. + * if not owner, owning group, or matching entry in ACL, use file + * other bits. + */ +STATIC int +xfs_acl_capability_check( + mode_t mode, + cred_t *cr) +{ + if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH)) + return EACCES; + if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) + return EACCES; + if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) + return EACCES; + + return 0; +} + +/* + * Note: cr is only used here for the capability check if the ACL test fails. + * It is not used to find out the credentials uid or groups etc, as was + * done in IRIX. It is assumed that the uid and groups for the current + * thread are taken from "current" instead of the cr parameter. + */ +STATIC int +xfs_acl_access( + uid_t fuid, + gid_t fgid, + xfs_acl_t *fap, + mode_t md, + cred_t *cr) +{ + xfs_acl_entry_t matched; + int i, allows; + int maskallows = -1; /* true, but not 1, either */ + int seen_userobj = 0; + + matched.ae_tag = 0; /* Invalid type */ + md >>= 6; /* Normalize the bits for comparison */ + + for (i = 0; i < fap->acl_cnt; i++) { + /* + * Break out if we've got a user_obj entry or + * a user entry and the mask (and have processed USER_OBJ) + */ + if (matched.ae_tag == ACL_USER_OBJ) + break; + if (matched.ae_tag == ACL_USER) { + if (maskallows != -1 && seen_userobj) + break; + if (fap->acl_entry[i].ae_tag != ACL_MASK && + fap->acl_entry[i].ae_tag != ACL_USER_OBJ) + continue; + } + /* True if this entry allows the requested access */ + allows = ((fap->acl_entry[i].ae_perm & md) == md); + + switch (fap->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + seen_userobj = 1; + if (fuid != current->fsuid) + continue; + matched.ae_tag = ACL_USER_OBJ; + matched.ae_perm = allows; + break; + case ACL_USER: + if (fap->acl_entry[i].ae_id != current->fsuid) + continue; + matched.ae_tag = ACL_USER; + matched.ae_perm = allows; + break; + case ACL_GROUP_OBJ: + if ((matched.ae_tag == ACL_GROUP_OBJ || + matched.ae_tag == ACL_GROUP) && !allows) + continue; + if (!in_group_p(fgid)) + continue; + matched.ae_tag = ACL_GROUP_OBJ; + matched.ae_perm = allows; + break; + case ACL_GROUP: + if ((matched.ae_tag == ACL_GROUP_OBJ || + matched.ae_tag == ACL_GROUP) && !allows) + continue; + if (!in_group_p(fap->acl_entry[i].ae_id)) + continue; + matched.ae_tag = ACL_GROUP; + matched.ae_perm = allows; + break; + case ACL_MASK: + maskallows = allows; + break; + case ACL_OTHER: + if (matched.ae_tag != 0) + continue; + matched.ae_tag = ACL_OTHER; + matched.ae_perm = allows; + break; + } + } + /* + * First possibility is that no matched entry allows access. + * The capability to override DAC may exist, so check for it. + */ + switch (matched.ae_tag) { + case ACL_OTHER: + case ACL_USER_OBJ: + if (matched.ae_perm) + return 0; + break; + case ACL_USER: + case ACL_GROUP_OBJ: + case ACL_GROUP: + if (maskallows && matched.ae_perm) + return 0; + break; + case 0: + break; + } + + return xfs_acl_capability_check(md, cr); +} + +/* + * ACL validity checker. + * This acl validation routine checks each ACL entry read in makes sense. + */ +STATIC int +xfs_acl_invalid( + xfs_acl_t *aclp) +{ + xfs_acl_entry_t *entry, *e; + int user = 0, group = 0, other = 0, mask = 0; + int mask_required = 0; + int i, j; + + if (!aclp) + goto acl_invalid; + + if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES) + goto acl_invalid; + + for (i = 0; i < aclp->acl_cnt; i++) { + entry = &aclp->acl_entry[i]; + switch (entry->ae_tag) { + case ACL_USER_OBJ: + if (user++) + goto acl_invalid; + break; + case ACL_GROUP_OBJ: + if (group++) + goto acl_invalid; + break; + case ACL_OTHER: + if (other++) + goto acl_invalid; + break; + case ACL_USER: + case ACL_GROUP: + for (j = i + 1; j < aclp->acl_cnt; j++) { + e = &aclp->acl_entry[j]; + if (e->ae_id == entry->ae_id && + e->ae_tag == entry->ae_tag) + goto acl_invalid; + } + mask_required++; + break; + case ACL_MASK: + if (mask++) + goto acl_invalid; + break; + default: + goto acl_invalid; + } + } + if (!user || !group || !other || (mask_required && !mask)) + goto acl_invalid; + else + return 0; +acl_invalid: + return EINVAL; +} + +/* + * Do ACL endian conversion. + */ +STATIC void +xfs_acl_get_endian( + xfs_acl_t *aclp) +{ + xfs_acl_entry_t *ace, *end; + + INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); + end = &aclp->acl_entry[0]+aclp->acl_cnt; + for (ace = &aclp->acl_entry[0]; ace < end; ace++) { + INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag); + INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id); + INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm); + } +} + +/* + * Get the ACL from the EA and do endian conversion. + */ +STATIC void +xfs_acl_get_attr( + vnode_t *vp, + xfs_acl_t *aclp, + int kind, + int flags, + int *error) +{ + int len = sizeof(xfs_acl_t); + + ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); + flags |= ATTR_ROOT; + VOP_ATTR_GET(vp, + kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT, + (char *)aclp, &len, flags, sys_cred, *error); + if (*error || (flags & ATTR_KERNOVAL)) + return; + xfs_acl_get_endian(aclp); +} + +/* + * Set the EA with the ACL and do endian conversion. + */ +STATIC void +xfs_acl_set_attr( + vnode_t *vp, + xfs_acl_t *aclp, + int kind, + int *error) +{ + xfs_acl_entry_t *ace, *newace, *end; + xfs_acl_t *newacl; + int len; + + if (!(_ACL_ALLOC(newacl))) { + *error = ENOMEM; + return; + } + + len = sizeof(xfs_acl_t) - + (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt)); + end = &aclp->acl_entry[0]+aclp->acl_cnt; + for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0]; + ace < end; + ace++, newace++) { + INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag); + INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id); + INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); + } + INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); + VOP_ATTR_SET(vp, + kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT, + (char *)newacl, len, ATTR_ROOT, sys_cred, *error); + _ACL_FREE(newacl); +} + +int +xfs_acl_vtoacl( + vnode_t *vp, + xfs_acl_t *access_acl, + xfs_acl_t *default_acl) +{ + vattr_t va; + int error = 0; + + if (access_acl) { + /* + * Get the Access ACL and the mode. If either cannot + * be obtained for some reason, invalidate the access ACL. + */ + xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error); + if (!error) { + /* Got the ACL, need the mode... */ + va.va_mask = XFS_AT_MODE; + VOP_GETATTR(vp, &va, 0, sys_cred, error); + } + + if (error) + access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; + else /* We have a good ACL and the file mode, synchronize. */ + xfs_acl_sync_mode(va.va_mode, access_acl); + } + + if (default_acl) { + xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error); + if (error) + default_acl->acl_cnt = XFS_ACL_NOT_PRESENT; + } + return error; +} + +/* + * This function retrieves the parent directory's acl, processes it + * and lets the child inherit the acl(s) that it should. + */ +int +xfs_acl_inherit( + vnode_t *vp, + vattr_t *vap, + xfs_acl_t *pdaclp) +{ + xfs_acl_t *cacl; + int error = 0; + int basicperms = 0; + + /* + * If the parent does not have a default ACL, or it's an + * invalid ACL, we're done. + */ + if (!vp) + return 0; + if (!pdaclp || xfs_acl_invalid(pdaclp)) + return 0; + + /* + * Copy the default ACL of the containing directory to + * the access ACL of the new file and use the mode that + * was passed in to set up the correct initial values for + * the u::,g::[m::], and o:: entries. This is what makes + * umask() "work" with ACL's. + */ + + if (!(_ACL_ALLOC(cacl))) + return ENOMEM; + + memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); + xfs_acl_filter_mode(vap->va_mode, cacl); + xfs_acl_setmode(vp, cacl, &basicperms); + + /* + * Set the Default and Access ACL on the file. The mode is already + * set on the file, so we don't need to worry about that. + * + * If the new file is a directory, its default ACL is a copy of + * the containing directory's default ACL. + */ + if (vp->v_type == VDIR) + xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); + if (!error && !basicperms) + xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); + _ACL_FREE(cacl); + return error; +} + +/* + * Set up the correct mode on the file based on the supplied ACL. This + * makes sure that the mode on the file reflects the state of the + * u::,g::[m::], and o:: entries in the ACL. Since the mode is where + * the ACL is going to get the permissions for these entries, we must + * synchronize the mode whenever we set the ACL on a file. + */ +STATIC int +xfs_acl_setmode( + vnode_t *vp, + xfs_acl_t *acl, + int *basicperms) +{ + vattr_t va; + xfs_acl_entry_t *ap; + xfs_acl_entry_t *gap = NULL; + int i, error, nomask = 1; + + *basicperms = 1; + + if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) + return 0; + + /* + * Copy the u::, g::, o::, and m:: bits from the ACL into the + * mode. The m:: bits take precedence over the g:: bits. + */ + va.va_mask = XFS_AT_MODE; + VOP_GETATTR(vp, &va, 0, sys_cred, error); + if (error) + return error; + + va.va_mask = XFS_AT_MODE; + va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); + ap = acl->acl_entry; + for (i = 0; i < acl->acl_cnt; ++i) { + switch (ap->ae_tag) { + case ACL_USER_OBJ: + va.va_mode |= ap->ae_perm << 6; + break; + case ACL_GROUP_OBJ: + gap = ap; + break; + case ACL_MASK: /* more than just standard modes */ + nomask = 0; + va.va_mode |= ap->ae_perm << 3; + *basicperms = 0; + break; + case ACL_OTHER: + va.va_mode |= ap->ae_perm; + break; + default: /* more than just standard modes */ + *basicperms = 0; + break; + } + ap++; + } + + /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */ + if (gap && nomask) + va.va_mode |= gap->ae_perm << 3; + + VOP_SETATTR(vp, &va, 0, sys_cred, error); + return error; +} + +/* + * The permissions for the special ACL entries (u::, g::[m::], o::) are + * actually stored in the file mode (if there is both a group and a mask, + * the group is stored in the ACL entry and the mask is stored on the file). + * This allows the mode to remain automatically in sync with the ACL without + * the need for a call-back to the ACL system at every point where the mode + * could change. This function takes the permissions from the specified mode + * and places it in the supplied ACL. + * + * This implementation draws its validity from the fact that, when the ACL + * was assigned, the mode was copied from the ACL. + * If the mode did not change, therefore, the mode remains exactly what was + * taken from the special ACL entries at assignment. + * If a subsequent chmod() was done, the POSIX spec says that the change in + * mode must cause an update to the ACL seen at user level and used for + * access checks. Before and after a mode change, therefore, the file mode + * most accurately reflects what the special ACL entries should permit/deny. + * + * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly, + * the existing mode bits will override whatever is in the + * ACL. Similarly, if there is a pre-existing ACL that was + * never in sync with its mode (owing to a bug in 6.5 and + * before), it will now magically (or mystically) be + * synchronized. This could cause slight astonishment, but + * it is better than inconsistent permissions. + * + * The supplied ACL is a template that may contain any combination + * of special entries. These are treated as place holders when we fill + * out the ACL. This routine does not add or remove special entries, it + * simply unites each special entry with its associated set of permissions. + */ +STATIC void +xfs_acl_sync_mode( + mode_t mode, + xfs_acl_t *acl) +{ + int i, nomask = 1; + xfs_acl_entry_t *ap; + xfs_acl_entry_t *gap = NULL; + + /* + * Set ACL entries. POSIX1003.1eD16 requires that the MASK + * be set instead of the GROUP entry, if there is a MASK. + */ + for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) { + switch (ap->ae_tag) { + case ACL_USER_OBJ: + ap->ae_perm = (mode >> 6) & 0x7; + break; + case ACL_GROUP_OBJ: + gap = ap; + break; + case ACL_MASK: + nomask = 0; + ap->ae_perm = (mode >> 3) & 0x7; + break; + case ACL_OTHER: + ap->ae_perm = mode & 0x7; + break; + default: + break; + } + } + /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */ + if (gap && nomask) + gap->ae_perm = (mode >> 3) & 0x7; +} + +/* + * When inheriting an Access ACL from a directory Default ACL, + * the ACL bits are set to the intersection of the ACL default + * permission bits and the file permission bits in mode. If there + * are no permission bits on the file then we must not give them + * the ACL. This is what what makes umask() work with ACLs. + */ +STATIC void +xfs_acl_filter_mode( + mode_t mode, + xfs_acl_t *acl) +{ + int i, nomask = 1; + xfs_acl_entry_t *ap; + xfs_acl_entry_t *gap = NULL; + + /* + * Set ACL entries. POSIX1003.1eD16 requires that the MASK + * be merged with GROUP entry, if there is a MASK. + */ + for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) { + switch (ap->ae_tag) { + case ACL_USER_OBJ: + ap->ae_perm &= (mode >> 6) & 0x7; + break; + case ACL_GROUP_OBJ: + gap = ap; + break; + case ACL_MASK: + nomask = 0; + ap->ae_perm &= (mode >> 3) & 0x7; + break; + case ACL_OTHER: + ap->ae_perm &= mode & 0x7; + break; + default: + break; + } + } + /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */ + if (gap && nomask) + gap->ae_perm &= (mode >> 3) & 0x7; +} diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h new file mode 100644 index 00000000000..0363eb46d35 --- /dev/null +++ b/fs/xfs/xfs_acl.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ACL_H__ +#define __XFS_ACL_H__ + +/* + * Access Control Lists + */ +typedef __uint16_t xfs_acl_perm_t; +typedef __int32_t xfs_acl_type_t; +typedef __int32_t xfs_acl_tag_t; +typedef __int32_t xfs_acl_id_t; + +#define XFS_ACL_MAX_ENTRIES 25 +#define XFS_ACL_NOT_PRESENT (-1) + +typedef struct xfs_acl_entry { + xfs_acl_tag_t ae_tag; + xfs_acl_id_t ae_id; + xfs_acl_perm_t ae_perm; +} xfs_acl_entry_t; + +typedef struct xfs_acl { + __int32_t acl_cnt; + xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES]; +} xfs_acl_t; + +/* On-disk XFS extended attribute names */ +#define SGI_ACL_FILE "SGI_ACL_FILE" +#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) + + +#ifdef CONFIG_XFS_POSIX_ACL + +struct vattr; +struct vnode; +struct xfs_inode; + +extern struct kmem_zone *xfs_acl_zone; +#define xfs_acl_zone_init(zone, name) \ + (zone) = kmem_zone_init(sizeof(xfs_acl_t), name) +#define xfs_acl_zone_destroy(zone) kmem_cache_destroy(zone) + +extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *); +extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); +extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *); +extern int xfs_acl_vhasacl_access(struct vnode *); +extern int xfs_acl_vhasacl_default(struct vnode *); +extern int xfs_acl_vset(struct vnode *, void *, size_t, int); +extern int xfs_acl_vget(struct vnode *, void *, size_t, int); +extern int xfs_acl_vremove(struct vnode *vp, int); + +#define _ACL_TYPE_ACCESS 1 +#define _ACL_TYPE_DEFAULT 2 +#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) + +#define _ACL_INHERIT(c,v,d) (xfs_acl_inherit(c,v,d)) +#define _ACL_GET_ACCESS(pv,pa) (xfs_acl_vtoacl(pv,pa,NULL) == 0) +#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0) +#define _ACL_ACCESS_EXISTS xfs_acl_vhasacl_access +#define _ACL_DEFAULT_EXISTS xfs_acl_vhasacl_default +#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1) + +#define _ACL_ALLOC(a) ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP)) +#define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0) + +#else +#define xfs_acl_zone_init(zone,name) +#define xfs_acl_zone_destroy(zone) +#define xfs_acl_vset(v,p,sz,t) (-EOPNOTSUPP) +#define xfs_acl_vget(v,p,sz,t) (-EOPNOTSUPP) +#define xfs_acl_vremove(v,t) (-EOPNOTSUPP) +#define xfs_acl_vhasacl_access(v) (0) +#define xfs_acl_vhasacl_default(v) (0) +#define _ACL_ALLOC(a) (1) /* successfully allocate nothing */ +#define _ACL_FREE(a) ((void)0) +#define _ACL_INHERIT(c,v,d) (0) +#define _ACL_GET_ACCESS(pv,pa) (0) +#define _ACL_GET_DEFAULT(pv,pd) (0) +#define _ACL_ACCESS_EXISTS (NULL) +#define _ACL_DEFAULT_EXISTS (NULL) +#define _ACL_XFS_IACCESS(i,m,c) (-1) +#endif + +#endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h new file mode 100644 index 00000000000..96b70f7fba3 --- /dev/null +++ b/fs/xfs/xfs_ag.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_AG_H__ +#define __XFS_AG_H__ + +/* + * Allocation group header + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ + +struct xfs_buf; +struct xfs_mount; +struct xfs_trans; + +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_GOOD_VERSION) +int xfs_agf_good_version(unsigned v); +#define XFS_AGF_GOOD_VERSION(v) xfs_agf_good_version(v) +#else +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_GOOD_VERSION) +int xfs_agi_good_version(unsigned v); +#define XFS_AGI_GOOD_VERSION(v) xfs_agi_good_version(v) +#else +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) +#endif + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf +{ + /* + * Common allocation group header information + */ + __uint32_t agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __uint32_t agf_versionnum; /* header version == XFS_AGF_VERSION */ + xfs_agnumber_t agf_seqno; /* sequence # starting from 0 */ + xfs_agblock_t agf_length; /* size in blocks of a.g. */ + /* + * Freespace information + */ + xfs_agblock_t agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __uint32_t agf_spare0; /* spare field */ + __uint32_t agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __uint32_t agf_spare1; /* spare field */ + __uint32_t agf_flfirst; /* first freelist block's index */ + __uint32_t agf_fllast; /* last freelist block's index */ + __uint32_t agf_flcount; /* count of blocks in freelist */ + xfs_extlen_t agf_freeblks; /* total free blocks */ + xfs_extlen_t agf_longest; /* longest free space */ +} xfs_agf_t; + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_NUM_BITS 11 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_BLOCK) +xfs_agblock_t xfs_agf_block(struct xfs_mount *mp); +#define XFS_AGF_BLOCK(mp) xfs_agf_block(mp) +#else +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#endif + +/* + * Size of the unlinked inode hash table in the agi. + */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi +{ + /* + * Common allocation group header information + */ + __uint32_t agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __uint32_t agi_versionnum; /* header version == XFS_AGI_VERSION */ + xfs_agnumber_t agi_seqno; /* sequence # starting from 0 */ + xfs_agblock_t agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + xfs_agino_t agi_count; /* count of allocated inodes */ + xfs_agblock_t agi_root; /* root of inode btree */ + __uint32_t agi_level; /* levels in inode btree */ + xfs_agino_t agi_freecount; /* number of free inodes */ + xfs_agino_t agi_newino; /* new inode just allocated */ + xfs_agino_t agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + xfs_agino_t agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; +} xfs_agi_t; + +#define XFS_AGI_MAGICNUM 0x00000001 +#define XFS_AGI_VERSIONNUM 0x00000002 +#define XFS_AGI_SEQNO 0x00000004 +#define XFS_AGI_LENGTH 0x00000008 +#define XFS_AGI_COUNT 0x00000010 +#define XFS_AGI_ROOT 0x00000020 +#define XFS_AGI_LEVEL 0x00000040 +#define XFS_AGI_FREECOUNT 0x00000080 +#define XFS_AGI_NEWINO 0x00000100 +#define XFS_AGI_DIRINO 0x00000200 +#define XFS_AGI_UNLINKED 0x00000400 +#define XFS_AGI_NUM_BITS 11 +#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_BLOCK) +xfs_agblock_t xfs_agi_block(struct xfs_mount *mp); +#define XFS_AGI_BLOCK(mp) xfs_agi_block(mp) +#else +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#endif + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGFL_BLOCK) +xfs_agblock_t xfs_agfl_block(struct xfs_mount *mp); +#define XFS_AGFL_BLOCK(mp) xfs_agfl_block(mp) +#else +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#endif +#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) + +typedef struct xfs_agfl { + xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +/* + * Busy block/extent entry. Used in perag to mark blocks that have been freed + * but whose transactions aren't committed to disk yet. + */ +typedef struct xfs_perag_busy { + xfs_agblock_t busy_start; + xfs_extlen_t busy_length; + struct xfs_trans *busy_tp; /* transaction that did the free */ +} xfs_perag_busy_t; + +/* + * Per-ag incore structure, copies of information in agf and agi, + * to improve the performance of allocation group selection. + * + * pick sizes which fit in allocation buckets well + */ +#if (BITS_PER_LONG == 32) +#define XFS_PAGB_NUM_SLOTS 84 +#elif (BITS_PER_LONG == 64) +#define XFS_PAGB_NUM_SLOTS 128 +#endif + +typedef struct xfs_perag +{ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is prefered to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + xfs_agino_t pagi_freecount; /* number of free inodes */ +#ifdef __KERNEL__ + lock_t pagb_lock; /* lock for pagb_list */ +#endif + int pagb_count; /* pagb slots in use */ + xfs_perag_busy_t *pagb_list; /* unstable blocks */ +} xfs_perag_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAXLEVELS) +int xfs_ag_maxlevels(struct xfs_mount *mp); +#define XFS_AG_MAXLEVELS(mp) xfs_ag_maxlevels(mp) +#else +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST) +int xfs_min_freelist(xfs_agf_t *a, struct xfs_mount *mp); +#define XFS_MIN_FREELIST(a,mp) xfs_min_freelist(a,mp) +#else +#define XFS_MIN_FREELIST(a,mp) \ + XFS_MIN_FREELIST_RAW( \ + INT_GET((a)->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT), \ + INT_GET((a)->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT), mp) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_PAG) +int xfs_min_freelist_pag(xfs_perag_t *pag, struct xfs_mount *mp); +#define XFS_MIN_FREELIST_PAG(pag,mp) xfs_min_freelist_pag(pag,mp) +#else +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + XFS_MIN_FREELIST_RAW((uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_RAW) +int xfs_min_freelist_raw(int bl, int cl, struct xfs_mount *mp); +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) xfs_min_freelist_raw(bl,cl,mp) +#else +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + \ + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_FSB) +xfs_fsblock_t xfs_agb_to_fsb(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +#define XFS_AGB_TO_FSB(mp,agno,agbno) xfs_agb_to_fsb(mp,agno,agbno) +#else +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGNO) +xfs_agnumber_t xfs_fsb_to_agno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +#define XFS_FSB_TO_AGNO(mp,fsbno) xfs_fsb_to_agno(mp,fsbno) +#else +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGBNO) +xfs_agblock_t xfs_fsb_to_agbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); +#define XFS_FSB_TO_AGBNO(mp,fsbno) xfs_fsb_to_agbno(mp,fsbno) +#else +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_DADDR) +xfs_daddr_t xfs_agb_to_daddr(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno); +#define XFS_AGB_TO_DADDR(mp,agno,agbno) xfs_agb_to_daddr(mp,agno,agbno) +#else +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)(XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))) +#endif +/* + * XFS_DADDR_TO_AGNO and XFS_DADDR_TO_AGBNO moved to xfs_mount.h + * to avoid header file ordering change + */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_DADDR) +xfs_daddr_t xfs_ag_daddr(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_daddr_t d); +#define XFS_AG_DADDR(mp,agno,d) xfs_ag_daddr(mp,agno,d) +#else +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGF) +xfs_agf_t *xfs_buf_to_agf(struct xfs_buf *bp); +#define XFS_BUF_TO_AGF(bp) xfs_buf_to_agf(bp) +#else +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGI) +xfs_agi_t *xfs_buf_to_agi(struct xfs_buf *bp); +#define XFS_BUF_TO_AGI(bp) xfs_buf_to_agi(bp) +#else +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGFL) +xfs_agfl_t *xfs_buf_to_agfl(struct xfs_buf *bp); +#define XFS_BUF_TO_AGFL(bp) xfs_buf_to_agfl(bp) +#else +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) +#endif + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_CHECK_DADDR) +void xfs_ag_check_daddr(struct xfs_mount *mp, xfs_daddr_t d, xfs_extlen_t len); +#define XFS_AG_CHECK_DADDR(mp,d,len) xfs_ag_check_daddr(mp,d,len) +#else +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? \ + ASSERT((d) == XFS_SB_DADDR || \ + XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ + ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ + XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) +#endif + +#endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c new file mode 100644 index 00000000000..36603db10fe --- /dev/null +++ b/fs/xfs/xfs_alloc.c @@ -0,0 +1,2623 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Free space allocation for XFS. + */ +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include "xfs_error.h" + + +#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) + +#define XFSA_FIXUP_BNO_OK 1 +#define XFSA_FIXUP_CNT_OK 2 + +int +xfs_alloc_search_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len); + +#if defined(XFS_ALLOC_TRACE) +ktrace_t *xfs_alloc_trace_buf; + +#define TRACE_ALLOC(s,a) \ + xfs_alloc_trace_alloc(fname, s, a, __LINE__) +#define TRACE_FREE(s,a,b,x,f) \ + xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__) +#define TRACE_MODAGF(s,a,f) \ + xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__) +#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) +#define TRACE_UNBUSY(fname,s,ag,sl,tp) \ + xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) +#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \ + xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) +#else +#define TRACE_ALLOC(s,a) +#define TRACE_FREE(s,a,b,x,f) +#define TRACE_MODAGF(s,a,f) +#define TRACE_BUSY(s,a,ag,agb,l,sl,tp) +#define TRACE_UNBUSY(fname,s,ag,sl,tp) +#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) +#endif /* XFS_ALLOC_TRACE */ + +/* + * Prototypes for per-ag allocation routines + */ + +STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); + +/* + * Internal functions. + */ + +/* + * Compute aligned version of the found extent. + * Takes alignment and min length into account. + */ +STATIC int /* success (>= minlen) */ +xfs_alloc_compute_aligned( + xfs_agblock_t foundbno, /* starting block in found extent */ + xfs_extlen_t foundlen, /* length in found extent */ + xfs_extlen_t alignment, /* alignment for allocation */ + xfs_extlen_t minlen, /* minimum length for allocation */ + xfs_agblock_t *resbno, /* result block number */ + xfs_extlen_t *reslen) /* result length */ +{ + xfs_agblock_t bno; + xfs_extlen_t diff; + xfs_extlen_t len; + + if (alignment > 1 && foundlen >= minlen) { + bno = roundup(foundbno, alignment); + diff = bno - foundbno; + len = diff >= foundlen ? 0 : foundlen - diff; + } else { + bno = foundbno; + len = foundlen; + } + *resbno = bno; + *reslen = len; + return len >= minlen; +} + +/* + * Compute best start block and diff for "near" allocations. + * freelen >= wantlen already checked by caller. + */ +STATIC xfs_extlen_t /* difference value (absolute) */ +xfs_alloc_compute_diff( + xfs_agblock_t wantbno, /* target starting block */ + xfs_extlen_t wantlen, /* target length */ + xfs_extlen_t alignment, /* target alignment */ + xfs_agblock_t freebno, /* freespace's starting block */ + xfs_extlen_t freelen, /* freespace's length */ + xfs_agblock_t *newbnop) /* result: best start block from free */ +{ + xfs_agblock_t freeend; /* end of freespace extent */ + xfs_agblock_t newbno1; /* return block number */ + xfs_agblock_t newbno2; /* other new block number */ + xfs_extlen_t newlen1=0; /* length with newbno1 */ + xfs_extlen_t newlen2=0; /* length with newbno2 */ + xfs_agblock_t wantend; /* end of target extent */ + + ASSERT(freelen >= wantlen); + freeend = freebno + freelen; + wantend = wantbno + wantlen; + if (freebno >= wantbno) { + if ((newbno1 = roundup(freebno, alignment)) >= freeend) + newbno1 = NULLAGBLOCK; + } else if (freeend >= wantend && alignment > 1) { + newbno1 = roundup(wantbno, alignment); + newbno2 = newbno1 - alignment; + if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + else + newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); + if (newbno2 < freebno) + newbno2 = NULLAGBLOCK; + else + newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); + if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { + if (newlen1 < newlen2 || + (newlen1 == newlen2 && + XFS_ABSDIFF(newbno1, wantbno) > + XFS_ABSDIFF(newbno2, wantbno))) + newbno1 = newbno2; + } else if (newbno2 != NULLAGBLOCK) + newbno1 = newbno2; + } else if (freeend >= wantend) { + newbno1 = wantbno; + } else if (alignment > 1) { + newbno1 = roundup(freeend - wantlen, alignment); + if (newbno1 > freeend - wantlen && + newbno1 - alignment >= freebno) + newbno1 -= alignment; + else if (newbno1 >= freeend) + newbno1 = NULLAGBLOCK; + } else + newbno1 = freeend - wantlen; + *newbnop = newbno1; + return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); +} + +/* + * Fix up the length, based on mod and prod. + * len should be k * prod + mod for some k. + * If len is too small it is returned unchanged. + * If len hits maxlen it is left alone. + */ +STATIC void +xfs_alloc_fix_len( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_extlen_t k; + xfs_extlen_t rlen; + + ASSERT(args->mod < args->prod); + rlen = args->len; + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || + (args->mod == 0 && rlen < args->prod)) + return; + k = rlen % args->prod; + if (k == args->mod) + return; + if (k > args->mod) { + if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) + return; + } else { + if ((int)(rlen = rlen - args->prod - (args->mod - k)) < + (int)args->minlen) + return; + } + ASSERT(rlen >= args->minlen); + ASSERT(rlen <= args->maxlen); + args->len = rlen; +} + +/* + * Fix up length if there is too little space left in the a.g. + * Return 1 if ok, 0 if too little, should give up. + */ +STATIC int +xfs_alloc_fix_minleft( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agf_t *agf; /* a.g. freelist header */ + int diff; /* free space difference */ + + if (args->minleft == 0) + return 1; + agf = XFS_BUF_TO_AGF(args->agbp); + diff = INT_GET(agf->agf_freeblks, ARCH_CONVERT) + + INT_GET(agf->agf_flcount, ARCH_CONVERT) + - args->len - args->minleft; + if (diff >= 0) + return 1; + args->len += diff; /* shrink the allocated space */ + if (args->len >= args->minlen) + return 1; + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Update the two btrees, logically removing from freespace the extent + * starting at rbno, rlen blocks. The extent is contained within the + * actual (current) free extent fbno for flen blocks. + * Flags are passed in indicating whether the cursors are set to the + * relevant records. + */ +STATIC int /* error code */ +xfs_alloc_fixup_trees( + xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ + xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + xfs_agblock_t fbno, /* starting block of free extent */ + xfs_extlen_t flen, /* length of free extent */ + xfs_agblock_t rbno, /* starting block of returned extent */ + xfs_extlen_t rlen, /* length of returned extent */ + int flags) /* flags, XFSA_FIXUP_... */ +{ + int error; /* error code */ + int i; /* operation results */ + xfs_agblock_t nfbno1; /* first new free startblock */ + xfs_agblock_t nfbno2; /* second new free startblock */ + xfs_extlen_t nflen1=0; /* first new free length */ + xfs_extlen_t nflen2=0; /* second new free length */ + + /* + * Look up the record in the by-size tree if necessary. + */ + if (flags & XFSA_FIXUP_CNT_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Look up the record in the by-block tree if necessary. + */ + if (flags & XFSA_FIXUP_BNO_OK) { +#ifdef DEBUG + if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN( + i == 1 && nfbno1 == fbno && nflen1 == flen); +#endif + } else { + if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } +#ifdef DEBUG + { + xfs_alloc_block_t *bnoblock; + xfs_alloc_block_t *cntblock; + + if (bno_cur->bc_nlevels == 1 && + cnt_cur->bc_nlevels == 1) { + bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); + XFS_WANT_CORRUPTED_RETURN( + INT_GET(bnoblock->bb_numrecs, ARCH_CONVERT) == INT_GET(cntblock->bb_numrecs, ARCH_CONVERT)); + } + } +#endif + /* + * Deal with all four cases: the allocated record is contained + * within the freespace record, so we can have new freespace + * at either (or both) end, or no freespace remaining. + */ + if (rbno == fbno && rlen == flen) + nfbno1 = nfbno2 = NULLAGBLOCK; + else if (rbno == fbno) { + nfbno1 = rbno + rlen; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else if (rbno + rlen == fbno + flen) { + nfbno1 = fbno; + nflen1 = flen - rlen; + nfbno2 = NULLAGBLOCK; + } else { + nfbno1 = fbno; + nflen1 = rbno - fbno; + nfbno2 = rbno + rlen; + nflen2 = (fbno + flen) - nfbno2; + } + /* + * Delete the entry from the by-size btree. + */ + if ((error = xfs_alloc_delete(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + /* + * Add new by-size btree entry(s). + */ + if (nfbno1 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_alloc_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + if (nfbno2 != NULLAGBLOCK) { + if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_alloc_insert(cnt_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + /* + * Fix up the by-block btree entry(s). + */ + if (nfbno1 == NULLAGBLOCK) { + /* + * No remaining freespace, just delete the by-block tree entry. + */ + if ((error = xfs_alloc_delete(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } else { + /* + * Update the by-block entry to start later|be shorter. + */ + if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) + return error; + } + if (nfbno2 != NULLAGBLOCK) { + /* + * 2 resulting free entries, need to add one. + */ + if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 0); + if ((error = xfs_alloc_insert(bno_cur, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + return 0; +} + +/* + * Read in the allocation group free block array. + */ +STATIC int /* error */ +xfs_alloc_read_agfl( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_buf_t **bpp) /* buffer for the ag free block array */ +{ + xfs_buf_t *bp; /* return value */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + *bpp = bp; + return 0; +} + +#if defined(XFS_ALLOC_TRACE) +/* + * Add an allocation trace entry for an alloc call. + */ +STATIC void +xfs_alloc_trace_alloc( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_alloc_arg_t *args, /* allocation argument structure */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)), + (void *)name, + (void *)str, + (void *)args->mp, + (void *)(__psunsigned_t)args->agno, + (void *)(__psunsigned_t)args->agbno, + (void *)(__psunsigned_t)args->minlen, + (void *)(__psunsigned_t)args->maxlen, + (void *)(__psunsigned_t)args->mod, + (void *)(__psunsigned_t)args->prod, + (void *)(__psunsigned_t)args->minleft, + (void *)(__psunsigned_t)args->total, + (void *)(__psunsigned_t)args->alignment, + (void *)(__psunsigned_t)args->len, + (void *)((((__psint_t)args->type) << 16) | + (__psint_t)args->otype), + (void *)(__psint_t)((args->wasdel << 3) | + (args->wasfromfl << 2) | + (args->isfl << 1) | + (args->userdata << 0))); +} + +/* + * Add an allocation trace entry for a free call. + */ +STATIC void +xfs_alloc_trace_free( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* a.g. relative block number */ + xfs_extlen_t len, /* length of extent */ + int isfl, /* set if is freelist allocation/free */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)), + (void *)name, + (void *)str, + (void *)mp, + (void *)(__psunsigned_t)agno, + (void *)(__psunsigned_t)agbno, + (void *)(__psunsigned_t)len, + (void *)(__psint_t)isfl, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add an allocation trace entry for modifying an agf. + */ +STATIC void +xfs_alloc_trace_modagf( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_mount_t *mp, /* file system mount point */ + xfs_agf_t *agf, /* new agf value */ + int flags, /* logging flags for agf */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)), + (void *)name, + (void *)str, + (void *)mp, + (void *)(__psint_t)flags, + (void *)(__psunsigned_t)INT_GET(agf->agf_seqno, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_length, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_BNO], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_CNT], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_BNO], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_CNT], + ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_flfirst, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_fllast, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_flcount, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_freeblks, ARCH_CONVERT), + (void *)(__psunsigned_t)INT_GET(agf->agf_longest, ARCH_CONVERT)); +} + +STATIC void +xfs_alloc_trace_busy( + char *name, /* function tag string */ + char *str, /* additional string */ + xfs_mount_t *mp, /* file system mount poing */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* a.g. relative block number */ + xfs_extlen_t len, /* length of extent */ + int slot, /* perag Busy slot */ + xfs_trans_t *tp, + int trtype, /* type: add, delete, search */ + int line) /* source line number */ +{ + ktrace_enter(xfs_alloc_trace_buf, + (void *)(__psint_t)(trtype | (line << 16)), + (void *)name, + (void *)str, + (void *)mp, + (void *)(__psunsigned_t)agno, + (void *)(__psunsigned_t)agbno, + (void *)(__psunsigned_t)len, + (void *)(__psint_t)slot, + (void *)tp, + NULL, NULL, NULL, NULL, NULL, NULL, NULL); +} +#endif /* XFS_ALLOC_TRACE */ + +/* + * Allocation group level functions. + */ + +/* + * Allocate a variable extent in the allocation group agno. + * Type and bno are used to determine where in the allocation group the + * extent will start. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent( + xfs_alloc_arg_t *args) /* argument structure for allocation */ +{ + int error=0; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent"; +#endif + + ASSERT(args->minlen > 0); + ASSERT(args->maxlen > 0); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->mod < args->prod); + ASSERT(args->alignment > 0); + /* + * Branch to correct routine based on the type. + */ + args->wasfromfl = 0; + switch (args->type) { + case XFS_ALLOCTYPE_THIS_AG: + error = xfs_alloc_ag_vextent_size(args); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_alloc_ag_vextent_near(args); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_alloc_ag_vextent_exact(args); + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (error) + return error; + /* + * If the allocation worked, need to change the agf structure + * (and log it), and the superblock. + */ + if (args->agbno != NULLAGBLOCK) { + xfs_agf_t *agf; /* allocation group freelist header */ +#ifdef XFS_ALLOC_TRACE + xfs_mount_t *mp = args->mp; +#endif + long slen = (long)args->len; + + ASSERT(args->len >= args->minlen && args->len <= args->maxlen); + ASSERT(!(args->wasfromfl) || !args->isfl); + ASSERT(args->agbno % args->alignment == 0); + if (!(args->wasfromfl)) { + + agf = XFS_BUF_TO_AGF(args->agbp); + INT_MOD(agf->agf_freeblks, ARCH_CONVERT, -(args->len)); + xfs_trans_agblocks_delta(args->tp, + -((long)(args->len))); + args->pag->pagf_freeblks -= args->len; + ASSERT(INT_GET(agf->agf_freeblks, ARCH_CONVERT) + <= INT_GET(agf->agf_length, ARCH_CONVERT)); + TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); + xfs_alloc_log_agf(args->tp, args->agbp, + XFS_AGF_FREEBLKS); + /* search the busylist for these blocks */ + xfs_alloc_search_busy(args->tp, args->agno, + args->agbno, args->len); + } + if (!args->isfl) + xfs_trans_mod_sb(args->tp, + args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS, -slen); + XFS_STATS_INC(xs_allocx); + XFS_STATS_ADD(xs_allocb, args->len); + } + return 0; +} + +/* + * Allocate a variable extent at exactly agno/bno. + * Extent's length (returned in *len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_exact( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ + xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + xfs_agblock_t end; /* end of allocated extent */ + int error; + xfs_agblock_t fbno; /* start block of found extent */ + xfs_agblock_t fend; /* end block of found extent */ + xfs_extlen_t flen; /* length of found extent */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_exact"; +#endif + int i; /* success/failure of operation */ + xfs_agblock_t maxend; /* end of maximal extent */ + xfs_agblock_t minend; /* end of minimal extent */ + xfs_extlen_t rlen; /* length of returned extent */ + + ASSERT(args->alignment == 1); + /* + * Allocate/initialize a cursor for the by-number freespace btree. + */ + bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO, NULL, 0); + /* + * Lookup bno and minlen in the btree (minlen is irrelevant, really). + * Look for the closest free block <= bno, it must contain bno + * if any free block does. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find it, return null. + */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + return 0; + } + /* + * Grab the freespace record. + */ + if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ASSERT(fbno <= args->agbno); + minend = args->agbno + args->minlen; + maxend = args->agbno + args->maxlen; + fend = fbno + flen; + /* + * Give up if the freespace isn't long enough for the minimum request. + */ + if (fend < minend) { + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + return 0; + } + /* + * End of extent will be smaller of the freespace end and the + * maximal requested end. + */ + end = XFS_AGBLOCK_MIN(fend, maxend); + /* + * Fix the length according to mod and prod if given. + */ + args->len = end - args->agbno; + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + ASSERT(args->agbno + rlen <= fend); + end = args->agbno + rlen; + /* + * We are allocating agbno for rlen [agbno .. end] + * Allocate/initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT, NULL, 0); + ASSERT(args->agbno + args->len <= + INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT)); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + goto error0; + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("normal", args); + args->wasfromfl = 0; + return 0; + +error0: + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + TRACE_ALLOC("error", args); + return error; +} + +/* + * Allocate a variable extent near bno in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_near( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ + xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ + xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_near"; +#endif + xfs_agblock_t gtbno; /* start bno of right side entry */ + xfs_agblock_t gtbnoa; /* aligned ... */ + xfs_extlen_t gtdiff; /* difference to right side entry */ + xfs_extlen_t gtlen; /* length of right side entry */ + xfs_extlen_t gtlena; /* aligned ... */ + xfs_agblock_t gtnew; /* useful start bno of right side */ + int error; /* error code */ + int i; /* result code, temporary */ + int j; /* result code, temporary */ + xfs_agblock_t ltbno; /* start bno of left side entry */ + xfs_agblock_t ltbnoa; /* aligned ... */ + xfs_extlen_t ltdiff; /* difference to left side entry */ + /*REFERENCED*/ + xfs_agblock_t ltend; /* end bno of left side entry */ + xfs_extlen_t ltlen; /* length of left side entry */ + xfs_extlen_t ltlena; /* aligned ... */ + xfs_agblock_t ltnew; /* useful start bno of left side */ + xfs_extlen_t rlen; /* length of returned extent */ +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Randomly don't execute the first algorithm. + */ + int dofirst; /* set to do first algorithm */ + + dofirst = random() & 1; +#endif + /* + * Get a cursor for the by-size btree. + */ + cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT, NULL, 0); + ltlen = 0; + bno_cur_lt = bno_cur_gt = NULL; + /* + * See if there are any free extents as big as maxlen. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, + <len, &i))) + goto error0; + if (i == 0 || ltlen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + ASSERT(i == 1); + } + args->wasfromfl = 0; + /* + * First algorithm. + * If the requested extent is large wrt the freespaces available + * in this a.g., then the cursor will be pointing to a btree entry + * near the right edge of the tree. If it's in the last btree leaf + * block, then we just examine all the entries in that block + * that are big enough, and pick the best one. + * This is written as a while loop so we can break out of it, + * but we never loop back to the top. + */ + while (xfs_btree_islastblock(cnt_cur, 0)) { + xfs_extlen_t bdiff; + int besti=0; + xfs_extlen_t blen=0; + xfs_agblock_t bnew=0; + +#if defined(DEBUG) && defined(__KERNEL__) + if (!dofirst) + break; +#endif + /* + * Start from the entry that lookup found, sequence through + * all larger free blocks. If we're actually pointing at a + * record smaller than maxlen, go to the start of this block, + * and skip all those smaller than minlen. + */ + if (ltlen || args->alignment > 1) { + cnt_cur->bc_ptrs[0] = 1; + do { + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (ltlen >= args->minlen) + break; + if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) + goto error0; + } while (i); + ASSERT(ltlen >= args->minlen); + if (!i) + break; + } + i = cnt_cur->bc_ptrs[0]; + for (j = 1, blen = 0, bdiff = 0; + !error && j && (blen < args->maxlen || bdiff > 0); + error = xfs_alloc_increment(cnt_cur, 0, &j)) { + /* + * For each entry, decide if it's better than + * the previous best entry. + */ + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (!xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena)) + continue; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < blen) + continue; + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, ltbno, ltlen, <new); + if (ltnew != NULLAGBLOCK && + (args->len > blen || ltdiff < bdiff)) { + bdiff = ltdiff; + bnew = ltnew; + blen = args->len; + besti = cnt_cur->bc_ptrs[0]; + } + } + /* + * It didn't work. We COULD be in a case where + * there's a good record somewhere, so try again. + */ + if (blen == 0) + break; + /* + * Point at the best entry, and retrieve it again. + */ + cnt_cur->bc_ptrs[0] = besti; + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + ltend = ltbno + ltlen; + ASSERT(ltend <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT)); + args->len = blen; + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("nominleft", args); + return 0; + } + blen = args->len; + /* + * We are allocating starting at bnew for blen blocks. + */ + args->agbno = bnew; + ASSERT(bnew >= ltbno); + ASSERT(bnew + blen <= ltend); + /* + * Set up a cursor for the by-bno tree. + */ + bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); + /* + * Fix up the btree entries. + */ + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, + ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + TRACE_ALLOC("first", args); + return 0; + } + /* + * Second algorithm. + * Search in the by-bno tree to the left and to the right + * simultaneously, until in each case we find a space big enough, + * or run into the edge of the tree. When we run into the edge, + * we deallocate that cursor. + * If both searches succeed, we compare the two spaces and pick + * the better one. + * With alignment, it's possible for both to fail; the upper + * level algorithm that picks allocation groups for allocations + * is not supposed to do this. + */ + /* + * Allocate and initialize the cursor for the leftward search. + */ + bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO, NULL, 0); + /* + * Lookup <= bno to find the leftward search's starting point. + */ + if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find anything; use this cursor for the rightward + * search. + */ + bno_cur_gt = bno_cur_lt; + bno_cur_lt = NULL; + } + /* + * Found something. Duplicate the cursor for the rightward search. + */ + else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) + goto error0; + /* + * Increment the cursor, so we will point at the entry just right + * of the leftward entry if any, or to the leftmost entry. + */ + if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + /* + * It failed, there are no rightward entries. + */ + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Loop going left with the leftward cursor, right with the + * rightward cursor, until either both directions give up or + * we find an entry at least as big as minlen. + */ + do { + if (bno_cur_lt) { + if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena)) + break; + if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + if (bno_cur_gt) { + if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (xfs_alloc_compute_aligned(gtbno, gtlen, + args->alignment, args->minlen, + >bnoa, >lena)) + break; + if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + } while (bno_cur_lt || bno_cur_gt); + /* + * Got both cursors still active, need to find better entry. + */ + if (bno_cur_lt && bno_cur_gt) { + /* + * Left side is long enough, look for a right side entry. + */ + if (ltlena >= args->minlen) { + /* + * Fix up the length. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, + args->alignment, ltbno, ltlen, <new); + /* + * Not perfect. + */ + if (ltdiff) { + /* + * Look until we find a better one, run out of + * space, or run off the end. + */ + while (bno_cur_lt && bno_cur_gt) { + if ((error = xfs_alloc_get_rec( + bno_cur_gt, >bno, + >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(gtbno, gtlen, + args->alignment, args->minlen, + >bnoa, >lena); + /* + * The left one is clearly better. + */ + if (gtbnoa >= args->agbno + ltdiff) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + break; + } + /* + * If we reach a big enough entry, + * compare the two and pick the best. + */ + if (gtlena >= args->minlen) { + args->len = + XFS_EXTLEN_MIN(gtlena, + args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + gtdiff = xfs_alloc_compute_diff( + args->agbno, rlen, + args->alignment, + gtbno, gtlen, >new); + /* + * Right side is better. + */ + if (gtdiff < ltdiff) { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + /* + * Left side is better. + */ + else { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + break; + } + /* + * Fell off the right end. + */ + if ((error = xfs_alloc_increment( + bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + break; + } + } + } + /* + * The left side is perfect, trash the right side. + */ + else { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + /* + * It's the right side that was found first, look left. + */ + else { + /* + * Fix up the length. + */ + args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, + args->alignment, gtbno, gtlen, >new); + /* + * Right side entry isn't perfect. + */ + if (gtdiff) { + /* + * Look until we find a better one, run out of + * space, or run off the end. + */ + while (bno_cur_lt && bno_cur_gt) { + if ((error = xfs_alloc_get_rec( + bno_cur_lt, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(ltbno, ltlen, + args->alignment, args->minlen, + <bnoa, <lena); + /* + * The right one is clearly better. + */ + if (ltbnoa <= args->agbno - gtdiff) { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + break; + } + /* + * If we reach a big enough entry, + * compare the two and pick the best. + */ + if (ltlena >= args->minlen) { + args->len = XFS_EXTLEN_MIN( + ltlena, args->maxlen); + xfs_alloc_fix_len(args); + rlen = args->len; + ltdiff = xfs_alloc_compute_diff( + args->agbno, rlen, + args->alignment, + ltbno, ltlen, <new); + /* + * Left side is better. + */ + if (ltdiff < gtdiff) { + xfs_btree_del_cursor( + bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Right side is better. + */ + else { + xfs_btree_del_cursor( + bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + break; + } + /* + * Fell off the left end. + */ + if ((error = xfs_alloc_decrement( + bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + break; + } + } + } + /* + * The right side is perfect, trash the left side. + */ + else { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + } + /* + * If we couldn't get anything, give up. + */ + if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + TRACE_ALLOC("neither", args); + args->agbno = NULLAGBLOCK; + return 0; + } + /* + * At this point we have selected a freespace entry, either to the + * left or to the right. If it's on the right, copy all the + * useful variables to the "left" set so we only have one + * copy of this code. + */ + if (bno_cur_gt) { + bno_cur_lt = bno_cur_gt; + bno_cur_gt = NULL; + ltbno = gtbno; + ltbnoa = gtbnoa; + ltlen = gtlen; + ltlena = gtlena; + j = 1; + } else + j = 0; + /* + * Fix up the length and compute the useful address. + */ + ltend = ltbno + ltlen; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + TRACE_ALLOC("nominleft", args); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, + ltlen, <new); + ASSERT(ltnew >= ltbno); + ASSERT(ltnew + rlen <= ltend); + ASSERT(ltnew + rlen <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT)); + args->agbno = ltnew; + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, + ltnew, rlen, XFSA_FIXUP_BNO_OK))) + goto error0; + TRACE_ALLOC(j ? "gt" : "lt", args); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + return 0; + + error0: + TRACE_ALLOC("error", args); + if (cnt_cur != NULL) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur_lt != NULL) + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); + if (bno_cur_gt != NULL) + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); + return error; +} + +/* + * Allocate a variable extent anywhere in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_size( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + int error; /* error result */ + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_size"; +#endif + int i; /* temp status variable */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + + /* + * Allocate and initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT, NULL, 0); + bno_cur = NULL; + /* + * Look for an entry >= maxlen+alignment-1 blocks. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, + args->maxlen + args->alignment - 1, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. + */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, + &flen, &i))) + goto error0; + if (i == 0 || flen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("noentry", args); + return 0; + } + ASSERT(i == 1); + } + /* + * There's a freespace as big as maxlen+alignment-1, get it. + */ + else { + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + /* + * In the first case above, we got the last entry in the + * by-size btree. Now we check to see if the space hits maxlen + * once aligned; if not, we search left for something better. + * This can't happen in the second case above. + */ + xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, + &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (rlen < args->maxlen) { + xfs_agblock_t bestfbno; + xfs_extlen_t bestflen; + xfs_agblock_t bestrbno; + xfs_extlen_t bestrlen; + + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + for (;;) { + if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) + goto error0; + if (i == 0) + break; + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (flen < bestrlen) + break; + xfs_alloc_compute_aligned(fbno, flen, args->alignment, + args->minlen, &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), + error0); + if (rlen > bestrlen) { + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + if (rlen == args->maxlen) + break; + } + } + if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + rlen = bestrlen; + rbno = bestrbno; + flen = bestflen; + fbno = bestfbno; + } + args->wasfromfl = 0; + /* + * Fix up the length. + */ + args->len = rlen; + xfs_alloc_fix_len(args); + if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + TRACE_ALLOC("nominleft", args); + args->agbno = NULLAGBLOCK; + return 0; + } + rlen = args->len; + XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + /* + * Allocate and initialize a cursor for the by-block tree. + */ + bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO, NULL, 0); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + rbno, rlen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + cnt_cur = bno_cur = NULL; + args->len = rlen; + args->agbno = rbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT), + error0); + TRACE_ALLOC("normal", args); + return 0; + +error0: + TRACE_ALLOC("error", args); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Deal with the case where only small freespaces remain. + * Either return the contents of the last freespace record, + * or allocate space from the freelist if there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_btree_cur_t *ccur, /* by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error; + xfs_agblock_t fbno; + xfs_extlen_t flen; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_ag_vextent_small"; +#endif + int i; + + if ((error = xfs_alloc_decrement(ccur, 0, &i))) + goto error0; + if (i) { + if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + /* + * Nothing in the btree, try the freelist. Make sure + * to respect minleft even when pulling from the + * freelist. + */ + else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + (INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_flcount, + ARCH_CONVERT) > args->minleft)) { + if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) + goto error0; + if (fbno != NULLAGBLOCK) { + if (args->userdata) { + xfs_buf_t *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); + xfs_trans_binval(args->tp, bp); + } + args->len = 1; + args->agbno = fbno; + XFS_WANT_CORRUPTED_GOTO( + args->agbno + args->len <= + INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length, + ARCH_CONVERT), + error0); + args->wasfromfl = 1; + TRACE_ALLOC("freelist", args); + *stat = 0; + return 0; + } + /* + * Nothing in the freelist. + */ + else + flen = 0; + } + /* + * Can't allocate from the freelist for some reason. + */ + else + flen = 0; + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + TRACE_ALLOC("notenough", args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + TRACE_ALLOC("normal", args); + return 0; + +error0: + TRACE_ALLOC("error", args); + return error; +} + +/* + * Free the extent starting at agno/bno for length. + */ +STATIC int /* error */ +xfs_free_ag_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t bno, /* starting block number */ + xfs_extlen_t len, /* length of extent */ + int isfl) /* set if is freelist blocks - no sb acctg */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ + int error; /* error return value */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_free_ag_extent"; +#endif + xfs_agblock_t gtbno; /* start of right neighbor block */ + xfs_extlen_t gtlen; /* length of right neighbor block */ + int haveleft; /* have a left neighbor block */ + int haveright; /* have a right neighbor block */ + int i; /* temp, result code */ + xfs_agblock_t ltbno; /* start of left neighbor block */ + xfs_extlen_t ltlen; /* length of left neighbor block */ + xfs_mount_t *mp; /* mount point struct for filesystem */ + xfs_agblock_t nbno; /* new starting block of freespace */ + xfs_extlen_t nlen; /* new length of freespace */ + + mp = tp->t_mountp; + /* + * Allocate and initialize a cursor for the by-block btree. + */ + bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, + 0); + cnt_cur = NULL; + /* + * Look for a neighboring block on the left (lower block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) + goto error0; + if (haveleft) { + /* + * There is a block to our left. + */ + if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (ltbno + ltlen < bno) + haveleft = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + } + } + /* + * Look for a neighboring block on the right (higher block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) + goto error0; + if (haveright) { + /* + * There is a block to our right. + */ + if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * It's not contiguous, though. + */ + if (bno + len < gtbno) + haveright = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + } + } + /* + * Now allocate and initialize a cursor for the by-size tree. + */ + cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, + 0); + /* + * Have both left and right contiguous neighbors. + * Merge all three into a single free block. + */ + if (haveleft && haveright) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Delete the old by-block entry for the right block. + */ + if ((error = xfs_alloc_delete(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Move the by-block cursor back to the left neighbor. + */ + if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); +#ifdef DEBUG + /* + * Check that this is the right record: delete didn't + * mangle the cursor. + */ + { + xfs_agblock_t xxbno; + xfs_extlen_t xxlen; + + if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO( + i == 1 && xxbno == ltbno && xxlen == ltlen, + error0); + } +#endif + /* + * Update remaining by-block entry to the new, joined block. + */ + nbno = ltbno; + nlen = len + ltlen + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a left contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveleft) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Back up the by-block cursor to the left neighbor, and + * update its length. + */ + if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + nbno = ltbno; + nlen = len + ltlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a right contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveright) { + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Update the starting block and length of the right + * neighbor in the by-block tree. + */ + nbno = bno; + nlen = len + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * No contiguous neighbors. + * Insert the new freespace into the by-block tree. + */ + else { + nbno = bno; + nlen = len; + if ((error = xfs_alloc_insert(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + bno_cur = NULL; + /* + * In all cases we need to insert the new freespace in the by-size tree. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + if ((error = xfs_alloc_insert(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + cnt_cur = NULL; + /* + * Update the freespace totals in the ag and superblock. + */ + { + xfs_agf_t *agf; + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + pag = &mp->m_perag[agno]; + INT_MOD(agf->agf_freeblks, ARCH_CONVERT, len); + xfs_trans_agblocks_delta(tp, len); + pag->pagf_freeblks += len; + XFS_WANT_CORRUPTED_GOTO( + INT_GET(agf->agf_freeblks, ARCH_CONVERT) + <= INT_GET(agf->agf_length, ARCH_CONVERT), + error0); + TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xs_freex); + XFS_STATS_ADD(xs_freeb, len); + } + TRACE_FREE(haveleft ? + (haveright ? "both" : "left") : + (haveright ? "right" : "none"), + agno, bno, len, isfl); + + /* + * Since blocks move to the free list without the coordination + * used in xfs_bmap_finish, we can't allow block to be available + * for reallocation and non-transaction writing (user data) + * until we know that the transaction that moved it to the free + * list is permanently on disk. We track the blocks by declaring + * these blocks as "busy"; the busy list is maintained on a per-ag + * basis and each transaction records which entries should be removed + * when the iclog commits to disk. If a busy block is allocated, + * the iclog is pushed up to the LSN that freed the block. + */ + xfs_alloc_mark_busy(tp, agno, bno, len); + return 0; + + error0: + TRACE_FREE("error", agno, bno, len, isfl); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Visible (exported) allocation/free functions. + * Some of these are used just by xfs_alloc_btree.c and this file. + */ + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_ag_maxlevels = level; +} + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + */ +STATIC int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *agbp; /* agf buffer pointer */ + xfs_agf_t *agf; /* a.g. freespace structure pointer */ + xfs_buf_t *agflbp;/* agfl buffer pointer */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t delta; /* new blocks needed in freelist */ + int error; /* error result code */ + xfs_extlen_t longest;/* longest extent in allocation group */ + xfs_mount_t *mp; /* file system mount point structure */ + xfs_extlen_t need; /* total blocks needed in freelist */ + xfs_perag_t *pag; /* per-ag information structure */ + xfs_alloc_arg_t targs; /* local allocation arguments */ + xfs_trans_t *tp; /* transaction pointer */ + + mp = args->mp; + + pag = args->pag; + tp = args->tp; + if (!pag->pagf_init) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (!pag->pagf_init) { + args->agbp = NULL; + return 0; + } + } else + agbp = NULL; + + /* If this is a metadata prefered pag and we are user data + * then try somewhere else if we are not being asked to + * try harder at this point + */ + if (pag->pagf_metadata && args->userdata && flags) { + args->agbp = NULL; + return 0; + } + + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + if (args->minlen + args->alignment + args->minalignslop - 1 > longest || + (args->minleft && + (int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < + (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + /* + * Get the a.g. freespace buffer. + * Can fail if we're not blocking on locks, and it's held. + */ + if (agbp == NULL) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (agbp == NULL) { + args->agbp = NULL; + return 0; + } + } + /* + * Figure out how many blocks we should have in the freelist. + */ + agf = XFS_BUF_TO_AGF(agbp); + need = XFS_MIN_FREELIST(agf, mp); + delta = need > INT_GET(agf->agf_flcount, ARCH_CONVERT) ? + (need - INT_GET(agf->agf_flcount, ARCH_CONVERT)) : 0; + /* + * If there isn't enough total or single-extent, reject it. + */ + longest = INT_GET(agf->agf_longest, ARCH_CONVERT); + longest = (longest > delta) ? (longest - delta) : + (INT_GET(agf->agf_flcount, ARCH_CONVERT) > 0 || longest > 0); + if (args->minlen + args->alignment + args->minalignslop - 1 > longest || + (args->minleft && + (int)(INT_GET(agf->agf_freeblks, ARCH_CONVERT) + + INT_GET(agf->agf_flcount, ARCH_CONVERT) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + /* + * Make the freelist shorter if it's too long. + */ + while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) { + xfs_buf_t *bp; + + if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) + return error; + if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) + return error; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + xfs_trans_binval(tp, bp); + } + /* + * Initialize the args structure. + */ + targs.tp = tp; + targs.mp = mp; + targs.agbp = agbp; + targs.agno = args->agno; + targs.mod = targs.minleft = targs.wasdel = targs.userdata = + targs.minalignslop = 0; + targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.type = XFS_ALLOCTYPE_THIS_AG; + targs.pag = pag; + if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) + return error; + /* + * Make the freelist longer if it's too short. + */ + while (INT_GET(agf->agf_flcount, ARCH_CONVERT) < need) { + targs.agbno = 0; + targs.maxlen = need - INT_GET(agf->agf_flcount, ARCH_CONVERT); + /* + * Allocate as many blocks as possible at once. + */ + if ((error = xfs_alloc_ag_vextent(&targs))) + return error; + /* + * Stop if we run out. Won't happen if callers are obeying + * the restrictions correctly. Can happen for free calls + * on a completely full ag. + */ + if (targs.agbno == NULLAGBLOCK) + break; + /* + * Put each allocated block on the list. + */ + for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { + if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, + bno))) + return error; + } + } + args->agbp = agbp; + return 0; +} + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop) /* block address retrieved from freelist */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. freelist structure */ + xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ + xfs_agblock_t bno; /* block number returned */ + int error; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_get_freelist"; +#endif + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + /* + * Freelist is empty, give up. + */ + if (!agf->agf_flcount) { + *bnop = NULLAGBLOCK; + return 0; + } + /* + * Read the array of free blocks. + */ + mp = tp->t_mountp; + if ((error = xfs_alloc_read_agfl(mp, tp, + INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + /* + * Get the block number and update the data structures. + */ + bno = INT_GET(agfl->agfl_bno[INT_GET(agf->agf_flfirst, ARCH_CONVERT)], ARCH_CONVERT); + INT_MOD(agf->agf_flfirst, ARCH_CONVERT, 1); + xfs_trans_brelse(tp, agflbp); + if (INT_GET(agf->agf_flfirst, ARCH_CONVERT) == XFS_AGFL_SIZE(mp)) + agf->agf_flfirst = 0; + pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)]; + INT_MOD(agf->agf_flcount, ARCH_CONVERT, -1); + xfs_trans_agflist_delta(tp, -1); + pag->pagf_flcount--; + TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + *bnop = bno; + + /* + * As blocks are freed, they are added to the per-ag busy list + * and remain there until the freeing transaction is committed to + * disk. Now that we have allocated blocks, this list must be + * searched to see if a block is being reused. If one is, then + * the freeing transaction must be pushed to disk NOW by forcing + * to disk all iclogs up that transaction's LSN. + */ + xfs_alloc_search_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1); + return 0; +} + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer for a.g. freelist header */ + int fields) /* mask of fields to be logged (XFS_AGF_...) */ +{ + int first; /* first byte offset */ + int last; /* last byte offset */ + static const short offsets[] = { + offsetof(xfs_agf_t, agf_magicnum), + offsetof(xfs_agf_t, agf_versionnum), + offsetof(xfs_agf_t, agf_seqno), + offsetof(xfs_agf_t, agf_length), + offsetof(xfs_agf_t, agf_roots[0]), + offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_flfirst), + offsetof(xfs_agf_t, agf_fllast), + offsetof(xfs_agf_t, agf_flcount), + offsetof(xfs_agf_t, agf_freeblks), + offsetof(xfs_agf_t, agf_longest), + sizeof(xfs_agf_t) + }; + + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); +} + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags) /* XFS_ALLOC_FLAGS_... */ +{ + xfs_buf_t *bp; + int error; + + if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_buf_t *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno) /* block being freed */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_agfl_t *agfl; /* a.g. free block array */ + xfs_agblock_t *blockp;/* pointer to array entry */ + int error; +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_put_freelist"; +#endif + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + + agf = XFS_BUF_TO_AGF(agbp); + mp = tp->t_mountp; + + if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, + INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp))) + return error; + agfl = XFS_BUF_TO_AGFL(agflbp); + INT_MOD(agf->agf_fllast, ARCH_CONVERT, 1); + if (INT_GET(agf->agf_fllast, ARCH_CONVERT) == XFS_AGFL_SIZE(mp)) + agf->agf_fllast = 0; + pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)]; + INT_MOD(agf->agf_flcount, ARCH_CONVERT, 1); + xfs_trans_agflist_delta(tp, 1); + pag->pagf_flcount++; + ASSERT(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE(mp)); + blockp = &agfl->agfl_bno[INT_GET(agf->agf_fllast, ARCH_CONVERT)]; + INT_SET(*blockp, ARCH_CONVERT, bno); + TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); + xfs_trans_log_buf(tp, agflbp, + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), + (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + + sizeof(xfs_agblock_t) - 1)); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + xfs_mount_t *mp, /* mount point structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + xfs_buf_t **bpp) /* buffer for the ag freelist header */ +{ + xfs_agf_t *agf; /* ag freelist header */ + int agf_ok; /* set if agf is consistent */ + xfs_buf_t *bp; /* return value */ + xfs_perag_t *pag; /* per allocation group data */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U, + &bp); + if (error) + return error; + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (!bp) { + *bpp = NULL; + return 0; + } + /* + * Validate the magic number of the agf block. + */ + agf = XFS_BUF_TO_AGF(bp); + agf_ok = + INT_GET(agf->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC && + XFS_AGF_GOOD_VERSION( + INT_GET(agf->agf_versionnum, ARCH_CONVERT)) && + INT_GET(agf->agf_freeblks, ARCH_CONVERT) <= + INT_GET(agf->agf_length, ARCH_CONVERT) && + INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE(mp) && + INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE(mp) && + INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE(mp); + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", + XFS_ERRLEVEL_LOW, mp, agf); + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + pag = &mp->m_perag[agno]; + if (!pag->pagf_init) { + pag->pagf_freeblks = INT_GET(agf->agf_freeblks, ARCH_CONVERT); + pag->pagf_flcount = INT_GET(agf->agf_flcount, ARCH_CONVERT); + pag->pagf_longest = INT_GET(agf->agf_longest, ARCH_CONVERT); + pag->pagf_levels[XFS_BTNUM_BNOi] = + INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT); + pag->pagf_levels[XFS_BTNUM_CNTi] = + INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT); + spinlock_init(&pag->pagb_lock, "xfspagb"); + pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * + sizeof(xfs_perag_busy_t), KM_SLEEP); + pag->pagf_init = 1; + } +#ifdef DEBUG + else if (!XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(pag->pagf_freeblks == INT_GET(agf->agf_freeblks, ARCH_CONVERT)); + ASSERT(pag->pagf_flcount == INT_GET(agf->agf_flcount, ARCH_CONVERT)); + ASSERT(pag->pagf_longest == INT_GET(agf->agf_longest, ARCH_CONVERT)); + ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == + INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT)); + ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == + INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT)); + } +#endif + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF); + *bpp = bp; + return 0; +} + +/* + * Allocate an extent (variable-size). + * Depending on the allocation type, we either look in a single allocation + * group or loop over the allocation groups to find the result. + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agblock_t agsize; /* allocation group size */ + int error; + int flags; /* XFS_ALLOC_FLAG_... locking flags */ +#ifdef XFS_ALLOC_TRACE + static char fname[] = "xfs_alloc_vextent"; +#endif + xfs_extlen_t minleft;/* minimum left value, temp copy */ + xfs_mount_t *mp; /* mount structure pointer */ + xfs_agnumber_t sagno; /* starting allocation group number */ + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + int no_min = 0; + xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure + * that out (xfs_bmap_alloc). + */ + agsize = mp->m_sb.sb_agblocks; + if (args->maxlen > agsize) + args->maxlen = agsize; + if (args->alignment == 0) + args->alignment = 1; + ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->minlen <= agsize); + ASSERT(args->mod < args->prod); + if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + args->minlen > args->maxlen || args->minlen > agsize || + args->mod >= args->prod) { + args->fsbno = NULLFSBLOCK; + TRACE_ALLOC("badargs", args); + return 0; + } + minleft = args->minleft; + + switch (type) { + case XFS_ALLOCTYPE_THIS_AG: + case XFS_ALLOCTYPE_NEAR_BNO: + case XFS_ALLOCTYPE_THIS_BNO: + /* + * These three force us into a single a.g. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + down_read(&mp->m_peraglock); + args->pag = &mp->m_perag[args->agno]; + args->minleft = 0; + error = xfs_alloc_fix_freelist(args, 0); + args->minleft = minleft; + if (error) { + TRACE_ALLOC("nofix", args); + goto error0; + } + if (!args->agbp) { + up_read(&mp->m_peraglock); + TRACE_ALLOC("noagbp", args); + break; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + up_read(&mp->m_peraglock); + break; + case XFS_ALLOCTYPE_START_BNO: + /* + * Try near allocation first, then anywhere-in-ag after + * the first a.g. fails. + */ + if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + (mp->m_flags & XFS_MOUNT_32BITINODES)) { + args->fsbno = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + /* FALLTHROUGH */ + case XFS_ALLOCTYPE_ANY_AG: + case XFS_ALLOCTYPE_START_AG: + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. + */ + if (type == XFS_ALLOCTYPE_ANY_AG) { + /* + * Start with the last place we left off. + */ + args->agno = sagno = (mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount; + args->type = XFS_ALLOCTYPE_THIS_AG; + flags = XFS_ALLOC_FLAG_TRYLOCK; + } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* + * Start with allocation group given by bno. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; + sagno = 0; + flags = 0; + } else { + if (type == XFS_ALLOCTYPE_START_AG) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * Start with the given allocation group. + */ + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. + */ + down_read(&mp->m_peraglock); + for (;;) { + args->pag = &mp->m_perag[args->agno]; + if (no_min) args->minleft = 0; + error = xfs_alloc_fix_freelist(args, flags); + args->minleft = minleft; + if (error) { + TRACE_ALLOC("nofix", args); + goto error0; + } + /* + * If we get a buffer back then the allocation will fly. + */ + if (args->agbp) { + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + } + TRACE_ALLOC("loopfailed", args); + /* + * Didn't work, figure out the next iteration. + */ + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; + if (++(args->agno) == mp->m_sb.sb_agcount) + args->agno = 0; + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. + */ + if (args->agno == sagno) { + if (no_min == 1) { + args->agbno = NULLAGBLOCK; + TRACE_ALLOC("allfailed", args); + break; + } + if (flags == 0) { + no_min = 1; + } else { + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + } + } + } + } + up_read(&mp->m_peraglock); + if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (args->agno == sagno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (args->agbno == NULLAGBLOCK) + args->fsbno = NULLFSBLOCK; + else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); +#ifdef DEBUG + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), + args->len); +#endif + } + return 0; +error0: + up_read(&mp->m_peraglock); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ +#ifdef DEBUG + xfs_agf_t *agf; /* a.g. freespace header */ +#endif + xfs_alloc_arg_t args; /* allocation argument structure */ + int error; + + ASSERT(len != 0); + args.tp = tp; + args.mp = tp->t_mountp; + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + ASSERT(args.agno < args.mp->m_sb.sb_agcount); + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + args.alignment = 1; + args.minlen = args.minleft = args.minalignslop = 0; + down_read(&args.mp->m_peraglock); + args.pag = &args.mp->m_perag[args.agno]; + if ((error = xfs_alloc_fix_freelist(&args, 0))) + goto error0; +#ifdef DEBUG + ASSERT(args.agbp != NULL); + agf = XFS_BUF_TO_AGF(args.agbp); + ASSERT(args.agbno + len <= INT_GET(agf->agf_length, ARCH_CONVERT)); +#endif + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, + len, 0); +error0: + up_read(&args.mp->m_peraglock); + return error; +} + + +/* + * AG Busy list management + * The busy list contains block ranges that have been freed but whose + * transacations have not yet hit disk. If any block listed in a busy + * list is reused, the transaction that freed it must be forced to disk + * before continuing to use the block. + * + * xfs_alloc_mark_busy - add to the per-ag busy list + * xfs_alloc_clear_busy - remove an item from the per-ag busy list + */ +void +xfs_alloc_mark_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + xfs_mount_t *mp; + xfs_perag_busy_t *bsy; + int n; + SPLDECL(s); + + mp = tp->t_mountp; + s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + + /* search pagb_list for an open slot */ + for (bsy = mp->m_perag[agno].pagb_list, n = 0; + n < XFS_PAGB_NUM_SLOTS; + bsy++, n++) { + if (bsy->busy_tp == NULL) { + break; + } + } + + if (n < XFS_PAGB_NUM_SLOTS) { + bsy = &mp->m_perag[agno].pagb_list[n]; + mp->m_perag[agno].pagb_count++; + TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp); + bsy->busy_start = bno; + bsy->busy_length = len; + bsy->busy_tp = tp; + xfs_trans_add_busy(tp, agno, n); + } else { + TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp); + /* + * The busy list is full! Since it is now not possible to + * track the free block, make this a synchronous transaction + * to insure that the block is not reused before this + * transaction commits. + */ + xfs_trans_set_sync(tp); + } + + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); +} + +void +xfs_alloc_clear_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + int idx) +{ + xfs_mount_t *mp; + xfs_perag_busy_t *list; + SPLDECL(s); + + mp = tp->t_mountp; + + s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + list = mp->m_perag[agno].pagb_list; + + ASSERT(idx < XFS_PAGB_NUM_SLOTS); + if (list[idx].busy_tp == tp) { + TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp); + list[idx].busy_tp = NULL; + mp->m_perag[agno].pagb_count--; + } else { + TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp); + } + + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); +} + + +/* + * returns non-zero if any of (agno,bno):len is in a busy list + */ +int +xfs_alloc_search_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + xfs_mount_t *mp; + xfs_perag_busy_t *bsy; + int n; + xfs_agblock_t uend, bend; + xfs_lsn_t lsn; + int cnt; + SPLDECL(s); + + mp = tp->t_mountp; + + s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); + cnt = mp->m_perag[agno].pagb_count; + + uend = bno + len - 1; + + /* search pagb_list for this slot, skipping open slots */ + for (bsy = mp->m_perag[agno].pagb_list, n = 0; + cnt; bsy++, n++) { + + /* + * (start1,length1) within (start2, length2) + */ + if (bsy->busy_tp != NULL) { + bend = bsy->busy_start + bsy->busy_length - 1; + if ((bno > bend) || + (uend < bsy->busy_start)) { + cnt--; + } else { + TRACE_BUSYSEARCH("xfs_alloc_search_busy", + "found1", agno, bno, len, n, + tp); + break; + } + } + } + + /* + * If a block was found, force the log through the LSN of the + * transaction that freed the block + */ + if (cnt) { + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); + lsn = bsy->busy_tp->t_commit_lsn; + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + } else { + TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); + n = -1; + mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + } + + return n; +} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h new file mode 100644 index 00000000000..72329c86351 --- /dev/null +++ b/fs/xfs/xfs_alloc.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ALLOC_H__ +#define __XFS_ALLOC_H__ + +struct xfs_buf; +struct xfs_mount; +struct xfs_perag; +struct xfs_trans; + +/* + * Freespace allocation types. Argument to xfs_alloc_[v]extent. + */ +typedef enum xfs_alloctype +{ + XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */ + XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */ + XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */ + XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */ + XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */ + XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */ + XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ +} xfs_alloctype_t; + +/* + * Flags for xfs_alloc_fix_freelist. + */ +#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ + +/* + * Argument structure for xfs_alloc routines. + * This is turned into a structure to avoid having 20 arguments passed + * down several levels of the stack. + */ +typedef struct xfs_alloc_arg { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for a.g. freelist header */ + struct xfs_perag *pag; /* per-ag struct for this agno */ + xfs_fsblock_t fsbno; /* file system block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agblock_t agbno; /* allocation group-relative block # */ + xfs_extlen_t minlen; /* minimum size of extent */ + xfs_extlen_t maxlen; /* maximum size of extent */ + xfs_extlen_t mod; /* mod value for extent size */ + xfs_extlen_t prod; /* prod value for extent size */ + xfs_extlen_t minleft; /* min blocks must be left after us */ + xfs_extlen_t total; /* total blocks needed in xaction */ + xfs_extlen_t alignment; /* align answer to multiple of this */ + xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_extlen_t len; /* output: actual size of extent */ + xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ + xfs_alloctype_t otype; /* original allocation type */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ + char isfl; /* set if is freelist blocks - !actg */ + char userdata; /* set if this is user data */ +} xfs_alloc_arg_t; + +/* + * Defines for userdata + */ +#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ + + +#ifdef __KERNEL__ + +#if defined(XFS_ALLOC_TRACE) +/* + * Allocation tracing buffer size. + */ +#define XFS_ALLOC_TRACE_SIZE 4096 +extern ktrace_t *xfs_alloc_trace_buf; + +/* + * Types for alloc tracing. + */ +#define XFS_ALLOC_KTRACE_ALLOC 1 +#define XFS_ALLOC_KTRACE_FREE 2 +#define XFS_ALLOC_KTRACE_MODAGF 3 +#define XFS_ALLOC_KTRACE_BUSY 4 +#define XFS_ALLOC_KTRACE_UNBUSY 5 +#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 +#endif + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop); /* block address retrieved from freelist */ + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* buffer for a.g. freelist header */ + int fields);/* mask of fields to be logged (XFS_AGF_...) */ + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags); /* XFS_ALLOC_FLAGS_... */ + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for a.g. freelist header */ + struct xfs_buf *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno); /* block being freed */ + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp); /* buffer for the ag freelist header */ + +/* + * Allocate an extent (variable-size). + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args); /* allocation argument structure */ + +/* + * Free an extent. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len); /* length of extent */ + +void +xfs_alloc_mark_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len); + +void +xfs_alloc_clear_busy(xfs_trans_t *tp, + xfs_agnumber_t ag, + int idx); + + +#endif /* __KERNEL__ */ + +#endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c new file mode 100644 index 00000000000..e0355a12d94 --- /dev/null +++ b/fs/xfs/xfs_alloc_btree.c @@ -0,0 +1,2204 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Free space allocation for XFS. + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" + +/* + * Prototypes for internal functions. + */ + +STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); +STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); +STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, + xfs_alloc_key_t *, xfs_btree_cur_t **, int *); +STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int); + +/* + * Internal functions. + */ + +/* + * Single level of the xfs_alloc_delete record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_alloc_delrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + xfs_agf_t *agf; /* allocation group freelist header */ + xfs_alloc_block_t *block; /* btree block record/key lives in */ + xfs_agblock_t bno; /* btree block number */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop index */ + xfs_alloc_key_t key; /* kp points here if block is level 0 */ + xfs_agblock_t lbno; /* left block's block number */ + xfs_buf_t *lbp; /* left block's buffer pointer */ + xfs_alloc_block_t *left; /* left btree block */ + xfs_alloc_key_t *lkp=NULL; /* left block key pointer */ + xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */ + int lrecs=0; /* number of records in left block */ + xfs_alloc_rec_t *lrp; /* left block record pointer */ + xfs_mount_t *mp; /* mount structure */ + int ptr; /* index in btree block for this rec */ + xfs_agblock_t rbno; /* right block's block number */ + xfs_buf_t *rbp; /* right block's buffer pointer */ + xfs_alloc_block_t *right; /* right btree block */ + xfs_alloc_key_t *rkp; /* right block key pointer */ + xfs_alloc_ptr_t *rpp; /* right block address pointer */ + int rrecs=0; /* number of records in right block */ + xfs_alloc_rec_t *rrp; /* right block record pointer */ + xfs_btree_cur_t *tcur; /* temporary btree cursor */ + + /* + * Get the index of the entry being deleted, check for nothing there. + */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + *stat = 0; + return 0; + } + /* + * Get the buffer & block containing the record or key/ptr. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + /* + * Fail if we're off the end of the block. + */ + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + *stat = 0; + return 0; + } + XFS_STATS_INC(xs_abt_delrec); + /* + * It's a nonleaf. Excise the key and ptr being deleted, by + * sliding the entries past them down one. + * Log the changed areas of the block. + */ + if (level > 0) { + lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); + lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); +#ifdef DEBUG + for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + memmove(&lkp[ptr - 1], &lkp[ptr], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lkp)); /* INT_: mem copy */ + memmove(&lpp[ptr - 1], &lpp[ptr], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lpp)); /* INT_: mem copy */ + xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + } + } + /* + * It's a leaf. Excise the record being deleted, by sliding the + * entries past it down one. Log the changed areas of the block. + */ + else { + lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + memmove(&lrp[ptr - 1], &lrp[ptr], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lrp)); + xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + } + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + key.ar_startblock = lrp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = lrp->ar_blockcount; /* INT_: direct copy */ + lkp = &key; + } + } + /* + * Decrement and log the number of entries in the block. + */ + INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1); + xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); + /* + * See if the longest free extent in the allocation group was + * changed by this operation. True if it's the by-size btree, and + * this is the leaf level, and there is no right sibling block, + * and this was the last record. + */ + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + mp = cur->bc_mp; + + if (level == 0 && + cur->bc_btnum == XFS_BTNUM_CNT && + INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK && + ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + ASSERT(ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT) + 1); + /* + * There are still records in the block. Grab the size + * from the last one. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + rrp = XFS_ALLOC_REC_ADDR(block, INT_GET(block->bb_numrecs, ARCH_CONVERT), cur); + INT_COPY(agf->agf_longest, rrp->ar_blockcount, ARCH_CONVERT); + } + /* + * No free extents left. + */ + else + agf->agf_longest = 0; + mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest = + INT_GET(agf->agf_longest, ARCH_CONVERT); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_LONGEST); + } + /* + * Is this the root level? If so, we're almost done. + */ + if (level == cur->bc_nlevels - 1) { + /* + * If this is the root level, + * and there's only one entry left, + * and it's NOT the leaf level, + * then we can get rid of this level. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) { + /* + * lpp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + bno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT); + INT_COPY(agf->agf_roots[cur->bc_btnum], *lpp, ARCH_CONVERT); + INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, -1); + mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_levels[cur->bc_btnum]--; + /* + * Put this buffer/block on the ag's freelist. + */ + if ((error = xfs_alloc_put_freelist(cur->bc_tp, + cur->bc_private.a.agbp, NULL, bno))) + return error; + /* + * Since blocks move to the free list without the + * coordination used in xfs_bmap_finish, we can't allow + * block to be available for reallocation and + * non-transaction writing (user data) until we know + * that the transaction that moved it to the free list + * is permanently on disk. We track the blocks by + * declaring these blocks as "busy"; the busy list is + * maintained on a per-ag basis and each transaction + * records which entries should be removed when the + * iclog commits to disk. If a busy block is + * allocated, the iclog is pushed up to the LSN + * that freed the block. + */ + xfs_alloc_mark_busy(cur->bc_tp, + INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1); + + xfs_trans_agbtree_delta(cur->bc_tp, -1); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_ROOTS | XFS_AGF_LEVELS); + /* + * Update the cursor so there's one fewer level. + */ + xfs_btree_setbuf(cur, level, NULL); + cur->bc_nlevels--; + } else if (level > 0 && + (error = xfs_alloc_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1))) + return error; + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT); + bno = NULLAGBLOCK; + ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + return error; + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (rbno != NULLAGBLOCK) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_increment(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Grab a pointer to the block. + */ + rbp = tcur->bc_bufs[level]; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + goto error0; +#endif + /* + * Grab the current block number, for future use. + */ + bno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if ((error = xfs_alloc_lshift(tcur, level, &i))) + goto error0; + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)); + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + if (level > 0 && + (error = xfs_alloc_decrement(cur, level, + &i))) + return error; + *stat = 1; + return 0; + } + } + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (lbno != NULLAGBLOCK) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_decrement(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (lbno != NULLAGBLOCK) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_decrement(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_btree_firstrec(tcur, level); + /* + * Grab a pointer to the block. + */ + lbp = tcur->bc_bufs[level]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + goto error0; +#endif + /* + * Grab the current block number, for future use. + */ + bno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if ((error = xfs_alloc_rshift(tcur, level, &i))) + goto error0; + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_ALLOC_BLOCK_MINRECS(level, cur)); + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + if (level == 0) + cur->bc_ptrs[0]++; + *stat = 1; + return 0; + } + } + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * Delete the temp cursor, we're done with it. + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + /* + * If here, we need to do a join to keep the tree balanced. + */ + ASSERT(bno != NULLAGBLOCK); + /* + * See if we can join with the left neighbor block. + */ + if (lbno != NULLAGBLOCK && + lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rbno = bno; + right = block; + rbp = bp; + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, lbno, 0, &lbp, + XFS_ALLOC_BTREE_REF))) + return error; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; + } + /* + * If that won't work, see if we can join with the right neighbor block. + */ + else if (rbno != NULLAGBLOCK && + rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= + XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lbno = bno; + left = block; + lbp = bp; + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, rbno, 0, &rbp, + XFS_ALLOC_BTREE_REF))) + return error; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; + } + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + else { + if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + if (level > 0) { + /* + * It's a non-leaf. Move keys and pointers. + */ + lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + memcpy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); /* INT_: structure copy */ + memcpy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); /* INT_: structure copy */ + xfs_alloc_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, + INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, + INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } else { + /* + * It's a leaf. Move records. + */ + lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + memcpy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp)); + xfs_alloc_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, + INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + xfs_btree_setbuf(cur, level, lbp); + cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if (level + 1 < cur->bc_nlevels && + (error = xfs_alloc_increment(cur, level + 1, &i))) + return error; + /* + * Fix up the number of records in the surviving block. + */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + /* + * Fix up the right block pointer in the surviving block, and log it. + */ + left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */ + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + /* + * If there is a right sibling now, make it point to the + * remaining block. + */ + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_alloc_block_t *rrblock; + xfs_buf_t *rrbp; + + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, + &rrbp, XFS_ALLOC_BTREE_REF))) + return error; + rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) + return error; + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno); + xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); + } + /* + * Free the deleting block by putting it on the freelist. + */ + if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp, + NULL, rbno))) + return error; + /* + * Since blocks move to the free list without the coordination + * used in xfs_bmap_finish, we can't allow block to be available + * for reallocation and non-transaction writing (user data) + * until we know that the transaction that moved it to the free + * list is permanently on disk. We track the blocks by declaring + * these blocks as "busy"; the busy list is maintained on a + * per-ag basis and each transaction records which entries + * should be removed when the iclog commits to disk. If a + * busy block is allocated, the iclog is pushed up to the + * LSN that freed the block. + */ + xfs_alloc_mark_busy(cur->bc_tp, + INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1); + + xfs_trans_agbtree_delta(cur->bc_tp, -1); + /* + * Adjust the current level's cursor so that we're left referring + * to the right node, after we're done. + * If this leaves the ptr value 0 our caller will fix it up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + /* + * Return value means the next level up has something to do. + */ + *stat = 2; + return 0; + +error0: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int /* error */ +xfs_alloc_insrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to insert record at */ + xfs_agblock_t *bnop, /* i/o: block number inserted */ + xfs_alloc_rec_t *recp, /* i/o: record data inserted */ + xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ + int *stat) /* output: success/failure */ +{ + xfs_agf_t *agf; /* allocation group freelist header */ + xfs_alloc_block_t *block; /* btree block record/key lives in */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop index */ + xfs_alloc_key_t key; /* key value being inserted */ + xfs_alloc_key_t *kp; /* pointer to btree keys */ + xfs_agblock_t nbno; /* block number of allocated block */ + xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ + xfs_alloc_key_t nkey; /* new key value, from split */ + xfs_alloc_rec_t nrec; /* new record value, for caller */ + int optr; /* old ptr value */ + xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ + int ptr; /* index in btree block for this rec */ + xfs_alloc_rec_t *rp; /* pointer to btree records */ + + ASSERT(INT_GET(recp->ar_blockcount, ARCH_CONVERT) > 0); + /* + * If we made it to the root level, allocate a new root block + * and we're done. + */ + if (level >= cur->bc_nlevels) { + XFS_STATS_INC(xs_abt_insrec); + if ((error = xfs_alloc_newroot(cur, &i))) + return error; + *bnop = NULLAGBLOCK; + *stat = i; + return 0; + } + /* + * Make a key out of the record data to be inserted, and save it. + */ + key.ar_startblock = recp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = recp->ar_blockcount; /* INT_: direct copy */ + optr = ptr = cur->bc_ptrs[level]; + /* + * If we're off the left edge, return failure. + */ + if (ptr == 0) { + *stat = 0; + return 0; + } + XFS_STATS_INC(xs_abt_insrec); + /* + * Get pointers to the btree buffer and block. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; + /* + * Check that the new entry is being inserted in the right place. + */ + if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + if (level == 0) { + rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); + xfs_btree_check_rec(cur->bc_btnum, recp, rp); + } else { + kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); + xfs_btree_check_key(cur->bc_btnum, &key, kp); + } + } +#endif + nbno = NULLAGBLOCK; + ncur = (xfs_btree_cur_t *)0; + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + /* + * First, try shifting an entry to the right neighbor. + */ + if ((error = xfs_alloc_rshift(cur, level, &i))) + return error; + if (i) { + /* nothing */ + } + /* + * Next, try shifting an entry to the left neighbor. + */ + else { + if ((error = xfs_alloc_lshift(cur, level, &i))) + return error; + if (i) + optr = ptr = cur->bc_ptrs[level]; + else { + /* + * Next, try splitting the current block in + * half. If this works we have to re-set our + * variables because we could be in a + * different block now. + */ + if ((error = xfs_alloc_split(cur, level, &nbno, + &nkey, &ncur, &i))) + return error; + if (i) { + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = + xfs_btree_check_sblock(cur, + block, level, bp))) + return error; +#endif + ptr = cur->bc_ptrs[level]; + nrec.ar_startblock = nkey.ar_startblock; /* INT_: direct copy */ + nrec.ar_blockcount = nkey.ar_blockcount; /* INT_: direct copy */ + } + /* + * Otherwise the insert fails. + */ + else { + *stat = 0; + return 0; + } + } + } + } + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + if (level > 0) { + /* + * It's a non-leaf entry. Make a hole for the new data + * in the key and ptr regions of the block. + */ + kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); + pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); +#ifdef DEBUG + for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level))) + return error; + } +#endif + memmove(&kp[ptr], &kp[ptr - 1], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); /* INT_: copy */ + memmove(&pp[ptr], &pp[ptr - 1], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); /* INT_: copy */ +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, *bnop, level))) + return error; +#endif + /* + * Now stuff the new data in, bump numrecs and log the new data. + */ + kp[ptr - 1] = key; + INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); +#ifdef DEBUG + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) + xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, + kp + ptr); +#endif + } else { + /* + * It's a leaf entry. Make a hole for the new record. + */ + rp = XFS_ALLOC_REC_ADDR(block, 1, cur); + memmove(&rp[ptr], &rp[ptr - 1], + (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp)); + /* + * Now stuff the new record in, bump numrecs + * and log the new data. + */ + rp[ptr - 1] = *recp; /* INT_: struct copy */ + INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); +#ifdef DEBUG + if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) + xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, + rp + ptr); +#endif + } + /* + * Log the new number of records in the btree header. + */ + xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); + /* + * If we inserted at the start of a block, update the parents' keys. + */ + if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1))) + return error; + /* + * Look to see if the longest extent in the allocation group + * needs to be updated. + */ + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + if (level == 0 && + cur->bc_btnum == XFS_BTNUM_CNT && + INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK && + INT_GET(recp->ar_blockcount, ARCH_CONVERT) > INT_GET(agf->agf_longest, ARCH_CONVERT)) { + /* + * If this is a leaf in the by-size btree and there + * is no right sibling block and this block is bigger + * than the previous longest block, update it. + */ + INT_COPY(agf->agf_longest, recp->ar_blockcount, ARCH_CONVERT); + cur->bc_mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest + = INT_GET(recp->ar_blockcount, ARCH_CONVERT); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_LONGEST); + } + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *bnop = nbno; + if (nbno != NULLAGBLOCK) { + *recp = nrec; /* INT_: struct copy */ + *curp = ncur; /* INT_: struct copy */ + } + *stat = 1; + return 0; +} + +/* + * Log header fields from a btree block. + */ +STATIC void +xfs_alloc_log_block( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short offsets[] = { /* table of offsets */ + offsetof(xfs_alloc_block_t, bb_magic), + offsetof(xfs_alloc_block_t, bb_level), + offsetof(xfs_alloc_block_t, bb_numrecs), + offsetof(xfs_alloc_block_t, bb_leftsib), + offsetof(xfs_alloc_block_t, bb_rightsib), + sizeof(xfs_alloc_block_t) + }; + + xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, first, last); +} + +/* + * Log keys from a btree block (nonleaf). + */ +STATIC void +xfs_alloc_log_keys( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int kfirst, /* index of first key to log */ + int klast) /* index of last key to log */ +{ + xfs_alloc_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + xfs_alloc_key_t *kp; /* key pointer in btree block */ + int last; /* last byte offset logged */ + + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Log block pointer fields from a btree block (nonleaf). + */ +STATIC void +xfs_alloc_log_ptrs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int pfirst, /* index of first pointer to log */ + int plast) /* index of last pointer to log */ +{ + xfs_alloc_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */ + + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Log records from a btree block (leaf). + */ +STATIC void +xfs_alloc_log_recs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int rfirst, /* index of first record to log */ + int rlast) /* index of last record to log */ +{ + xfs_alloc_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + xfs_alloc_rec_t *rp; /* record pointer for btree block */ + + + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + rp = XFS_ALLOC_REC_ADDR(block, 1, cur); +#ifdef DEBUG + { + xfs_agf_t *agf; + xfs_alloc_rec_t *p; + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++) + ASSERT(INT_GET(p->ar_startblock, ARCH_CONVERT) + INT_GET(p->ar_blockcount, ARCH_CONVERT) <= + INT_GET(agf->agf_length, ARCH_CONVERT)); + } +#endif + first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * Return 0 if can't find any such record, 1 for success. + */ +STATIC int /* error */ +xfs_alloc_lookup( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + xfs_agblock_t agbno; /* a.g. relative btree block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_alloc_block_t *block=NULL; /* current btree block */ + int diff; /* difference for the current key */ + int error; /* error return value */ + int keyno=0; /* current key number */ + int level; /* level in the btree */ + xfs_mount_t *mp; /* file system mount point */ + + XFS_STATS_INC(xs_abt_lookup); + /* + * Get the allocation group header, and the root block number. + */ + mp = cur->bc_mp; + + { + xfs_agf_t *agf; /* a.g. freespace header */ + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + agno = INT_GET(agf->agf_seqno, ARCH_CONVERT); + agbno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT); + } + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + xfs_buf_t *bp; /* buffer pointer for btree block */ + xfs_daddr_t d; /* disk address of btree block */ + + /* + * Get the disk address we're looking for. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + /* + * If the old buffer at this level is for a different block, + * throw it away, otherwise just use it. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) != d) + bp = (xfs_buf_t *)0; + if (!bp) { + /* + * Need to get a new buffer. Read it, then + * set it in the cursor, releasing the old one. + */ + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno, + agbno, 0, &bp, XFS_ALLOC_BTREE_REF))) + return error; + xfs_btree_setbuf(cur, level, bp); + /* + * Point to the btree block, now that we have the buffer + */ + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, level, + bp))) + return error; + } else + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + /* + * If we already had a key match at a higher level, we know + * we need to use the first entry in this block. + */ + if (diff == 0) + keyno = 1; + /* + * Otherwise we need to search this block. Do a binary search. + */ + else { + int high; /* high entry number */ + xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */ + xfs_alloc_rec_t *krbase=NULL;/* base of records in block */ + int low; /* low entry number */ + + /* + * Get a pointer to keys or records. + */ + if (level > 0) + kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur); + else + krbase = XFS_ALLOC_REC_ADDR(block, 1, cur); + /* + * Set low and high entry numbers, 1-based. + */ + low = 1; + if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) { + /* + * If the block is empty, the tree must + * be an empty leaf. + */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + *stat = 0; + return 0; + } + /* + * Binary search the block. + */ + while (low <= high) { + xfs_extlen_t blockcount; /* key value */ + xfs_agblock_t startblock; /* key value */ + + XFS_STATS_INC(xs_abt_compare); + /* + * keyno is average of low and high. + */ + keyno = (low + high) >> 1; + /* + * Get startblock & blockcount. + */ + if (level > 0) { + xfs_alloc_key_t *kkp; + + kkp = kkbase + keyno - 1; + startblock = INT_GET(kkp->ar_startblock, ARCH_CONVERT); + blockcount = INT_GET(kkp->ar_blockcount, ARCH_CONVERT); + } else { + xfs_alloc_rec_t *krp; + + krp = krbase + keyno - 1; + startblock = INT_GET(krp->ar_startblock, ARCH_CONVERT); + blockcount = INT_GET(krp->ar_blockcount, ARCH_CONVERT); + } + /* + * Compute difference to get next direction. + */ + if (cur->bc_btnum == XFS_BTNUM_BNO) + diff = (int)startblock - + (int)cur->bc_rec.a.ar_startblock; + else if (!(diff = (int)blockcount - + (int)cur->bc_rec.a.ar_blockcount)) + diff = (int)startblock - + (int)cur->bc_rec.a.ar_startblock; + /* + * Less than, move right. + */ + if (diff < 0) + low = keyno + 1; + /* + * Greater than, move left. + */ + else if (diff > 0) + high = keyno - 1; + /* + * Equal, we're done. + */ + else + break; + } + } + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, keyno, cur), ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, agbno, level))) + return error; +#endif + cur->bc_ptrs[level] = keyno; + } + } + /* + * Done with the search. + * See if we need to adjust the results. + */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + if (dir == XFS_LOOKUP_GE && + keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) && + INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + int i; + + cur->bc_ptrs[0] = keyno; + if ((error = xfs_alloc_increment(cur, 0, &i))) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + *stat = 1; + return 0; + } + } + else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + /* + * Return if we succeeded or not. + */ + if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) + *stat = 0; + else + *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); + return 0; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_alloc_lshift( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to shift record on */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef DEBUG + int i; /* loop index */ +#endif + xfs_alloc_key_t key; /* key value for leaf level upward */ + xfs_buf_t *lbp; /* buffer for left neighbor block */ + xfs_alloc_block_t *left; /* left neighbor btree block */ + int nrec; /* new number of left block entries */ + xfs_buf_t *rbp; /* buffer for right (current) block */ + xfs_alloc_block_t *right; /* right (current) btree block */ + xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */ + xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */ + xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */ + + /* + * Set up variables for this block as "right". + */ + rbp = cur->bc_bufs[level]; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; +#endif + /* + * If we've got no left sibling then we can't shift an entry left. + */ + if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) { + *stat = 0; + return 0; + } + /* + * Set up the left neighbor as "left". + */ + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp, + XFS_ALLOC_BTREE_REF))) + return error; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; + /* + * If it's full, it can't take another entry. + */ + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + *stat = 0; + return 0; + } + nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1; + /* + * If non-leaf, copy a key and a ptr to the left block. + */ + if (level > 0) { + xfs_alloc_key_t *lkp; /* key pointer for left block */ + xfs_alloc_ptr_t *lpp; /* address pointer for left block */ + + lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + *lkp = *rkp; + xfs_alloc_log_keys(cur, lbp, nrec, nrec); + lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) + return error; +#endif + *lpp = *rpp; /* INT_: copy */ + xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); + xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); + } + /* + * If leaf, copy a record to the left block. + */ + else { + xfs_alloc_rec_t *lrp; /* record pointer for left block */ + + lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + *lrp = *rrp; + xfs_alloc_log_recs(cur, lbp, nrec, nrec); + xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); + } + /* + * Bump and log left's numrecs, decrement and log right's numrecs. + */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1); + xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); + /* + * Slide the contents of right down one entry. + */ + if (level > 0) { +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + level))) + return error; + } +#endif + memmove(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memmove(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); + xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } else { + memmove(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */ + rkp = &key; + } + /* + * Update the parent key values of right. + */ + if ((error = xfs_alloc_updkey(cur, rkp, level + 1))) + return error; + /* + * Slide the cursor value left one. + */ + cur->bc_ptrs[level]--; + *stat = 1; + return 0; +} + +/* + * Allocate a new root block, fill it in. + */ +STATIC int /* error */ +xfs_alloc_newroot( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + xfs_agblock_t lbno; /* left block number */ + xfs_buf_t *lbp; /* left btree buffer */ + xfs_alloc_block_t *left; /* left btree block */ + xfs_mount_t *mp; /* mount structure */ + xfs_agblock_t nbno; /* new block number */ + xfs_buf_t *nbp; /* new (root) buffer */ + xfs_alloc_block_t *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + xfs_agblock_t rbno; /* right block number */ + xfs_buf_t *rbp; /* right btree buffer */ + xfs_alloc_block_t *right; /* right btree block */ + + mp = cur->bc_mp; + + ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp)); + /* + * Get a buffer from the freelist blocks, for the new root. + */ + if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &nbno))) + return error; + /* + * None available, we fail. + */ + if (nbno == NULLAGBLOCK) { + *stat = 0; + return 0; + } + xfs_trans_agbtree_delta(cur->bc_tp, 1); + nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno, + 0); + new = XFS_BUF_TO_ALLOC_BLOCK(nbp); + /* + * Set the root data in the a.g. freespace structure. + */ + { + xfs_agf_t *agf; /* a.g. freespace header */ + xfs_agnumber_t seqno; + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + INT_SET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT, nbno); + INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, 1); + seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT); + mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++; + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_ROOTS | XFS_AGF_LEVELS); + } + /* + * At the previous root level there are now two blocks: the old + * root, and the new block generated when it was split. + * We don't know which one the cursor is pointing at, so we + * set up variables "left" and "right" for each case. + */ + lbp = cur->bc_bufs[cur->bc_nlevels - 1]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp))) + return error; +#endif + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + /* + * Our block is left, pick up the right block. + */ + lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp)); + rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, rbno, 0, &rbp, + XFS_ALLOC_BTREE_REF))) + return error; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, + cur->bc_nlevels - 1, rbp))) + return error; + nptr = 1; + } else { + /* + * Our block is right, pick up the left block. + */ + rbp = lbp; + right = left; + rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp)); + lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.a.agno, lbno, 0, &lbp, + XFS_ALLOC_BTREE_REF))) + return error; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, + cur->bc_nlevels - 1, lbp))) + return error; + nptr = 2; + } + /* + * Fill in the new block's btree header and log it. + */ + INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]); + INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels); + INT_SET(new->bb_numrecs, ARCH_CONVERT, 2); + INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK); + INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK); + xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS); + ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); + /* + * Fill in the key data in the new root. + */ + { + xfs_alloc_key_t *kp; /* btree key pointer */ + + kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); + if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) { + kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */ + kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */ + } else { + xfs_alloc_rec_t *rp; /* btree record pointer */ + + rp = XFS_ALLOC_REC_ADDR(left, 1, cur); + kp[0].ar_startblock = rp->ar_startblock; /* INT_: direct copy */ + kp[0].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */ + rp = XFS_ALLOC_REC_ADDR(right, 1, cur); + kp[1].ar_startblock = rp->ar_startblock; /* INT_: direct copy */ + kp[1].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */ + } + } + xfs_alloc_log_keys(cur, nbp, 1, 2); + /* + * Fill in the pointer data in the new root. + */ + { + xfs_alloc_ptr_t *pp; /* btree address pointer */ + + pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); + INT_SET(pp[0], ARCH_CONVERT, lbno); + INT_SET(pp[1], ARCH_CONVERT, rbno); + } + xfs_alloc_log_ptrs(cur, nbp, 1, 2); + /* + * Fix up the cursor. + */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + *stat = 1; + return 0; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_alloc_rshift( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to shift record on */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* loop index */ + xfs_alloc_key_t key; /* key value for leaf level upward */ + xfs_buf_t *lbp; /* buffer for left (current) block */ + xfs_alloc_block_t *left; /* left (current) btree block */ + xfs_buf_t *rbp; /* buffer for right neighbor block */ + xfs_alloc_block_t *right; /* right neighbor btree block */ + xfs_alloc_key_t *rkp; /* key pointer for right block */ + xfs_btree_cur_t *tcur; /* temporary cursor */ + + /* + * Set up variables for this block as "left". + */ + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; +#endif + /* + * If we've got no right sibling then we can't shift an entry right. + */ + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) { + *stat = 0; + return 0; + } + /* + * Set up the right neighbor as "right". + */ + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp, + XFS_ALLOC_BTREE_REF))) + return error; + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; + /* + * If it's full, it can't take another entry. + */ + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + *stat = 0; + return 0; + } + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + xfs_alloc_key_t *lkp; /* key pointer for left block */ + xfs_alloc_ptr_t *lpp; /* address pointer for left block */ + xfs_alloc_ptr_t *rpp; /* address pointer for right block */ + + lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) + return error; +#endif + *rkp = *lkp; /* INT_: copy */ + *rpp = *lpp; /* INT_: copy */ + xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); + } else { + xfs_alloc_rec_t *lrp; /* record pointer for left block */ + xfs_alloc_rec_t *rrp; /* record pointer for right block */ + + lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + *rrp = *lrp; + xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */ + key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */ + rkp = &key; + xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); + } + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1); + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); + xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + return error; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_alloc_increment(tcur, level, &i)) || + (error = xfs_alloc_updkey(tcur, rkp, level + 1))) + goto error0; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + *stat = 1; + return 0; +error0: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and its first record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_alloc_split( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to split */ + xfs_agblock_t *bnop, /* output: block number allocated */ + xfs_alloc_key_t *keyp, /* output: first key of new block */ + xfs_btree_cur_t **curp, /* output: new cursor */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* loop index/record number */ + xfs_agblock_t lbno; /* left (current) block number */ + xfs_buf_t *lbp; /* buffer for left block */ + xfs_alloc_block_t *left; /* left (current) btree block */ + xfs_agblock_t rbno; /* right (new) block number */ + xfs_buf_t *rbp; /* buffer for right block */ + xfs_alloc_block_t *right; /* right (new) btree block */ + + /* + * Allocate the new block from the freelist. + * If we can't do it, we're toast. Give up. + */ + if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &rbno))) + return error; + if (rbno == NULLAGBLOCK) { + *stat = 0; + return 0; + } + xfs_trans_agbtree_delta(cur->bc_tp, 1); + rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno, + rbno, 0); + /* + * Set up the new block as "right". + */ + right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + /* + * "Left" is the current (according to the cursor) block. + */ + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_ALLOC_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; +#endif + /* + * Fill in the btree header for the new block. + */ + INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]); + right->bb_level = left->bb_level; /* INT_: direct copy */ + INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2)); + /* + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. + */ + if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) && + cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1) + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); + i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1; + /* + * For non-leaf blocks, copy keys and addresses over to the new block. + */ + if (level > 0) { + xfs_alloc_key_t *lkp; /* left btree key pointer */ + xfs_alloc_ptr_t *lpp; /* left btree address pointer */ + xfs_alloc_key_t *rkp; /* right btree key pointer */ + xfs_alloc_ptr_t *rpp; /* right btree address pointer */ + + lkp = XFS_ALLOC_KEY_ADDR(left, i, cur); + lpp = XFS_ALLOC_PTR_ADDR(left, i, cur); + rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); + rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); /* INT_: copy */ + memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); /* INT_: copy */ + xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + *keyp = *rkp; + } + /* + * For leaf blocks, copy records over to the new block. + */ + else { + xfs_alloc_rec_t *lrp; /* left btree record pointer */ + xfs_alloc_rec_t *rrp; /* right btree record pointer */ + + lrp = XFS_ALLOC_REC_ADDR(left, i, cur); + rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); + memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->ar_startblock = rrp->ar_startblock; /* INT_: direct copy */ + keyp->ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */ + } + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp)); + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT))); + right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */ + INT_SET(left->bb_rightsib, ARCH_CONVERT, rbno); + INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno); + xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS); + xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_alloc_block_t *rrblock; /* rr btree block */ + xfs_buf_t *rrbp; /* buffer for rrblock */ + + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, + &rrbp, XFS_ALLOC_BTREE_REF))) + return error; + rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) + return error; + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, rbno); + xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * If there are more levels, we'll need another cursor which refers to + * the right block, no matter where this cursor was. + */ + if (level + 1 < cur->bc_nlevels) { + if ((error = xfs_btree_dup_cursor(cur, curp))) + return error; + (*curp)->bc_ptrs[level + 1]++; + } + *bnop = rbno; + *stat = 1; + return 0; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int /* error */ +xfs_alloc_updkey( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_alloc_key_t *keyp, /* new key value to update to */ + int level) /* starting level for update */ +{ + int ptr; /* index of key in block */ + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. + */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { + xfs_alloc_block_t *block; /* btree block */ + xfs_buf_t *bp; /* buffer for block */ +#ifdef DEBUG + int error; /* error return value */ +#endif + xfs_alloc_key_t *kp; /* ptr to btree block keys */ + + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + ptr = cur->bc_ptrs[level]; + kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); + *kp = *keyp; + xfs_alloc_log_keys(cur, bp, ptr, ptr); + } + return 0; +} + +/* + * Externally visible routines. + */ + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_decrement( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat) /* success/failure */ +{ + xfs_alloc_block_t *block; /* btree block */ + int error; /* error return value */ + int lev; /* btree level */ + + ASSERT(level < cur->bc_nlevels); + /* + * Read-ahead to the left at this level. + */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + /* + * Decrement the ptr at this level. If we're still in the block + * then we're done. + */ + if (--cur->bc_ptrs[level] > 0) { + *stat = 1; + return 0; + } + /* + * Get a pointer to the btree block. + */ + block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, + cur->bc_bufs[level]))) + return error; +#endif + /* + * If we just went off the left edge of the tree, return failure. + */ + if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* + * Read-ahead the left block, we're going to read it + * in the next loop. + */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + /* + * If we went off the root then we are seriously confused. + */ + ASSERT(lev < cur->bc_nlevels); + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) { + xfs_agblock_t agbno; /* block number of btree block */ + xfs_buf_t *bp; /* buffer pointer for block */ + + agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, agbno, 0, &bp, + XFS_ALLOC_BTREE_REF))) + return error; + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; + cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT); + } + *stat = 1; + return 0; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_alloc_delete( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* result code */ + int level; /* btree level */ + + /* + * Go up the tree, starting at leaf level. + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + if ((error = xfs_alloc_delrec(cur, level, &i))) + return error; + } + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + if ((error = xfs_alloc_decrement(cur, level, &i))) + return error; + break; + } + } + } + *stat = i; + return 0; +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_alloc_get_rec( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + xfs_alloc_block_t *block; /* btree block */ +#ifdef DEBUG + int error; /* error return value */ +#endif + int ptr; /* record number */ + + ptr = cur->bc_ptrs[0]; + block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) + return error; +#endif + /* + * Off the right end or left end, return failure. + */ + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) { + *stat = 0; + return 0; + } + /* + * Point to the record and extract its data. + */ + { + xfs_alloc_rec_t *rec; /* record data */ + + rec = XFS_ALLOC_REC_ADDR(block, ptr, cur); + *bno = INT_GET(rec->ar_startblock, ARCH_CONVERT); + *len = INT_GET(rec->ar_blockcount, ARCH_CONVERT); + } + *stat = 1; + return 0; +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_increment( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat) /* success/failure */ +{ + xfs_alloc_block_t *block; /* btree block */ + xfs_buf_t *bp; /* tree block buffer */ + int error; /* error return value */ + int lev; /* btree level */ + + ASSERT(level < cur->bc_nlevels); + /* + * Read-ahead to the right at this level. + */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + /* + * Get a pointer to the btree block. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + /* + * Increment the ptr at this level. If we're still in the block + * then we're done. + */ + if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + *stat = 1; + return 0; + } + /* + * If we just went off the right edge of the tree, return failure. + */ + if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + bp = cur->bc_bufs[lev]; + block = XFS_BUF_TO_ALLOC_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; +#endif + if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) + break; + /* + * Read-ahead the right block, we're going to read it + * in the next loop. + */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + /* + * If we went off the root then we are seriously confused. + */ + ASSERT(lev < cur->bc_nlevels); + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp); + lev > level; ) { + xfs_agblock_t agbno; /* block number of btree block */ + + agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agno, agbno, 0, &bp, + XFS_ALLOC_BTREE_REF))) + return error; + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_ALLOC_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; + cur->bc_ptrs[lev] = 1; + } + *stat = 1; + return 0; +} + +/* + * Insert the current record at the point referenced by cur. + * The cursor may be inconsistent on return if splits have been done. + */ +int /* error */ +xfs_alloc_insert( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + xfs_agblock_t nbno; /* new block number (split result) */ + xfs_btree_cur_t *ncur; /* new cursor (split result) */ + xfs_alloc_rec_t nrec; /* record being inserted this level */ + xfs_btree_cur_t *pcur; /* previous level's cursor */ + + level = 0; + nbno = NULLAGBLOCK; + INT_SET(nrec.ar_startblock, ARCH_CONVERT, cur->bc_rec.a.ar_startblock); + INT_SET(nrec.ar_blockcount, ARCH_CONVERT, cur->bc_rec.a.ar_blockcount); + ncur = (xfs_btree_cur_t *)0; + pcur = cur; + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nbno into this level of the tree. + * Note if we fail, nbno will be null. + */ + if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur, + &i))) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + return error; + } + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* + * If we got a new cursor, switch to it. + */ + if (ncur) { + pcur = ncur; + ncur = (xfs_btree_cur_t *)0; + } + } while (nbno != NULLAGBLOCK); + *stat = i; + return 0; +} + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_eq( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_ge( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur, to the value given by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +int /* error */ +xfs_alloc_update( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_alloc_block_t *block; /* btree block to update */ + int error; /* error return value */ + int ptr; /* current record number (updating) */ + + ASSERT(len > 0); + /* + * Pick up the a.g. freelist struct and the current block. + */ + block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) + return error; +#endif + /* + * Get the address of the rec to be updated. + */ + ptr = cur->bc_ptrs[0]; + { + xfs_alloc_rec_t *rp; /* pointer to updated record */ + + rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); + /* + * Fill in the new contents and log them. + */ + INT_SET(rp->ar_startblock, ARCH_CONVERT, bno); + INT_SET(rp->ar_blockcount, ARCH_CONVERT, len); + xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr); + } + /* + * If it's the by-size btree and it's the last leaf block and + * it's the last record... then update the size of the longest + * extent in the a.g., which we cache in the a.g. freelist header. + */ + if (cur->bc_btnum == XFS_BTNUM_CNT && + INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK && + ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + xfs_agf_t *agf; /* a.g. freespace header */ + xfs_agnumber_t seqno; + + agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT); + cur->bc_mp->m_perag[seqno].pagf_longest = len; + INT_SET(agf->agf_longest, ARCH_CONVERT, len); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, + XFS_AGF_LONGEST); + } + /* + * Updating first record in leaf. Pass new key value up to our parent. + */ + if (ptr == 1) { + xfs_alloc_key_t key; /* key containing [bno, len] */ + + INT_SET(key.ar_startblock, ARCH_CONVERT, bno); + INT_SET(key.ar_blockcount, ARCH_CONVERT, len); + if ((error = xfs_alloc_updkey(cur, &key, 1))) + return error; + } + return 0; +} diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h new file mode 100644 index 00000000000..ed5161a572e --- /dev/null +++ b/fs/xfs/xfs_alloc_btree.h @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ALLOC_BTREE_H__ +#define __XFS_ALLOC_BTREE_H__ + +/* + * Freespace on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_btree_sblock; +struct xfs_mount; + +/* + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec +{ + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef xfs_agblock_t xfs_alloc_ptr_t; /* btree pointer type */ + /* btree block header type */ +typedef struct xfs_btree_sblock xfs_alloc_block_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_ALLOC_BLOCK) +xfs_alloc_block_t *xfs_buf_to_alloc_block(struct xfs_buf *bp); +#define XFS_BUF_TO_ALLOC_BLOCK(bp) xfs_buf_to_alloc_block(bp) +#else +#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)(XFS_BUF_PTR(bp))) +#endif + +/* + * Real block structures have a size equal to the disk block size. + */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_SIZE) +int xfs_alloc_block_size(int lev, struct xfs_btree_cur *cur); +#define XFS_ALLOC_BLOCK_SIZE(lev,cur) xfs_alloc_block_size(lev,cur) +#else +#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MAXRECS) +int xfs_alloc_block_maxrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) xfs_alloc_block_maxrecs(lev,cur) +#else +#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) \ + ((cur)->bc_mp->m_alloc_mxr[lev != 0]) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MINRECS) +int xfs_alloc_block_minrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) xfs_alloc_block_minrecs(lev,cur) +#else +#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) \ + ((cur)->bc_mp->m_alloc_mnr[lev != 0]) +#endif + +/* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BNO_BLOCK) +xfs_agblock_t xfs_bno_block(struct xfs_mount *mp); +#define XFS_BNO_BLOCK(mp) xfs_bno_block(mp) +#else +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CNT_BLOCK) +xfs_agblock_t xfs_cnt_block(struct xfs_mount *mp); +#define XFS_CNT_BLOCK(mp) xfs_cnt_block(mp) +#else +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) +#endif + +/* + * Record, key, and pointer address macros for btree blocks. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_REC_ADDR) +xfs_alloc_rec_t *xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i, + struct xfs_btree_cur *cur); +#define XFS_ALLOC_REC_ADDR(bb,i,cur) xfs_alloc_rec_addr(bb,i,cur) +#else +#define XFS_ALLOC_REC_ADDR(bb,i,cur) \ + XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, bb, i, \ + XFS_ALLOC_BLOCK_MAXRECS(0, cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_KEY_ADDR) +xfs_alloc_key_t *xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i, + struct xfs_btree_cur *cur); +#define XFS_ALLOC_KEY_ADDR(bb,i,cur) xfs_alloc_key_addr(bb,i,cur) +#else +#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \ + XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \ + XFS_ALLOC_BLOCK_MAXRECS(1, cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_PTR_ADDR) +xfs_alloc_ptr_t *xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i, + struct xfs_btree_cur *cur); +#define XFS_ALLOC_PTR_ADDR(bb,i,cur) xfs_alloc_ptr_addr(bb,i,cur) +#else +#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \ + XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \ + XFS_ALLOC_BLOCK_MAXRECS(1, cur)) +#endif + +/* + * Prototypes for externally visible routines. + */ + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_decrement( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat); /* success/failure */ + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_alloc_delete( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat); /* success/failure */ + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_alloc_increment( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat); /* success/failure */ + +/* + * Insert the current record at the point referenced by cur. + * The cursor may be inconsistent on return if splits have been done. + */ +int /* error */ +xfs_alloc_insert( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat); /* success/failure */ + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +/* + * Update the record referred to by cur, to the value given by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len); /* length of extent */ + +#endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h new file mode 100644 index 00000000000..ae35189b3d7 --- /dev/null +++ b/fs/xfs/xfs_arch.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ARCH_H__ +#define __XFS_ARCH_H__ + +#ifndef XFS_BIG_INUMS +# error XFS_BIG_INUMS must be defined true or false +#endif + +#ifdef __KERNEL__ + +#include + +#ifdef __LITTLE_ENDIAN +# define __BYTE_ORDER __LITTLE_ENDIAN +#endif +#ifdef __BIG_ENDIAN +# define __BYTE_ORDER __BIG_ENDIAN +#endif + +#endif /* __KERNEL__ */ + +/* do we need conversion? */ + +#define ARCH_NOCONVERT 1 +#if __BYTE_ORDER == __LITTLE_ENDIAN +# define ARCH_CONVERT 0 +#else +# define ARCH_CONVERT ARCH_NOCONVERT +#endif + +/* generic swapping macros */ + +#ifndef HAVE_SWABMACROS +#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var)))) +#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var)))) +#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var)))) +#endif + +#define INT_SWAP(type, var) \ + ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \ + ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \ + ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \ + (var)))) + +/* + * get and set integers from potentially unaligned locations + */ + +#define INT_GET_UNALIGNED_16_BE(pointer) \ + ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1]))) +#define INT_SET_UNALIGNED_16_BE(pointer,value) \ + { \ + ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \ + ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ + } + +/* define generic INT_ macros */ + +#define INT_GET(reference,arch) \ + (((arch) == ARCH_NOCONVERT) \ + ? \ + (reference) \ + : \ + INT_SWAP((reference),(reference)) \ + ) + +/* does not return a value */ +#define INT_SET(reference,arch,valueref) \ + (__builtin_constant_p(valueref) ? \ + (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \ + (void)( \ + ((reference) = (valueref)), \ + ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \ + ) \ + ) + +/* does not return a value */ +#define INT_MOD_EXPR(reference,arch,code) \ + (((arch) == ARCH_NOCONVERT) \ + ? \ + (void)((reference) code) \ + : \ + (void)( \ + (reference) = INT_GET((reference),arch) , \ + ((reference) code), \ + INT_SET(reference, arch, reference) \ + ) \ + ) + +/* does not return a value */ +#define INT_MOD(reference,arch,delta) \ + (void)( \ + INT_MOD_EXPR(reference,arch,+=(delta)) \ + ) + +/* + * INT_COPY - copy a value between two locations with the + * _same architecture_ but _potentially different sizes_ + * + * if the types of the two parameters are equal or they are + * in native architecture, a simple copy is done + * + * otherwise, architecture conversions are done + * + */ + +/* does not return a value */ +#define INT_COPY(dst,src,arch) \ + ( \ + ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \ + ? \ + (void)((dst) = (src)) \ + : \ + INT_SET(dst, arch, INT_GET(src, arch)) \ + ) + +/* + * INT_XLATE - copy a value in either direction between two locations + * with different architectures + * + * dir < 0 - copy from memory to buffer (native to arch) + * dir > 0 - copy from buffer to memory (arch to native) + */ + +/* does not return a value */ +#define INT_XLATE(buf,mem,dir,arch) {\ + ASSERT(dir); \ + if (dir>0) { \ + (mem)=INT_GET(buf, arch); \ + } else { \ + INT_SET(buf, arch, mem); \ + } \ +} + +/* + * In directories inode numbers are stored as unaligned arrays of unsigned + * 8bit integers on disk. + * + * For v1 directories or v2 directories that contain inode numbers that + * do not fit into 32bit the array has eight members, but the first member + * is always zero: + * + * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7| + * + * For v2 directories that only contain entries with inode numbers that fit + * into 32bits a four-member array is used: + * + * |24-31|16-23| 8-15| 0- 7| + */ + +#define XFS_GET_DIR_INO4(di) \ + (((u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + +#define XFS_PUT_DIR_INO4(from, di) \ +do { \ + (di).i[0] = (((from) & 0xff000000ULL) >> 24); \ + (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \ + (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \ + (di).i[3] = ((from) & 0x000000ffULL); \ +} while (0) + +#define XFS_DI_HI(di) \ + (((u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) +#define XFS_DI_LO(di) \ + (((u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) + +#define XFS_GET_DIR_INO8(di) \ + (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ + ((xfs_ino_t)XFS_DI_HI(di) << 32)) + +#define XFS_PUT_DIR_INO8(from, di) \ +do { \ + (di).i[0] = 0; \ + (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \ + (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \ + (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \ + (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \ + (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \ + (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \ + (di).i[7] = ((from) & 0x00000000000000ffULL); \ +} while (0) + +#endif /* __XFS_ARCH_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c new file mode 100644 index 00000000000..ee8b5904ec7 --- /dev/null +++ b/fs/xfs/xfs_attr.c @@ -0,0 +1,2660 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_quota.h" +#include "xfs_rw.h" +#include "xfs_trans_space.h" +#include "xfs_acl.h" + +/* + * xfs_attr.c + * + * Provide the external interfaces to manage attribute lists. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute list fits inside the inode. + */ +STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is one block. + */ +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); + +/* + * Internal routines when attribute list is more than one block. + */ +STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); +STATIC int xfs_attr_fillstate(xfs_da_state_t *state); +STATIC int xfs_attr_refillstate(xfs_da_state_t *state); + +/* + * Routines to manipulate out-of-line attribute values. + */ +STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args); +STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); +STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +#if defined(XFS_ATTR_TRACE) +ktrace_t *xfs_attr_trace_buf; +#endif + + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +int +xfs_attr_fetch(xfs_inode_t *ip, char *name, int namelen, + char *value, int *valuelenp, int flags, struct cred *cred) +{ + xfs_da_args_t args; + int error; + + if ((XFS_IFORK_Q(ip) == 0) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return(ENOATTR); + + if (!(flags & (ATTR_KERNACCESS|ATTR_SECURE))) { + if ((error = xfs_iaccess(ip, S_IRUSR, cred))) + return(XFS_ERROR(error)); + } + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name; + args.namelen = namelen; + args.value = value; + args.valuelen = *valuelenp; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = ip; + args.whichfork = XFS_ATTR_FORK; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (XFS_IFORK_Q(ip) == 0 || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) { + error = XFS_ERROR(ENOATTR); + } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_getvalue(&args); + } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_get(&args); + } else { + error = xfs_attr_node_get(&args); + } + + /* + * Return the number of bytes in the value to the caller. + */ + *valuelenp = args.valuelen; + + if (error == EEXIST) + error = 0; + return(error); +} + +int +xfs_attr_get(bhv_desc_t *bdp, char *name, char *value, int *valuelenp, + int flags, struct cred *cred) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + int error, namelen; + + XFS_STATS_INC(xs_attr_get); + + if (!name) + return(EINVAL); + namelen = strlen(name); + if (namelen >= MAXNAMELEN) + return(EFAULT); /* match IRIX behaviour */ + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return(error); +} + +/*ARGSUSED*/ +int /* error */ +xfs_attr_set(bhv_desc_t *bdp, char *name, char *value, int valuelen, int flags, + struct cred *cred) +{ + xfs_da_args_t args; + xfs_inode_t *dp; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error, err2, committed; + int local, size; + uint nblks; + xfs_mount_t *mp; + int rsvd = (flags & ATTR_ROOT) != 0; + int namelen; + + namelen = strlen(name); + if (namelen >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + + XFS_STATS_INC(xs_attr_set); + + dp = XFS_BHVTOI(bdp); + mp = dp->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return (EIO); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!(flags & ATTR_SECURE) && + (error = xfs_iaccess(dp, S_IWUSR, cred))) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Attach the dquots to the inode. + */ + if ((error = XFS_QM_DQATTACH(mp, dp, 0))) + return (error); + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + error = xfs_bmap_add_attrfork(dp, rsvd); + if (error) + return(error); + } + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name; + args.namelen = namelen; + args.value = value; + args.valuelen = valuelen; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.whichfork = XFS_ATTR_FORK; + args.oknoent = 1; + + /* Determine space new attribute will use, and if it will be inline + * or out of line. + */ + size = xfs_attr_leaf_newentsize(&args, mp->m_sb.sb_blocksize, &local); + + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (local) { + if (size > (mp->m_sb.sb_blocksize >> 1)) { + /* Double split possible */ + nblks <<= 1; + } + } else { + uint dblocks = XFS_B_TO_FSB(mp, valuelen); + /* Out of line attribute, cannot double split, but make + * room for the attribute value itself. + */ + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + /* Size is now blocks for attribute data */ + args.total = nblks; + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (rsvd) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, (uint) nblks, + XFS_ATTRSET_LOG_RES(mp, nblks), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRSET_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + return (error); + } + + xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args.trans, dp); + + /* + * If the attribute list is non-existant or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && + (dp->i_d.di_anextents == 0))) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + (void)xfs_attr_shortform_create(&args); + + /* + * Try to add the attr to the attribute list in + * the inode. + */ + error = xfs_attr_shortform_addname(&args); + if (error != ENOSPC) { + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + ASSERT(args.trans != NULL); + + /* + * If this is a synchronous mount, make sure that + * the transaction goes to disk before returning + * to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + err2 = xfs_trans_commit(args.trans, + XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Hit the inode change time. + */ + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_ichgtime(dp, XFS_ICHGTIME_CHG); + } + return(error == 0 ? err2 : error); + } + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + XFS_BMAP_INIT(args.flist, args.firstblock); + error = xfs_attr_shortform_to_leaf(&args); + if (!error) { + error = xfs_bmap_finish(&args.trans, args.flist, + *args.firstblock, &committed); + } + if (error) { + ASSERT(committed); + args.trans = NULL; + xfs_bmap_cancel(&flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args.trans, dp); + } + + /* + * Commit the leaf transformation. We'll need another (linked) + * transaction to add the new attribute to the leaf. + */ + if ((error = xfs_attr_rolltrans(&args.trans, dp))) + goto out; + + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_addname(&args); + } else { + error = xfs_attr_node_addname(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Hit the inode change time. + */ + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_ichgtime(dp, XFS_ICHGTIME_CHG); + } + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ +/*ARGSUSED*/ +int /* error */ +xfs_attr_remove(bhv_desc_t *bdp, char *name, int flags, struct cred *cred) +{ + xfs_da_args_t args; + xfs_inode_t *dp; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error; + xfs_mount_t *mp; + int namelen; + + ASSERT(MAXNAMELEN-1<=0xff); /* length is stored in uint8 */ + namelen = strlen(name); + if (namelen>=MAXNAMELEN) + return EFAULT; /* match irix behaviour */ + + XFS_STATS_INC(xs_attr_remove); + + dp = XFS_BHVTOI(bdp); + mp = dp->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return (EIO); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!(flags & ATTR_SECURE) && + (error = xfs_iaccess(dp, S_IWUSR, cred))) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } else if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(ENOATTR)); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Fill in the arg structure for this request. + */ + memset((char *)&args, 0, sizeof(args)); + args.name = name; + args.namelen = namelen; + args.flags = flags; + args.hashval = xfs_da_hashname(args.name, args.namelen); + args.dp = dp; + args.firstblock = &firstblock; + args.flist = &flist; + args.total = 0; + args.whichfork = XFS_ATTR_FORK; + + /* + * Attach the dquots to the inode. + */ + if ((error = XFS_QM_DQATTACH(mp, dp, 0))) + return (error); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (flags & ATTR_ROOT) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + if ((error = xfs_trans_reserve(args.trans, + XFS_ATTRRM_SPACE_RES(mp), + XFS_ATTRRM_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRRM_LOG_COUNT))) { + xfs_trans_cancel(args.trans, 0); + return(error); + + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + /* + * No need to make quota reservations here. We expect to release some + * blocks not allocate in the common case. + */ + xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args.trans, dp); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = XFS_ERROR(ENOATTR); + goto out; + } + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(&args); + if (error) { + goto out; + } + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(&args); + } else { + error = xfs_attr_node_removename(&args); + } + if (error) { + goto out; + } + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(args.trans); + } + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + /* + * Hit the inode change time. + */ + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_ichgtime(dp, XFS_ICHGTIME_CHG); + } + + return(error); + +out: + if (args.trans) + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, + attrlist_cursor_kern_t *cursor, struct cred *cred) +{ + xfs_attr_list_context_t context; + xfs_inode_t *dp; + int error; + + XFS_STATS_INC(xs_attr_list); + + /* + * Validate the cursor. + */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return(XFS_ERROR(EINVAL)); + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return(XFS_ERROR(EFAULT)); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + context.dp = dp = XFS_BHVTOI(bdp); + context.cursor = cursor; + context.count = 0; + context.dupcnt = 0; + context.resynch = 1; + context.flags = flags; + if (!(flags & ATTR_KERNAMELS)) { + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.alist = (attrlist_t *)buffer; + context.alist->al_count = 0; + context.alist->al_more = 0; + context.alist->al_offset[0] = context.bufsize; + } + else { + context.bufsize = bufsize; + context.firstu = context.bufsize; + context.alist = (attrlist_t *)buffer; + } + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return (EIO); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!(flags & ATTR_SECURE) && + (error = xfs_iaccess(dp, S_IRUSR, cred))) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(XFS_ERROR(error)); + } + + /* + * Decide on what work routines to call based on the inode size. + */ + xfs_attr_trace_l_c("syscall start", &context); + if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(&context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(&context); + } else { + error = xfs_attr_node_list(&context); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_attr_trace_l_c("syscall end", &context); + + if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) { + ASSERT(error >= 0); + } + else { /* must return negated buffer size or the error */ + if (context.count < 0) + error = XFS_ERROR(ERANGE); + else + error = -context.count; + } + + return(error); +} + +int /* error */ +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if ((XFS_IFORK_Q(dp) == 0) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return(0); + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ATTRINVAL_LOG_COUNT))) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(trans, dp); + + /* + * Decide on what work routines to call based on the inode size. + */ + if ((XFS_IFORK_Q(dp) == 0) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = 0; + goto out; + } + error = xfs_attr_root_inactive(&trans, dp); + if (error) + goto out; + /* + * signal synchronous inactive transactions unless this + * is a synchronous mount filesystem in which case we + * know that we're here because we've been called out of + * xfs_inactive which means that the last reference is gone + * and the unlink transaction has already hit the disk so + * async inactive transactions are safe. + */ + if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, + (!(mp->m_flags & XFS_MOUNT_WSYNC) + ? 1 : 0)))) + goto out; + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES, + NULL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} + + + +/*======================================================================== + * External routines when attribute list is inside the inode + *========================================================================*/ + +/* + * Add a name to the shortform attribute list structure + * This is the external routine. + */ +STATIC int +xfs_attr_shortform_addname(xfs_da_args_t *args) +{ + int newsize, retval; + + retval = xfs_attr_shortform_lookup(args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + return(retval); + retval = xfs_attr_shortform_remove(args); + ASSERT(retval == 0); + } + + newsize = XFS_ATTR_SF_TOTSIZE(args->dp); + newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + if ((newsize <= XFS_IFORK_ASIZE(args->dp)) && + (args->namelen < XFS_ATTR_SF_ENTSIZE_MAX) && + (args->valuelen < XFS_ATTR_SF_ENTSIZE_MAX)) { + retval = xfs_attr_shortform_add(args); + ASSERT(retval == 0); + } else { + return(XFS_ERROR(ENOSPC)); + } + return(0); +} + + +/*======================================================================== + * External routines when attribute list is one block + *========================================================================*/ + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +int +xfs_attr_leaf_addname(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval, error, committed; + + /* + * Read the (only) block in the attribute list in. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + /* + * Look up the given attribute in the leaf block. Figure out if + * the given flags produce an error or call for an atomic rename. + */ + retval = xfs_attr_leaf_lookup_int(bp, args); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + xfs_da_brelse(args->trans, bp); + return(retval); + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) { /* pure create op */ + xfs_da_brelse(args->trans, bp); + return(retval); + } + args->rename = 1; /* an atomic rename */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + } + + /* + * Add the attribute to the leaf block, transitioning to a Btree + * if required. + */ + retval = xfs_attr_leaf_add(bp, args); + xfs_da_buf_done(bp); + if (retval == ENOSPC) { + /* + * Promote the attribute list to the Btree format, then + * Commit that transaction so that the node_addname() call + * can manage its own transactions. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_attr_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + + /* + * Commit the current trans (including the inode) and start + * a new one. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + return (error); + + /* + * Fob the whole rest of the problem off on the Btree code. + */ + error = xfs_attr_node_addname(args); + return(error); + } + + /* + * Commit the transaction that added the attr name so that + * later routines can manage their own transactions. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + return (error); + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return(error); + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->rename) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr_leaf_flipflags(args); + if (error) + return(error); + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return(error); + } + + /* + * Read in the block containing the "old" attr, then + * remove the "old" attr from that block (neat, huh!) + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + (void)xfs_attr_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if (xfs_attr_shortform_allfit(bp, dp)) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + *args->firstblock, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + } else + xfs_da_buf_done(bp); + + /* + * Commit the remove and start the next trans in series. + */ + error = xfs_attr_rolltrans(&args->trans, dp); + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr_leaf_clearflag(args); + } + return(error); +} + +/* + * Remove a name from the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_removename(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int committed; + int error; + + /* + * Remove the attribute. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + + ASSERT(bp != NULL); + error = xfs_attr_leaf_lookup_int(bp, args); + if (error == ENOATTR) { + xfs_da_brelse(args->trans, bp); + return(error); + } + + (void)xfs_attr_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. + */ + if (xfs_attr_shortform_allfit(bp, dp)) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + } else + xfs_da_buf_done(bp); + return(0); +} + +/* + * Look up a name in a leaf attribute list structure. + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +int +xfs_attr_leaf_get(xfs_da_args_t *args) +{ + xfs_dabuf_t *bp; + int error; + + args->blkno = 0; + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + error = xfs_attr_leaf_lookup_int(bp, args); + if (error != EEXIST) { + xfs_da_brelse(args->trans, bp); + return(error); + } + error = xfs_attr_leaf_getvalue(bp, args); + xfs_da_brelse(args->trans, bp); + if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { + error = xfs_attr_rmtval_get(args); + } + return(error); +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. + */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + xfs_attr_leafblock_t *leaf; + int error; + xfs_dabuf_t *bp; + + context->cursor->blkno = 0; + error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + leaf = bp->data; + if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + != XFS_ATTR_LEAF_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, + context->dp->i_mount, leaf); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + + (void)xfs_attr_leaf_list_int(bp, context); + xfs_da_brelse(NULL, bp); + return(0); +} + + +/*======================================================================== + * External routines when attribute list size > XFS_LBSIZE(mp). + *========================================================================*/ + +/* + * Add a name to a Btree-format attribute list. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. + */ +STATIC int +xfs_attr_node_addname(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_mount_t *mp; + int committed, retval, error; + + /* + * Fill in bucket of arguments/results/context to carry around. + */ + dp = args->dp; + mp = dp->i_mount; +restart: + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name already exists, and get back a pointer + * to where it should go. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) + goto out; + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { + goto out; + } else if (retval == EEXIST) { + if (args->flags & ATTR_CREATE) + goto out; + args->rename = 1; /* atomic rename op */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + } + + retval = xfs_attr_leaf_add(blk->bp, state->args); + if (retval == ENOSPC) { + if (state->path.active == 1) { + /* + * Its really a single leaf node, but it had + * out-of-line values so it looked like it *might* + * have been a b-tree. + */ + xfs_da_state_free(state); + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_attr_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + *args->firstblock, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + + /* + * Commit the node conversion and start the next + * trans in the chain. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + goto out; + + goto restart; + } + + /* + * Split as many Btree elements as required. + * This code tracks the new and old attr's location + * in the index/blkno/rmtblkno/rmtblkcnt fields and + * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_da_split(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + } else { + /* + * Addition succeeded, update Btree hashvals. + */ + xfs_da_fixhashpath(state, &state->path); + } + + /* + * Kill the state structure, we're done with it and need to + * allow the buffers to come back later. + */ + xfs_da_state_free(state); + state = NULL; + + /* + * Commit the leaf addition or btree split and start the next + * trans in the chain. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + goto out; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return(error); + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->rename) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr_leaf_flipflags(args); + if (error) + goto out; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return(error); + } + + /* + * Re-find the "old" attribute entry after any split ops. + * The INCOMPLETE flag means that we will find the "old" + * attr, not the "new" one. + */ + args->flags |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + state->inleaf = 0; + error = xfs_da_node_lookup_int(state, &retval); + if (error) + goto out; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr_leaf_remove(blk->bp, args); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_da_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + *args->firstblock, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + } + + /* + * Commit and start the next trans in the chain. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + goto out; + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr_leaf_clearflag(args); + if (error) + goto out; + } + retval = error = 0; + +out: + if (state) + xfs_da_state_free(state); + if (error) + return(error); + return(retval); +} + +/* + * Remove a name from a B-tree attribute list. + * + * This will involve walking down the Btree, and may involve joining + * leaf nodes and even joining intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_attr_node_removename(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval, error, committed; + + /* + * Tie a string around our finger to remind us where we are. + */ + dp = args->dp; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error || (retval != EEXIST)) { + if (error == 0) + error = retval; + goto out; + } + + /* + * If there is an out-of-line value, de-allocate the blocks. + * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a deadlock. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if (args->rmtblkno > 0) { + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + goto out; + + /* + * Mark the attribute as INCOMPLETE, then bunmapi() the + * remote value. + */ + error = xfs_attr_leaf_setflag(args); + if (error) + goto out; + error = xfs_attr_rmtval_remove(args); + if (error) + goto out; + + /* + * Refill the state structure with buffers, the prior calls + * released our buffers. + */ + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr_leaf_remove(blk->bp, args); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_da_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + + /* + * Commit the Btree join operation and start a new trans. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + goto out; + } + + /* + * If the result is small enough, push it all into the inode. + */ + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + xfs_da_buf_done(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(INT_GET(((xfs_attr_leafblock_t *) + bp->data)->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + + if (xfs_attr_shortform_allfit(bp, dp)) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_attr_leaf_to_shortform(bp, args); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + *args->firstblock, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + } else + xfs_da_brelse(args->trans, bp); + } + error = 0; + +out: + xfs_da_state_free(state); + return(error); +} + +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commit's has released these buffers. + */ +STATIC int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_da_blkno(blk->bp); + xfs_da_buf_done(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_da_blkno(blk->bp); + xfs_da_buf_done(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return(0); +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commit's has released those + * buffers from our grip. + */ +STATIC int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da_read_buf(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return(error); + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da_read_buf(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return(error); + } else { + blk->bp = NULL; + } + } + + return(0); +} + +/* + * Look up a filename in a node attribute list. + * + * This routine gets called for any attribute fork that has more than one + * block, ie: both true Btree attr lists and for single-leaf-blocks with + * "remote" values taking up more blocks. + */ +int +xfs_attr_node_get(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int error, retval; + int i; + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_attr_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) { + retval = error; + } else if (retval == EEXIST) { + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Get the value, local or "remote" + */ + retval = xfs_attr_leaf_getvalue(blk->bp, args); + if (!retval && (args->rmtblkno > 0) + && !(args->flags & ATTR_KERNOVAL)) { + retval = xfs_attr_rmtval_get(args); + } + } + + /* + * If not in a transaction, we have to release all the buffers. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return(retval); +} + +STATIC int /* error */ +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int error, i; + xfs_dabuf_t *bp; + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. + */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) { + node = bp->data; + switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) { + case XFS_DA_NODE_MAGIC: + xfs_attr_trace_l_cn("wrong blk", context, node); + xfs_da_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + leaf = bp->data; + if (cursor->hashval > + INT_GET(leaf->entries[ + INT_GET(leaf->hdr.count, + ARCH_CONVERT)-1].hashval, + ARCH_CONVERT)) { + xfs_attr_trace_l_cl("wrong blk", + context, leaf); + xfs_da_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= + INT_GET(leaf->entries[0].hashval, + ARCH_CONVERT)) { + xfs_attr_trace_l_cl("maybe wrong blk", + context, leaf); + xfs_da_brelse(NULL, bp); + bp = NULL; + } + break; + default: + xfs_attr_trace_l_c("wrong blk - ??", context); + xfs_da_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. + */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + error = xfs_da_read_buf(NULL, context->dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (unlikely(bp == NULL)) { + XFS_ERROR_REPORT("xfs_attr_node_list(2)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount); + return(XFS_ERROR(EFSCORRUPTED)); + } + node = bp->data; + if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC) + break; + if (unlikely(INT_GET(node->hdr.info.magic, ARCH_CONVERT) + != XFS_DA_NODE_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + btree = node->btree; + for (i = 0; + i < INT_GET(node->hdr.count, ARCH_CONVERT); + btree++, i++) { + if (cursor->hashval + <= INT_GET(btree->hashval, + ARCH_CONVERT)) { + cursor->blkno = INT_GET(btree->before, ARCH_CONVERT); + xfs_attr_trace_l_cb("descending", + context, btree); + break; + } + } + if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) { + xfs_da_brelse(NULL, bp); + return(0); + } + xfs_da_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->data; + if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + != XFS_ATTR_LEAF_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, leaf); + xfs_da_brelse(NULL, bp); + return(XFS_ERROR(EFSCORRUPTED)); + } + error = xfs_attr_leaf_list_int(bp, context); + if (error || !leaf->hdr.info.forw) + break; /* not really an error, buffer full or EOF */ + cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT); + xfs_da_brelse(NULL, bp); + error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + if (unlikely((bp == NULL))) { + XFS_ERROR_REPORT("xfs_attr_node_list(5)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount); + return(XFS_ERROR(EFSCORRUPTED)); + } + } + xfs_da_brelse(NULL, bp); + return(0); +} + + +/*======================================================================== + * External routines for manipulating out-of-line attribute values. + *========================================================================*/ + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +STATIC int +xfs_attr_rmtval_get(xfs_da_args_t *args) +{ + xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; + xfs_mount_t *mp; + xfs_daddr_t dblkno; + xfs_caddr_t dst; + xfs_buf_t *bp; + int nmap, error, tmp, valuelen, blkcnt, i; + xfs_dablk_t lblkno; + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + + mp = args->dp->i_mount; + dst = args->value; + valuelen = args->valuelen; + lblkno = args->rmtblkno; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, map, &nmap, NULL); + if (error) + return(error); + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, + blkcnt, XFS_BUF_LOCK, &bp); + if (error) + return(error); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) + ? valuelen : XFS_BUF_SIZE(bp); + xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); + xfs_buf_relse(bp); + dst += tmp; + valuelen -= tmp; + + lblkno += map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +STATIC int +xfs_attr_rmtval_set(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_fileoff_t lfileoff; + xfs_inode_t *dp; + xfs_bmbt_irec_t map; + xfs_daddr_t dblkno; + xfs_caddr_t src; + xfs_buf_t *bp; + xfs_dablk_t lblkno; + int blkcnt, valuelen, nmap, error, tmp, committed; + + dp = args->dp; + mp = dp->i_mount; + src = args->value; + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. + */ + blkcnt = XFS_B_TO_FSB(mp, args->valuelen); + lfileoff = 0; + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) { + return(error); + } + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + /* + * Allocate a single extent, up to the size of the value. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | + XFS_BMAPI_WRITE, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, dp); + } + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + if ((error = xfs_attr_rolltrans(&args->trans, dp))) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + valuelen = args->valuelen; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, + blkcnt, XFS_BUF_LOCK); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + + tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : + XFS_BUF_SIZE(bp); + xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); + if (tmp < XFS_BUF_SIZE(bp)) + xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ + return (error); + } + src += tmp; + valuelen -= tmp; + + lblkno += map.br_blockcount; + } + ASSERT(valuelen == 0); + return(0); +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +STATIC int +xfs_attr_rmtval_remove(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_bmbt_irec_t map; + xfs_buf_t *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + int valuelen, blkcnt, nmap, error, done, committed; + + mp = args->dp->i_mount; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + lblkno = args->rmtblkno; + valuelen = args->rmtblkcnt; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + XFS_BMAP_INIT(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, 0, &map, &nmap, + args->flist); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, + XFS_INCORE_TRYLOCK); + if (bp) { + XFS_BUF_STALE(bp); + XFS_BUF_UNDELAYWRITE(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + valuelen -= map.br_blockcount; + + lblkno += map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + XFS_BMAP_INIT(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + *args->firstblock, &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) { + xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(args->trans, args->dp); + } + + /* + * Close out trans and start the next one in the chain. + */ + if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) + return (error); + } + return(0); +} + +#if defined(XFS_ATTR_TRACE) +/* + * Add a trace buffer entry for an attr_list context structure. + */ +void +xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + ((context->count > 0) && + !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)NULL, + (__psunsigned_t)NULL, + (__psunsigned_t)NULL); +} + +/* + * Add a trace buffer entry for a context structure and a Btree node. + */ +void +xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, + struct xfs_da_intnode *node) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + ((context->count > 0) && + !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT)); +} + +/* + * Add a trace buffer entry for a context structure and a Btree element. + */ +void +xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, + struct xfs_da_node_entry *btree) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + ((context->count > 0) && + !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT), + (__psunsigned_t)NULL); +} + +/* + * Add a trace buffer entry for a context structure and a leaf block. + */ +void +xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, + struct xfs_attr_leafblock *leaf) +{ + xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, + (__psunsigned_t)context->dp, + (__psunsigned_t)context->cursor->hashval, + (__psunsigned_t)context->cursor->blkno, + (__psunsigned_t)context->cursor->offset, + (__psunsigned_t)context->alist, + (__psunsigned_t)context->bufsize, + (__psunsigned_t)context->count, + (__psunsigned_t)context->firstu, + (__psunsigned_t) + ((context->count > 0) && + !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) + ? (ATTR_ENTRY(context->alist, + context->count-1)->a_valuelen) + : 0, + (__psunsigned_t)context->dupcnt, + (__psunsigned_t)context->flags, + (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT), + (__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT)); +} + +/* + * Add a trace buffer entry for the arguments given to the routine, + * generic form. + */ +void +xfs_attr_trace_enter(int type, char *where, + __psunsigned_t a2, __psunsigned_t a3, + __psunsigned_t a4, __psunsigned_t a5, + __psunsigned_t a6, __psunsigned_t a7, + __psunsigned_t a8, __psunsigned_t a9, + __psunsigned_t a10, __psunsigned_t a11, + __psunsigned_t a12, __psunsigned_t a13, + __psunsigned_t a14, __psunsigned_t a15) +{ + ASSERT(xfs_attr_trace_buf); + ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), + (void *)where, + (void *)a2, (void *)a3, (void *)a4, + (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10, + (void *)a11, (void *)a12, (void *)a13, + (void *)a14, (void *)a15); +} +#endif /* XFS_ATTR_TRACE */ + + +/*======================================================================== + * System (pseudo) namespace attribute interface routines. + *========================================================================*/ + +STATIC int +posix_acl_access_set( + vnode_t *vp, char *name, void *data, size_t size, int xflags) +{ + return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS); +} + +STATIC int +posix_acl_access_remove( + struct vnode *vp, char *name, int xflags) +{ + return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); +} + +STATIC int +posix_acl_access_get( + vnode_t *vp, char *name, void *data, size_t size, int xflags) +{ + return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS); +} + +STATIC int +posix_acl_access_exists( + vnode_t *vp) +{ + return xfs_acl_vhasacl_access(vp); +} + +STATIC int +posix_acl_default_set( + vnode_t *vp, char *name, void *data, size_t size, int xflags) +{ + return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT); +} + +STATIC int +posix_acl_default_get( + vnode_t *vp, char *name, void *data, size_t size, int xflags) +{ + return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT); +} + +STATIC int +posix_acl_default_remove( + struct vnode *vp, char *name, int xflags) +{ + return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT); +} + +STATIC int +posix_acl_default_exists( + vnode_t *vp) +{ + return xfs_acl_vhasacl_default(vp); +} + +struct attrnames posix_acl_access = { + .attr_name = "posix_acl_access", + .attr_namelen = sizeof("posix_acl_access") - 1, + .attr_get = posix_acl_access_get, + .attr_set = posix_acl_access_set, + .attr_remove = posix_acl_access_remove, + .attr_exists = posix_acl_access_exists, +}; + +struct attrnames posix_acl_default = { + .attr_name = "posix_acl_default", + .attr_namelen = sizeof("posix_acl_default") - 1, + .attr_get = posix_acl_default_get, + .attr_set = posix_acl_default_set, + .attr_remove = posix_acl_default_remove, + .attr_exists = posix_acl_default_exists, +}; + +struct attrnames *attr_system_names[] = + { &posix_acl_access, &posix_acl_default }; + + +/*======================================================================== + * Namespace-prefix-style attribute name interface routines. + *========================================================================*/ + +STATIC int +attr_generic_set( + struct vnode *vp, char *name, void *data, size_t size, int xflags) +{ + int error; + + VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error); + return -error; +} + +STATIC int +attr_generic_get( + struct vnode *vp, char *name, void *data, size_t size, int xflags) +{ + int error, asize = size; + + VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error); + if (!error) + return asize; + return -error; +} + +STATIC int +attr_generic_remove( + struct vnode *vp, char *name, int xflags) +{ + int error; + + VOP_ATTR_REMOVE(vp, name, xflags, NULL, error); + return -error; +} + +STATIC int +attr_generic_listadd( + attrnames_t *prefix, + attrnames_t *namesp, + void *data, + size_t size, + ssize_t *result) +{ + char *p = data + *result; + + *result += prefix->attr_namelen; + *result += namesp->attr_namelen + 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + strcpy(p, prefix->attr_name); + p += prefix->attr_namelen; + strcpy(p, namesp->attr_name); + p += namesp->attr_namelen + 1; + return 0; +} + +STATIC int +attr_system_list( + struct vnode *vp, + void *data, + size_t size, + ssize_t *result) +{ + attrnames_t *namesp; + int i, error = 0; + + for (i = 0; i < ATTR_SYSCOUNT; i++) { + namesp = attr_system_names[i]; + if (!namesp->attr_exists || !namesp->attr_exists(vp)) + continue; + error = attr_generic_listadd(&attr_system, namesp, + data, size, result); + if (error) + break; + } + return error; +} + +int +attr_generic_list( + struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result) +{ + attrlist_cursor_kern_t cursor = { 0 }; + int error; + + VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error); + if (error > 0) + return -error; + *result = -error; + return attr_system_list(vp, data, size, result); +} + +attrnames_t * +attr_lookup_namespace( + char *name, + struct attrnames **names, + int nnames) +{ + int i; + + for (i = 0; i < nnames; i++) + if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen)) + return names[i]; + return NULL; +} + +/* + * Some checks to prevent people abusing EAs to get over quota: + * - Don't allow modifying user EAs on devices/symlinks; + * - Don't allow modifying user EAs if sticky bit set; + */ +STATIC int +attr_user_capable( + struct vnode *vp, + cred_t *cred) +{ + struct inode *inode = LINVFS_GET_IP(vp); + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && + (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + return 0; +} + +STATIC int +attr_trusted_capable( + struct vnode *vp, + cred_t *cred) +{ + struct inode *inode = LINVFS_GET_IP(vp); + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + +STATIC int +attr_secure_capable( + struct vnode *vp, + cred_t *cred) +{ + return -ENOSECURITY; +} + +STATIC int +attr_system_set( + struct vnode *vp, char *name, void *data, size_t size, int xflags) +{ + attrnames_t *namesp; + int error; + + if (xflags & ATTR_CREATE) + return -EINVAL; + + namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); + if (!namesp) + return -EOPNOTSUPP; + error = namesp->attr_set(vp, name, data, size, xflags); + if (!error) + error = vn_revalidate(vp); + return error; +} + +STATIC int +attr_system_get( + struct vnode *vp, char *name, void *data, size_t size, int xflags) +{ + attrnames_t *namesp; + + namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); + if (!namesp) + return -EOPNOTSUPP; + return namesp->attr_get(vp, name, data, size, xflags); +} + +STATIC int +attr_system_remove( + struct vnode *vp, char *name, int xflags) +{ + attrnames_t *namesp; + + namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); + if (!namesp) + return -EOPNOTSUPP; + return namesp->attr_remove(vp, name, xflags); +} + +struct attrnames attr_system = { + .attr_name = "system.", + .attr_namelen = sizeof("system.") - 1, + .attr_flag = ATTR_SYSTEM, + .attr_get = attr_system_get, + .attr_set = attr_system_set, + .attr_remove = attr_system_remove, + .attr_capable = (attrcapable_t)fs_noerr, +}; + +struct attrnames attr_trusted = { + .attr_name = "trusted.", + .attr_namelen = sizeof("trusted.") - 1, + .attr_flag = ATTR_ROOT, + .attr_get = attr_generic_get, + .attr_set = attr_generic_set, + .attr_remove = attr_generic_remove, + .attr_capable = attr_trusted_capable, +}; + +struct attrnames attr_secure = { + .attr_name = "security.", + .attr_namelen = sizeof("security.") - 1, + .attr_flag = ATTR_SECURE, + .attr_get = attr_generic_get, + .attr_set = attr_generic_set, + .attr_remove = attr_generic_remove, + .attr_capable = attr_secure_capable, +}; + +struct attrnames attr_user = { + .attr_name = "user.", + .attr_namelen = sizeof("user.") - 1, + .attr_get = attr_generic_get, + .attr_set = attr_generic_set, + .attr_remove = attr_generic_remove, + .attr_capable = attr_user_capable, +}; + +struct attrnames *attr_namespaces[] = + { &attr_system, &attr_trusted, &attr_secure, &attr_user }; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h new file mode 100644 index 00000000000..67cd0f5ac1a --- /dev/null +++ b/fs/xfs/xfs_attr.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2000, 2002-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +/* + * xfs_attr.h + * + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * External interfaces + *========================================================================*/ + +struct cred; +struct vnode; + +typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int); +typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int); +typedef int (*attrremove_t)(struct vnode *, char *, int); +typedef int (*attrexists_t)(struct vnode *); +typedef int (*attrcapable_t)(struct vnode *, struct cred *); + +typedef struct attrnames { + char * attr_name; + unsigned int attr_namelen; + unsigned int attr_flag; + attrget_t attr_get; + attrset_t attr_set; + attrremove_t attr_remove; + attrexists_t attr_exists; + attrcapable_t attr_capable; +} attrnames_t; + +#define ATTR_NAMECOUNT 4 +extern struct attrnames attr_user; +extern struct attrnames attr_secure; +extern struct attrnames attr_system; +extern struct attrnames attr_trusted; +extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; + +#define ATTR_SYSCOUNT 2 +extern struct attrnames posix_acl_access; +extern struct attrnames posix_acl_default; +extern struct attrnames *attr_system_names[ATTR_SYSCOUNT]; + +extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int); +extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *); + +#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ +#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ +#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */ + +#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */ +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ +#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */ + +#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */ +#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */ +#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS) + +/* + * The maximum size (into the kernel or returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an ERANGE return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Multi-attribute operation vector. + */ +typedef struct attr_multiop { + int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */ + int am_error; /* [out arg] result of this sub-op (an errno) */ + char *am_attrname; /* attribute name to work with */ + char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */ + int am_length; /* [in/out arg] length of value */ + int am_flags; /* bitwise OR of attr API flags defined above */ +} attr_multiop_t; + +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ + +/* + * Kernel-internal version of the attrlist cursor. + */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +struct xfs_inode; +struct attrlist_cursor_kern; +struct xfs_da_args; + +/* + * Overall external interface routines. + */ +int xfs_attr_get(bhv_desc_t *, char *, char *, int *, int, struct cred *); +int xfs_attr_set(bhv_desc_t *, char *, char *, int, int, struct cred *); +int xfs_attr_remove(bhv_desc_t *, char *, int, struct cred *); +int xfs_attr_list(bhv_desc_t *, char *, int, int, + struct attrlist_cursor_kern *, struct cred *); +int xfs_attr_inactive(struct xfs_inode *dp); + +int xfs_attr_node_get(struct xfs_da_args *); +int xfs_attr_leaf_get(struct xfs_da_args *); +int xfs_attr_shortform_getvalue(struct xfs_da_args *); +int xfs_attr_fetch(struct xfs_inode *, char *, int, + char *, int *, int, struct cred *); + +#endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c new file mode 100644 index 00000000000..b11256e58bf --- /dev/null +++ b/fs/xfs/xfs_attr_leaf.c @@ -0,0 +1,3050 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +/* + * xfs_attr_leaf.c + * + * GROT: figure out how to recover gracefully when bmap returns ENOSPC. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_bit.h" + +/* + * xfs_attr_leaf.c + * + * Routines to implement leaf blocks of attributes as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, + int freemap_index); +STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); +STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + xfs_da_state_blk_t *leaf_blk_2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); + +/* + * Utility routines. + */ +STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, + int src_start, + xfs_attr_leafblock_t *dst_leaf, + int dst_start, int move_count, + xfs_mount_t *mp); + + +/*======================================================================== + * External routines when dirsize < XFS_LITINO(mp). + *========================================================================*/ + +/* + * Create the initial contents of a shortform attribute list. + */ +int +xfs_attr_shortform_create(xfs_da_args_t *args) +{ + xfs_attr_sf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + ASSERT(dp != NULL); + ifp = dp->i_afp; + ASSERT(ifp != NULL); + ASSERT(ifp->if_bytes == 0); + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_flags |= XFS_IFINLINE; + } else { + ASSERT(ifp->if_flags & XFS_IFINLINE); + } + xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); + hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; + hdr->count = 0; + INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr)); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + return(0); +} + +/* + * Add a name/value pair to the shortform attribute list. + * Overflow from the inode has already been checked for. + */ +int +xfs_attr_shortform_add(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i, offset, size; + xfs_inode_t *dp; + xfs_ifork_t *ifp; + + dp = args->dp; + ifp = dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + return(XFS_ERROR(EEXIST)); + } + + offset = (char *)sfe - (char *)sf; + size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset); + + sfe->namelen = args->namelen; + INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen); + sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : + ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); + memcpy(sfe->nameval, args->name, args->namelen); + memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); + INT_MOD(sf->hdr.count, ARCH_CONVERT, 1); + INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + return(0); +} + +/* + * Remove a name from the shortform attribute list structure. + */ +int +xfs_attr_shortform_remove(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int base, size=0, end, totsize, i; + xfs_inode_t *dp; + + /* + * Remove the attribute. + */ + dp = args->dp; + base = sizeof(xfs_attr_sf_hdr_t); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), + base += size, i++) { + size = XFS_ATTR_SF_ENTSIZE(sfe); + if (sfe->namelen != args->namelen) + continue; + if (memcmp(sfe->nameval, args->name, args->namelen) != 0) + continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + break; + } + if (i == INT_GET(sf->hdr.count, ARCH_CONVERT)) + return(XFS_ERROR(ENOATTR)); + + end = base + size; + totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT); + if (end != totsize) { + memmove(&((char *)sf)[base], &((char *)sf)[end], + totsize - end); + } + INT_MOD(sf->hdr.count, ARCH_CONVERT, -1); + INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size); + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); + + return(0); +} + +/* + * Look up a name in a shortform attribute list structure. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_lookup(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + xfs_ifork_t *ifp; + + ifp = args->dp->i_afp; + ASSERT(ifp->if_flags & XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Look up a name in a shortform attribute list structure. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_getvalue(xfs_da_args_t *args) +{ + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + int i; + + ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); + sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { + if (sfe->namelen != args->namelen) + continue; + if (memcmp(args->name, sfe->nameval, args->namelen) != 0) + continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0)) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0)) + continue; + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + return(XFS_ERROR(EEXIST)); + } + if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) { + args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + return(XFS_ERROR(ERANGE)); + } + args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + memcpy(args->value, &sfe->nameval[args->namelen], + args->valuelen); + return(XFS_ERROR(EEXIST)); + } + return(XFS_ERROR(ENOATTR)); +} + +/* + * Convert from using the shortform to the leaf. + */ +int +xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_da_args_t nargs; + char *tmpbuffer; + int error, i, size; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + xfs_ifork_t *ifp; + + dp = args->dp; + ifp = dp->i_afp; + sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; + size = INT_GET(sf->hdr.totsize, ARCH_CONVERT); + tmpbuffer = kmem_alloc(size, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, ifp->if_u1.if_data, size); + sf = (xfs_attr_shortform_t *)tmpbuffer; + + xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + bp = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) { + /* + * If we hit an IO error middle of the transaction inside + * grow_inode(), we may have inconsistent data. Bail out. + */ + if (error == EIO) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + ASSERT(blkno == 0); + error = xfs_attr_leaf_create(args, blkno, &bp); + if (error) { + error = xfs_da_shrink_inode(args, 0, bp); + bp = NULL; + if (error) + goto out; + xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ + memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + goto out; + } + + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.oknoent = 1; + + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + nargs.name = (char *)sfe->nameval; + nargs.namelen = sfe->namelen; + nargs.value = (char *)&sfe->nameval[nargs.namelen]; + nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT); + nargs.hashval = xfs_da_hashname((char *)sfe->nameval, + sfe->namelen); + nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : + ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == ENOATTR); + error = xfs_attr_leaf_add(bp, &nargs); + ASSERT(error != ENOSPC); + if (error) + goto out; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + error = 0; + +out: + if(bp) + xfs_da_buf_done(bp); + kmem_free(tmpbuffer, size); + return(error); +} + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (INT_GET(sa->hash, ARCH_CONVERT) + < INT_GET(sb->hash, ARCH_CONVERT)) { + return(-1); + } else if (INT_GET(sa->hash, ARCH_CONVERT) + > INT_GET(sb->hash, ARCH_CONVERT)) { + return(1); + } else { + return(sa->entno - sb->entno); + } +} + +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform atrtribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. + */ +/*ARGSUSED*/ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return(0); + cursor = context->cursor; + ASSERT(cursor != NULL); + + xfs_attr_trace_l_c("sf start", context); + + /* + * If the buffer is large enough, do not bother with sorting. + * Note the generous fudge factor of 16 overhead bytes per entry. + */ + if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16) + < context->bufsize) { + for (i = 0, sfe = &sf->list[0]; + i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + attrnames_t *namesp; + + if (((context->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0) && + !(context->flags & ATTR_KERNORMALS)) { + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + continue; + } + if (((context->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0) && + !(context->flags & ATTR_KERNROOTLS)) { + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + continue; + } + namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure: + ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : + &attr_user); + if (context->flags & ATTR_KERNOVAL) { + ASSERT(context->flags & ATTR_KERNAMELS); + context->count += namesp->attr_namelen + + INT_GET(sfe->namelen, ARCH_CONVERT) + 1; + } + else { + if (xfs_attr_put_listent(context, namesp, + (char *)sfe->nameval, + (int)sfe->namelen, + (int)INT_GET(sfe->valuelen, + ARCH_CONVERT))) + break; + } + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + xfs_attr_trace_l_c("sf big-gulp", context); + return(0); + } + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; + i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + xfs_attr_trace_l_c("sf corrupted", context); + kmem_free(sbuf, sbsize); + return XFS_ERROR(EFSCORRUPTED); + } + if (((context->flags & ATTR_SECURE) != 0) != + ((sfe->flags & XFS_ATTR_SECURE) != 0) && + !(context->flags & ATTR_KERNORMALS)) { + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + continue; + } + if (((context->flags & ATTR_ROOT) != 0) != + ((sfe->flags & XFS_ATTR_ROOT) != 0) && + !(context->flags & ATTR_KERNROOTLS)) { + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + continue; + } + sbp->entno = i; + INT_SET(sbp->hash, ARCH_CONVERT, + xfs_da_hashname((char *)sfe->nameval, sfe->namelen)); + sbp->name = (char *)sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf, sbsize); + xfs_attr_trace_l_c("blk end", context); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + attrnames_t *namesp; + + namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure : + ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted : + &attr_user); + + if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) { + cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT); + cursor->offset = 0; + } + if (context->flags & ATTR_KERNOVAL) { + ASSERT(context->flags & ATTR_KERNAMELS); + context->count += namesp->attr_namelen + + sbp->namelen + 1; + } else { + if (xfs_attr_put_listent(context, namesp, + sbp->name, sbp->namelen, + INT_GET(sbp->valuelen, ARCH_CONVERT))) + break; + } + cursor->offset++; + } + + kmem_free(sbuf, sbsize); + xfs_attr_trace_l_c("sf E-O-F", context); + return(0); +} + +/* + * Check a leaf attribute block to see if all the entries would fit into + * a shortform attribute list. + */ +int +xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + int bytes, i; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + + entry = &leaf->entries[0]; + bytes = sizeof(struct xfs_attr_sf_hdr); + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!(entry->flags & XFS_ATTR_LOCAL)) + return(0); + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX) + return(0); + bytes += sizeof(struct xfs_attr_sf_entry)-1 + + name_loc->namelen + + INT_GET(name_loc->valuelen, ARCH_CONVERT); + } + return( bytes < XFS_IFORK_ASIZE(dp) ); +} + +/* + * Convert a leaf attribute list to shortform attribute list + */ +int +xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_da_args_t nargs; + xfs_inode_t *dp; + char *tmpbuffer; + int error, i; + + dp = args->dp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + + ASSERT(bp != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + + /* + * Clean out the prior contents of the attribute list. + */ + error = xfs_da_shrink_inode(args, 0, bp); + if (error) + goto out; + error = xfs_attr_shortform_create(args); + if (error) + goto out; + + /* + * Copy the attributes + */ + memset((char *)&nargs, 0, sizeof(nargs)); + nargs.dp = dp; + nargs.firstblock = args->firstblock; + nargs.flist = args->flist; + nargs.total = args->total; + nargs.whichfork = XFS_ATTR_FORK; + nargs.trans = args->trans; + nargs.oknoent = 1; + entry = &leaf->entries[0]; + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) { + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* don't copy partial entries */ + if (!entry->nameidx) + continue; + ASSERT(entry->flags & XFS_ATTR_LOCAL); + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + nargs.name = (char *)name_loc->nameval; + nargs.namelen = name_loc->namelen; + nargs.value = (char *)&name_loc->nameval[nargs.namelen]; + nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT); + nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT); + nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : + ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + xfs_attr_shortform_add(&nargs); + } + error = 0; + +out: + kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); + return(error); +} + +/* + * Convert from using a single leaf to a root node and a leaf. + */ +int +xfs_attr_leaf_to_node(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_inode_t *dp; + xfs_dabuf_t *bp1, *bp2; + xfs_dablk_t blkno; + int error; + + dp = args->dp; + bp1 = bp2 = NULL; + error = xfs_da_grow_inode(args, &blkno); + if (error) + goto out; + error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(bp1 != NULL); + bp2 = NULL; + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, + XFS_ATTR_FORK); + if (error) + goto out; + ASSERT(bp2 != NULL); + memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); + xfs_da_buf_done(bp1); + bp1 = NULL; + xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + if (error) + goto out; + node = bp1->data; + leaf = bp2->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + /* both on-disk, don't endian-flip twice */ + node->btree[0].hashval = + leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval; + INT_SET(node->btree[0].before, ARCH_CONVERT, blkno); + INT_SET(node->hdr.count, ARCH_CONVERT, 1); + xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + error = 0; +out: + if (bp1) + xfs_da_buf_done(bp1); + if (bp2) + xfs_da_buf_done(bp2); + return(error); +} + + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of a leaf attribute list + * or a leaf in a node attribute list. + */ +int +xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int error; + + dp = args->dp; + ASSERT(dp != NULL); + error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + leaf = bp->data; + memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); + hdr = &leaf->hdr; + INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC); + INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount)); + if (!hdr->firstused) { + INT_SET(hdr->firstused, ARCH_CONVERT, + XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); + } + + INT_SET(hdr->freemap[0].base, ARCH_CONVERT, + sizeof(xfs_attr_leaf_hdr_t)); + INT_SET(hdr->freemap[0].size, ARCH_CONVERT, + INT_GET(hdr->firstused, ARCH_CONVERT) + - INT_GET(hdr->freemap[0].base, + ARCH_CONVERT)); + + xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + + *bpp = bp; + return(0); +} + +/* + * Split the leaf node, rebalance, then add the new entry. + */ +int +xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_dablk_t blkno; + int error; + + /* + * Allocate space for a new leaf node. + */ + ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); + error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + if (error) + return(error); + newblk->blkno = blkno; + newblk->magic = XFS_ATTR_LEAF_MAGIC; + + /* + * Rebalance the entries across the two leaves. + * NOTE: rebalance() currently depends on the 2nd block being empty. + */ + xfs_attr_leaf_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + + /* + * Save info on "old" attribute for "atomic rename" ops, leaf_add() + * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the + * "new" attrs info. Will need the "old" info to remove it later. + * + * Insert the "new" entry in the correct block. + */ + if (state->inleaf) + error = xfs_attr_leaf_add(oldblk->bp, state->args); + else + error = xfs_attr_leaf_add(newblk->bp, state->args); + + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); + return(error); +} + +/* + * Add a name to the leaf attribute list structure. + */ +int +xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_map_t *map; + int tablesize, entsize, sum, tmp, i; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT((args->index >= 0) + && (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT))); + hdr = &leaf->hdr; + entsize = xfs_attr_leaf_newentsize(args, + args->trans->t_mountp->m_sb.sb_blocksize, NULL); + + /* + * Search through freemap for first-fit on new name length. + * (may need to figure in size of entry struct too) + */ + tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { + if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) { + sum += INT_GET(map->size, ARCH_CONVERT); + continue; + } + if (!map->size) + continue; /* no space in this map */ + tmp = entsize; + if (INT_GET(map->base, ARCH_CONVERT) + < INT_GET(hdr->firstused, ARCH_CONVERT)) + tmp += sizeof(xfs_attr_leaf_entry_t); + if (INT_GET(map->size, ARCH_CONVERT) >= tmp) { + tmp = xfs_attr_leaf_add_work(bp, args, i); + return(tmp); + } + sum += INT_GET(map->size, ARCH_CONVERT); + } + + /* + * If there are no holes in the address space of the block, + * and we don't have enough freespace, then compaction will do us + * no good and we should just give up. + */ + if (!hdr->holes && (sum < entsize)) + return(XFS_ERROR(ENOSPC)); + + /* + * Compact the entries to coalesce free space. + * This may change the hdr->count via dropping INCOMPLETE entries. + */ + xfs_attr_leaf_compact(args->trans, bp); + + /* + * After compaction, the block is guaranteed to have only one + * free region, in freemap[0]. If it is not big enough, give up. + */ + if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) + < (entsize + sizeof(xfs_attr_leaf_entry_t))) + return(XFS_ERROR(ENOSPC)); + + return(xfs_attr_leaf_add_work(bp, args, 0)); +} + +/* + * Add a name to a leaf attribute list structure. + */ +STATIC int +xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_attr_leaf_map_t *map; + xfs_mount_t *mp; + int tmp, i; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + hdr = &leaf->hdr; + ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); + ASSERT((args->index >= 0) + && (args->index <= INT_GET(hdr->count, ARCH_CONVERT))); + + /* + * Force open some space in the entry array and fill it in. + */ + entry = &leaf->entries[args->index]; + if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) { + tmp = INT_GET(hdr->count, ARCH_CONVERT) - args->index; + tmp *= sizeof(xfs_attr_leaf_entry_t); + memmove((char *)(entry+1), (char *)entry, tmp); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + } + INT_MOD(hdr->count, ARCH_CONVERT, 1); + + /* + * Allocate space for the new string (at the end of the run). + */ + map = &hdr->freemap[mapindex]; + mp = args->trans->t_mountp; + ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp)); + ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0); + ASSERT(INT_GET(map->size, ARCH_CONVERT) + >= xfs_attr_leaf_newentsize(args, + mp->m_sb.sb_blocksize, NULL)); + ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp)); + ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0); + INT_MOD(map->size, ARCH_CONVERT, + -xfs_attr_leaf_newentsize(args, mp->m_sb.sb_blocksize, &tmp)); + INT_SET(entry->nameidx, ARCH_CONVERT, + INT_GET(map->base, ARCH_CONVERT) + + INT_GET(map->size, ARCH_CONVERT)); + INT_SET(entry->hashval, ARCH_CONVERT, args->hashval); + entry->flags = tmp ? XFS_ATTR_LOCAL : 0; + entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : + ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); + if (args->rename) { + entry->flags |= XFS_ATTR_INCOMPLETE; + if ((args->blkno2 == args->blkno) && + (args->index2 <= args->index)) { + args->index2++; + } + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT) + >= INT_GET((entry-1)->hashval, + ARCH_CONVERT))); + ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) || + (INT_GET(entry->hashval, ARCH_CONVERT) + <= (INT_GET((entry+1)->hashval, ARCH_CONVERT)))); + + /* + * Copy the attribute name and value into the new space. + * + * For "remote" attribute values, simply note that we need to + * allocate space for the "remote" value. We can't actually + * allocate the extents in this transaction, and we can't decide + * which blocks they should be as we might allocate more blocks + * as part of this transaction (a split operation for example). + */ + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + name_loc->namelen = args->namelen; + INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen); + memcpy((char *)name_loc->nameval, args->name, args->namelen); + memcpy((char *)&name_loc->nameval[args->namelen], args->value, + INT_GET(name_loc->valuelen, ARCH_CONVERT)); + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt->namelen = args->namelen; + memcpy((char *)name_rmt->name, args->name, args->namelen); + entry->flags |= XFS_ATTR_INCOMPLETE; + /* just in case */ + name_rmt->valuelen = 0; + name_rmt->valueblk = 0; + args->rmtblkno = 1; + args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), + xfs_attr_leaf_entsize(leaf, args->index))); + + /* + * Update the control info for this leaf node + */ + if (INT_GET(entry->nameidx, ARCH_CONVERT) + < INT_GET(hdr->firstused, ARCH_CONVERT)) { + /* both on-disk, don't endian-flip twice */ + hdr->firstused = entry->nameidx; + } + ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) + >= ((INT_GET(hdr->count, ARCH_CONVERT) + * sizeof(*entry))+sizeof(*hdr))); + tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[0]; + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + if (INT_GET(map->base, ARCH_CONVERT) == tmp) { + INT_MOD(map->base, ARCH_CONVERT, + sizeof(xfs_attr_leaf_entry_t)); + INT_MOD(map->size, ARCH_CONVERT, + -sizeof(xfs_attr_leaf_entry_t)); + } + } + INT_MOD(hdr->usedbytes, ARCH_CONVERT, + xfs_attr_leaf_entsize(leaf, args->index)); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + return(0); +} + +/* + * Garbage collect a leaf attribute list block by copying it to a new buffer. + */ +STATIC void +xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +{ + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + xfs_mount_t *mp; + char *tmpbuffer; + + mp = trans->t_mountp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); + memset(bp->data, 0, XFS_LBSIZE(mp)); + + /* + * Copy basic information + */ + leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_d = bp->data; + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + hdr_d->info = hdr_s->info; /* struct copy */ + INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp)); + /* handle truncation gracefully */ + if (!hdr_d->firstused) { + INT_SET(hdr_d->firstused, ARCH_CONVERT, + XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); + } + hdr_d->usedbytes = 0; + hdr_d->count = 0; + hdr_d->holes = 0; + INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, + sizeof(xfs_attr_leaf_hdr_t)); + INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, + INT_GET(hdr_d->firstused, ARCH_CONVERT) + - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT)); + + /* + * Copy all entry's in the same (sorted) order, + * but allocate name/value pairs packed and in sequence. + */ + xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, + (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp); + + xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + + kmem_free(tmpbuffer, XFS_LBSIZE(mp)); +} + +/* + * Redistribute the attribute list entries between two leaf nodes, + * taking into account the size of the new entry. + * + * NOTE: if new block is empty, then it will get the upper half of the + * old block. At present, all (one) callers pass in an empty second block. + * + * This code adjusts the args->index/blkno and args->index2/blkno2 fields + * to match what it is doing in splitting the attribute leaf block. Those + * values are used in "atomic rename" operations on attributes. Note that + * the "new" and "old" values can end up in different blocks. + */ +STATIC void +xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_args_t *args; + xfs_da_state_blk_t *tmp_blk; + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_hdr_t *hdr1, *hdr2; + int count, totallen, max, space, swap; + + /* + * Set up environment. + */ + ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + args = state->args; + + /* + * Check ordering of blocks, reverse if it makes things simpler. + * + * NOTE: Given that all (current) callers pass in an empty + * second block, this code should never set "swap". + */ + swap = 0; + if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + tmp_blk = blk1; + blk1 = blk2; + blk2 = tmp_blk; + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + swap = 1; + } + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. Then get + * the direction to copy and the number of elements to move. + * + * "inleaf" is true if the new entry should be inserted into blk1. + * If "swap" is also true, then reverse the sense of "inleaf". + */ + state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, + &count, &totallen); + if (swap) + state->inleaf = !state->inleaf; + + /* + * Move any entries required from leaf to leaf: + */ + if (count < INT_GET(hdr1->count, ARCH_CONVERT)) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count = INT_GET(hdr1->count, ARCH_CONVERT) - count; + space = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf2 is the destination, compact it if it looks tight. + */ + max = INT_GET(hdr2->firstused, ARCH_CONVERT) + - sizeof(xfs_attr_leaf_hdr_t); + max -= INT_GET(hdr2->count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t); + if (space > max) { + xfs_attr_leaf_compact(args->trans, blk2->bp); + } + + /* + * Move high entries from leaf1 to low end of leaf2. + */ + xfs_attr_leaf_moveents(leaf1, + INT_GET(hdr1->count, ARCH_CONVERT)-count, + leaf2, 0, count, state->mp); + + xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) { + /* + * I assert that since all callers pass in an empty + * second buffer, this code should never execute. + */ + + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count -= INT_GET(hdr1->count, ARCH_CONVERT); + space = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT); + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf1 is the destination, compact it if it looks tight. + */ + max = INT_GET(hdr1->firstused, ARCH_CONVERT) + - sizeof(xfs_attr_leaf_hdr_t); + max -= INT_GET(hdr1->count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t); + if (space > max) { + xfs_attr_leaf_compact(args->trans, blk1->bp); + } + + /* + * Move low entries from leaf2 to high end of leaf1. + */ + xfs_attr_leaf_moveents(leaf2, 0, leaf1, + (int)INT_GET(hdr1->count, ARCH_CONVERT), count, + state->mp); + + xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + } + + /* + * Copy out last hashval in each block for B-tree code. + */ + blk1->hashval = + INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT); + blk2->hashval = + INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT); + + /* + * Adjust the expected index for insertion. + * NOTE: this code depends on the (current) situation that the + * second block was originally empty. + * + * If the insertion point moved to the 2nd block, we must adjust + * the index. We must also track the entry just following the + * new entry for use in an "atomic rename" operation, that entry + * is always the "old" entry and the "new" entry is what we are + * inserting. The index/blkno fields refer to the "old" entry, + * while the index2/blkno2 fields refer to the "new" entry. + */ + if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) { + ASSERT(state->inleaf == 0); + blk2->index = blk1->index + - INT_GET(leaf1->hdr.count, ARCH_CONVERT); + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) { + if (state->inleaf) { + args->index = blk1->index; + args->blkno = blk1->blkno; + args->index2 = 0; + args->blkno2 = blk2->blkno; + } else { + blk2->index = blk1->index + - INT_GET(leaf1->hdr.count, ARCH_CONVERT); + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } + } else { + ASSERT(state->inleaf == 1); + args->index = args->index2 = blk1->index; + args->blkno = args->blkno2 = blk1->blkno; + } +} + +/* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + * GROT: Is this really necessary? With other than a 512 byte blocksize, + * GROT: there will always be enough room in either block for a new entry. + * GROT: Do a double-split for this case? + */ +STATIC int +xfs_attr_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2, + int *countarg, int *usedbytesarg) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_hdr_t *hdr1, *hdr2; + xfs_attr_leaf_entry_t *entry; + int count, max, index, totallen, half; + int lastdelta, foundit, tmp; + + /* + * Set up environment. + */ + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + foundit = 0; + totallen = 0; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + */ + max = INT_GET(hdr1->count, ARCH_CONVERT) + + INT_GET(hdr2->count, ARCH_CONVERT); + half = (max+1) * sizeof(*entry); + half += INT_GET(hdr1->usedbytes, ARCH_CONVERT) + + INT_GET(hdr2->usedbytes, ARCH_CONVERT) + + xfs_attr_leaf_newentsize(state->args, + state->blocksize, NULL); + half /= 2; + lastdelta = state->blocksize; + entry = &leaf1->entries[0]; + for (count = index = 0; count < max; entry++, index++, count++) { + +#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) + /* + * The new entry is in the first block, account for it. + */ + if (count == blk1->index) { + tmp = totallen + sizeof(*entry) + + xfs_attr_leaf_newentsize(state->args, + state->blocksize, + NULL); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; + foundit = 1; + } + + /* + * Wrap around into the second block if necessary. + */ + if (count == INT_GET(hdr1->count, ARCH_CONVERT)) { + leaf1 = leaf2; + entry = &leaf1->entries[0]; + index = 0; + } + + /* + * Figure out if next leaf entry would be too much. + */ + tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1, + index); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; +#undef XFS_ATTR_ABS + } + + /* + * Calculate the number of usedbytes that will end up in lower block. + * If new entry not in lower block, fix up the count. + */ + totallen -= count * sizeof(*entry); + if (foundit) { + totallen -= sizeof(*entry) + + xfs_attr_leaf_newentsize(state->args, + state->blocksize, + NULL); + } + + *countarg = count; + *usedbytesarg = totallen; + return(foundit); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + * + * GROT: allow for INCOMPLETE entries in calculation. + */ +int +xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_attr_leafblock_t *leaf; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, bytes, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC); + leaf = (xfs_attr_leafblock_t *)info; + count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + bytes = sizeof(xfs_attr_leaf_hdr_t) + + count * sizeof(xfs_attr_leaf_entry_t) + + INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT); + if (bytes > (state->blocksize >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (aribtrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = info->forw; + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink an attribute list over time. + */ + /* start with smaller blk num */ + forward = (INT_GET(info->forw, ARCH_CONVERT) + < INT_GET(info->back, ARCH_CONVERT)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = INT_GET(info->forw, ARCH_CONVERT); + else + blkno = INT_GET(info->back, ARCH_CONVERT); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + leaf = (xfs_attr_leafblock_t *)info; + count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + bytes = state->blocksize - (state->blocksize>>2); + bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT); + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + count += INT_GET(leaf->hdr.count, ARCH_CONVERT); + bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT); + bytes -= count * sizeof(xfs_attr_leaf_entry_t); + bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_da_brelse(state->args->trans, bp); + if (bytes >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 1; + } + return(0); +} + +/* + * Remove a name from the leaf attribute list structure. + * + * Return 1 if leaf is less than 37% full, 0 if >= 37% full. + * If two leaves are 37% full, when combined they will leave 25% free. + */ +int +xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_hdr_t *hdr; + xfs_attr_leaf_map_t *map; + xfs_attr_leaf_entry_t *entry; + int before, after, smallest, entsize; + int tablesize, tmp, i; + xfs_mount_t *mp; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + hdr = &leaf->hdr; + mp = args->trans->t_mountp; + ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) + && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8))); + ASSERT((args->index >= 0) + && (args->index < INT_GET(hdr->count, ARCH_CONVERT))); + ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) + >= ((INT_GET(hdr->count, ARCH_CONVERT) + * sizeof(*entry))+sizeof(*hdr))); + entry = &leaf->entries[args->index]; + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) + >= INT_GET(hdr->firstused, ARCH_CONVERT)); + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp)); + + /* + * Scan through free region table: + * check for adjacency of free'd entry with an existing one, + * find smallest free region in case we need to replace it, + * adjust any map that borders the entry table, + */ + tablesize = INT_GET(hdr->count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + map = &hdr->freemap[0]; + tmp = INT_GET(map->size, ARCH_CONVERT); + before = after = -1; + smallest = XFS_ATTR_LEAF_MAPSIZE - 1; + entsize = xfs_attr_leaf_entsize(leaf, args->index); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { + ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp)); + ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp)); + if (INT_GET(map->base, ARCH_CONVERT) == tablesize) { + INT_MOD(map->base, ARCH_CONVERT, + -sizeof(xfs_attr_leaf_entry_t)); + INT_MOD(map->size, ARCH_CONVERT, + sizeof(xfs_attr_leaf_entry_t)); + } + + if ((INT_GET(map->base, ARCH_CONVERT) + + INT_GET(map->size, ARCH_CONVERT)) + == INT_GET(entry->nameidx, ARCH_CONVERT)) { + before = i; + } else if (INT_GET(map->base, ARCH_CONVERT) + == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) { + after = i; + } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) { + tmp = INT_GET(map->size, ARCH_CONVERT); + smallest = i; + } + } + + /* + * Coalesce adjacent freemap regions, + * or replace the smallest region. + */ + if ((before >= 0) || (after >= 0)) { + if ((before >= 0) && (after >= 0)) { + map = &hdr->freemap[before]; + INT_MOD(map->size, ARCH_CONVERT, entsize); + INT_MOD(map->size, ARCH_CONVERT, + INT_GET(hdr->freemap[after].size, + ARCH_CONVERT)); + hdr->freemap[after].base = 0; + hdr->freemap[after].size = 0; + } else if (before >= 0) { + map = &hdr->freemap[before]; + INT_MOD(map->size, ARCH_CONVERT, entsize); + } else { + map = &hdr->freemap[after]; + /* both on-disk, don't endian flip twice */ + map->base = entry->nameidx; + INT_MOD(map->size, ARCH_CONVERT, entsize); + } + } else { + /* + * Replace smallest region (if it is smaller than free'd entry) + */ + map = &hdr->freemap[smallest]; + if (INT_GET(map->size, ARCH_CONVERT) < entsize) { + INT_SET(map->base, ARCH_CONVERT, + INT_GET(entry->nameidx, ARCH_CONVERT)); + INT_SET(map->size, ARCH_CONVERT, entsize); + } + } + + /* + * Did we remove the first entry? + */ + if (INT_GET(entry->nameidx, ARCH_CONVERT) + == INT_GET(hdr->firstused, ARCH_CONVERT)) + smallest = 1; + else + smallest = 0; + + /* + * Compress the remaining entries and zero out the removed stuff. + */ + memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize); + INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), + entsize)); + + tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index) + * sizeof(xfs_attr_leaf_entry_t); + memmove((char *)entry, (char *)(entry+1), tmp); + INT_MOD(hdr->count, ARCH_CONVERT, -1); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)]; + memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + + /* + * If we removed the first entry, re-find the first used byte + * in the name area. Note that if the entry was the "firstused", + * then we don't have a "hole" in our block resulting from + * removing the name. + */ + if (smallest) { + tmp = XFS_LBSIZE(mp); + entry = &leaf->entries[0]; + for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; + i >= 0; entry++, i--) { + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) + >= INT_GET(hdr->firstused, ARCH_CONVERT)); + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) + < XFS_LBSIZE(mp)); + if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp) + tmp = INT_GET(entry->nameidx, ARCH_CONVERT); + } + INT_SET(hdr->firstused, ARCH_CONVERT, tmp); + if (!hdr->firstused) { + INT_SET(hdr->firstused, ARCH_CONVERT, + tmp - XFS_ATTR_LEAF_NAME_ALIGN); + } + } else { + hdr->holes = 1; /* mark as needing compaction */ + } + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + + /* + * Check if leaf is less than 50% full, caller may want to + * "join" the leaf with a sibling if so. + */ + tmp = sizeof(xfs_attr_leaf_hdr_t); + tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t); + tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT); + return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ +} + +/* + * Move all the attribute list entries from drop_leaf into save_leaf. + */ +void +xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; + xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; + xfs_mount_t *mp; + char *tmpbuffer; + + /* + * Set up environment. + */ + mp = state->mp; + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + drop_hdr = &drop_leaf->hdr; + save_hdr = &save_leaf->hdr; + + /* + * Save last hashval from dying block for later Btree fixup. + */ + drop_blk->hashval = + INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count, + ARCH_CONVERT)-1].hashval, + ARCH_CONVERT); + + /* + * Check if we need a temp buffer, or can we do it in place. + * Note that we don't check "leaf" for holes because we will + * always be dropping it, toosmall() decided that for us already. + */ + if (save_hdr->holes == 0) { + /* + * dest leaf has no holes, so we add there. May need + * to make some room in the entry array. + */ + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp); + } else { + xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, + INT_GET(save_hdr->count, ARCH_CONVERT), + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), + mp); + } + } else { + /* + * Destination has holes, so we make a temporary copy + * of the leaf and add them both to that. + */ + tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memset(tmpbuffer, 0, state->blocksize); + tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; + tmp_hdr = &tmp_leaf->hdr; + tmp_hdr->info = save_hdr->info; /* struct copy */ + tmp_hdr->count = 0; + INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize); + if (!tmp_hdr->firstused) { + INT_SET(tmp_hdr->firstused, ARCH_CONVERT, + state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); + } + tmp_hdr->usedbytes = 0; + if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), + mp); + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, + INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT), + (int)INT_GET(save_hdr->count, ARCH_CONVERT), + mp); + } else { + xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, + (int)INT_GET(save_hdr->count, ARCH_CONVERT), + mp); + xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, + INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT), + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), + mp); + } + memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); + kmem_free(tmpbuffer, state->blocksize); + } + + xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + state->blocksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + save_blk->hashval = + INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count, + ARCH_CONVERT)-1].hashval, + ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Look up a name in a leaf attribute list structure. + * This is the internal routine, it uses the caller's buffer. + * + * Note that duplicate keys are allowed, but only check within the + * current leaf node. The Btree code must check in adjacent leaf nodes. + * + * Return in args->index the index into the entry[] array of either + * the found entry, or where the entry should have been (insert before + * that entry). + * + * Don't change the args->value unless we find the attribute. + */ +int +xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int probe, span; + xfs_dahash_t hashval; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + + /* + * Binary search. (note: small blocks will skip this loop) + */ + hashval = args->hashval; + probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2; + for (entry = &leaf->entries[probe]; span > 4; + entry = &leaf->entries[probe]) { + span /= 2; + if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval) + probe += span; + else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && + (!leaf->hdr.count + || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)))); + ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) + == hashval)); + + /* + * Since we may have duplicate hashval's, find the first matching + * hashval in the leaf. + */ + while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) + >= hashval)) { + entry--; + probe--; + } + while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) { + entry++; + probe++; + } + if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) + || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) { + args->index = probe; + return(XFS_ERROR(ENOATTR)); + } + + /* + * Duplicate keys may be present, so search all of them for a match. + */ + for ( ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval); + entry++, probe++) { +/* + * GROT: Add code to remove incomplete entries. + */ + /* + * If we are looking for INCOMPLETE entries, show only those. + * If we are looking for complete entries, show only those. + */ + if ((args->flags & XFS_ATTR_INCOMPLETE) != + (entry->flags & XFS_ATTR_INCOMPLETE)) { + continue; + } + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); + if (name_loc->namelen != args->namelen) + continue; + if (memcmp(args->name, (char *)name_loc->nameval, + args->namelen) != 0) + continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((entry->flags & XFS_ATTR_SECURE) != 0)) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((entry->flags & XFS_ATTR_ROOT) != 0)) + continue; + args->index = probe; + return(XFS_ERROR(EEXIST)); + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe); + if (name_rmt->namelen != args->namelen) + continue; + if (memcmp(args->name, (char *)name_rmt->name, + args->namelen) != 0) + continue; + if (((args->flags & ATTR_SECURE) != 0) != + ((entry->flags & XFS_ATTR_SECURE) != 0)) + continue; + if (((args->flags & ATTR_ROOT) != 0) != + ((entry->flags & XFS_ATTR_ROOT) != 0)) + continue; + args->index = probe; + args->rmtblkno + = INT_GET(name_rmt->valueblk, ARCH_CONVERT); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, + INT_GET(name_rmt->valuelen, + ARCH_CONVERT)); + return(XFS_ERROR(EEXIST)); + } + } + args->index = probe; + return(XFS_ERROR(ENOATTR)); +} + +/* + * Get the value associated with an attribute name from a leaf attribute + * list structure. + */ +int +xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +{ + int valuelen; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) + < (XFS_LBSIZE(args->dp->i_mount)/8)); + ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT))); + + entry = &leaf->entries[args->index]; + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + ASSERT(name_loc->namelen == args->namelen); + ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); + valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT); + args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT); + args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return(0); + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return(XFS_ERROR(ERANGE)); + } + args->valuelen = valuelen; + } + return(0); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. + */ +/*ARGSUSED*/ +STATIC void +xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, + xfs_attr_leafblock_t *leaf_d, int start_d, + int count, xfs_mount_t *mp) +{ + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + xfs_attr_leaf_entry_t *entry_s, *entry_d; + int desti, tmp, i; + + /* + * Check for nothing to do. + */ + if (count == 0) + return; + + /* + * Set up environment. + */ + ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) + && (INT_GET(hdr_s->count, ARCH_CONVERT) + < (XFS_LBSIZE(mp)/8))); + ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >= + ((INT_GET(hdr_s->count, ARCH_CONVERT) + * sizeof(*entry_s))+sizeof(*hdr_s))); + ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)); + ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= + ((INT_GET(hdr_d->count, ARCH_CONVERT) + * sizeof(*entry_d))+sizeof(*hdr_d))); + + ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT)); + ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT)); + ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT)); + + /* + * Move the entries in the destination leaf up to make a hole? + */ + if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) { + tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_d->entries[start_d]; + entry_d = &leaf_d->entries[start_d + count]; + memmove((char *)entry_d, (char *)entry_s, tmp); + } + + /* + * Copy all entry's in the same (sorted) order, + * but allocate attribute info packed and in sequence. + */ + entry_s = &leaf_s->entries[start_s]; + entry_d = &leaf_d->entries[start_d]; + desti = start_d; + for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { + ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + >= INT_GET(hdr_s->firstused, ARCH_CONVERT)); + tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); +#ifdef GROT + /* + * Code to drop INCOMPLETE entries. Difficult to use as we + * may also need to change the insertion index. Code turned + * off for 6.2, should be revisited later. + */ + if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ + memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); + INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp); + INT_MOD(hdr_s->count, ARCH_CONVERT, -1); + entry_d--; /* to compensate for ++ in loop hdr */ + desti--; + if ((start_s + i) < offset) + result++; /* insertion index adjustment */ + } else { +#endif /* GROT */ + INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp); + /* both on-disk, don't endian flip twice */ + entry_d->hashval = entry_s->hashval; + /* both on-disk, don't endian flip twice */ + entry_d->nameidx = hdr_d->firstused; + entry_d->flags = entry_s->flags; + ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp + <= XFS_LBSIZE(mp)); + memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti), + XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp); + ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp + <= XFS_LBSIZE(mp)); + memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); + INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp); + INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp); + INT_MOD(hdr_s->count, ARCH_CONVERT, -1); + INT_MOD(hdr_d->count, ARCH_CONVERT, 1); + tmp = INT_GET(hdr_d->count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t) + + sizeof(xfs_attr_leaf_hdr_t); + ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp); +#ifdef GROT + } +#endif /* GROT */ + } + + /* + * Zero out the entries we just copied. + */ + if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) { + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[start_s]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + XFS_LBSIZE(mp))); + memset((char *)entry_s, 0, tmp); + } else { + /* + * Move the remaining entries down to fill the hole, + * then zero the entries at the top. + */ + tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count; + tmp *= sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[start_s + count]; + entry_d = &leaf_s->entries[start_s]; + memmove((char *)entry_d, (char *)entry_s, tmp); + + tmp = count * sizeof(xfs_attr_leaf_entry_t); + entry_s = &leaf_s->entries[INT_GET(hdr_s->count, + ARCH_CONVERT)]; + ASSERT(((char *)entry_s + tmp) <= + ((char *)leaf_s + XFS_LBSIZE(mp))); + memset((char *)entry_s, 0, tmp); + } + + /* + * Fill in the freemap information + */ + INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, + sizeof(xfs_attr_leaf_hdr_t)); + INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, + INT_GET(hdr_d->count, ARCH_CONVERT) + * sizeof(xfs_attr_leaf_entry_t)); + INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, + INT_GET(hdr_d->firstused, ARCH_CONVERT) + - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT)); + hdr_d->freemap[1].base = 0; + hdr_d->freemap[2].base = 0; + hdr_d->freemap[1].size = 0; + hdr_d->freemap[2].size = 0; + hdr_s->holes = 1; /* leaf may not be compact */ +} + +/* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +int +xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC) && + (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC)); + if ( (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) + && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) + && ( (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) < + INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) + || (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT) < + INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from a leaf block. + */ +xfs_dahash_t +xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_attr_leafblock_t *leaf; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + if (count) + *count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + if (!leaf->hdr.count) + return(0); + return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count, + ARCH_CONVERT)-1].hashval, ARCH_CONVERT)); +} + +/* + * Calculate the number of bytes used to store the indicated attribute + * (whether local or remote only calculate bytes in this block). + */ +int +xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) +{ + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int size; + + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index); + size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen, + INT_GET(name_loc->valuelen, + ARCH_CONVERT)); + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index); + size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen); + } + return(size); +} + +/* + * Calculate the number of bytes that would be required to store the new + * attribute (whether local or remote only calculate bytes in this block). + * This routine decides as a side effect whether the attribute will be + * a "local" or a "remote" attribute. + */ +int +xfs_attr_leaf_newentsize(xfs_da_args_t *args, int blocksize, int *local) +{ + int size; + + size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(args->namelen, args->valuelen); + if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) { + if (local) { + *local = 1; + } + } else { + size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(args->namelen); + if (local) { + *local = 0; + } + } + return(size); +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. + */ +int +xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int retval, i; + + ASSERT(bp != NULL); + leaf = bp->data; + cursor = context->cursor; + cursor->initted = 1; + + xfs_attr_trace_l_cl("blk start", context, leaf); + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &leaf->entries[0]; + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); + entry++, i++) { + if (INT_GET(entry->hashval, ARCH_CONVERT) + == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (INT_GET(entry->hashval, ARCH_CONVERT) + > cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) { + xfs_attr_trace_l_c("not found", context); + return(0); + } + } else { + entry = &leaf->entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for ( ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + && (retval == 0); entry++, i++) { + attrnames_t *namesp; + + if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) { + cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + if (((context->flags & ATTR_SECURE) != 0) != + ((entry->flags & XFS_ATTR_SECURE) != 0) && + !(context->flags & ATTR_KERNORMALS)) + continue; /* skip non-matching entries */ + if (((context->flags & ATTR_ROOT) != 0) != + ((entry->flags & XFS_ATTR_ROOT) != 0) && + !(context->flags & ATTR_KERNROOTLS)) + continue; /* skip non-matching entries */ + + namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure : + ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted : + &attr_user); + + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + if (context->flags & ATTR_KERNOVAL) { + ASSERT(context->flags & ATTR_KERNAMELS); + context->count += namesp->attr_namelen + + (int)name_loc->namelen + 1; + } else { + retval = xfs_attr_put_listent(context, namesp, + (char *)name_loc->nameval, + (int)name_loc->namelen, + (int)INT_GET(name_loc->valuelen, + ARCH_CONVERT)); + } + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); + if (context->flags & ATTR_KERNOVAL) { + ASSERT(context->flags & ATTR_KERNAMELS); + context->count += namesp->attr_namelen + + (int)name_rmt->namelen + 1; + } else { + retval = xfs_attr_put_listent(context, namesp, + (char *)name_rmt->name, + (int)name_rmt->namelen, + (int)INT_GET(name_rmt->valuelen, + ARCH_CONVERT)); + } + } + if (retval == 0) { + cursor->offset++; + } + } + xfs_attr_trace_l_cl("blk end", context, leaf); + return(retval); +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +/*ARGSUSED*/ +int +xfs_attr_put_listent(xfs_attr_list_context_t *context, + attrnames_t *namesp, char *name, int namelen, int valuelen) +{ + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + if (context->flags & ATTR_KERNAMELS) { + char *offset; + + ASSERT(context->count >= 0); + + arraytop = context->count + namesp->attr_namelen + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return(1); + } + offset = (char *)context->alist + context->count; + strncpy(offset, namesp->attr_name, namesp->attr_namelen); + offset += namesp->attr_namelen; + strncpy(offset, name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += namesp->attr_namelen + namelen + 1; + return(0); + } + + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*context->alist)); + ASSERT(context->firstu <= context->bufsize); + + arraytop = sizeof(*context->alist) + + context->count * sizeof(context->alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + xfs_attr_trace_l_c("buffer full", context); + context->alist->al_more = 1; + return(1); + } + + aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[ namelen ] = 0; + context->alist->al_offset[ context->count++ ] = context->firstu; + context->alist->al_count = context->count; + xfs_attr_trace_l_c("add", context); + return(0); +} + +/*======================================================================== + * Manage the INCOMPLETE flag in a leaf entry + *========================================================================*/ + +/* + * Clear the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr_leaf_clearflag(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp; + int error; +#ifdef DEBUG + xfs_attr_leaf_name_local_t *name_loc; + int namelen; + char *name; +#endif /* DEBUG */ + + /* + * Set up the operation. + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp != NULL); + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT)); + ASSERT(args->index >= 0); + entry = &leaf->entries[ args->index ]; + ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); + +#ifdef DEBUG + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + namelen = name_loc->namelen; + name = (char *)name_loc->nameval; + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + namelen = name_rmt->namelen; + name = (char *)name_rmt->name; + } + ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval); + ASSERT(namelen == args->namelen); + ASSERT(memcmp(name, args->name, namelen) == 0); +#endif /* DEBUG */ + + entry->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + + if (args->rmtblkno) { + ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno); + INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp); + + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_attr_rolltrans(&args->trans, args->dp); + + return(error); +} + +/* + * Set the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr_leaf_setflag(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp; + int error; + + /* + * Set up the operation. + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp != NULL); + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT)); + ASSERT(args->index >= 0); + entry = &leaf->entries[ args->index ]; + + ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); + entry->flags |= XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + if ((entry->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp); + + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_attr_rolltrans(&args->trans, args->dp); + + return(error); +} + +/* + * In a single transaction, clear the INCOMPLETE flag on the leaf entry + * given by args->blkno/index and set the INCOMPLETE flag on the leaf + * entry given by args->blkno2/index2. + * + * Note that they could be in different blocks, or in the same block. + */ +int +xfs_attr_leaf_flipflags(xfs_da_args_t *args) +{ + xfs_attr_leafblock_t *leaf1, *leaf2; + xfs_attr_leaf_entry_t *entry1, *entry2; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_dabuf_t *bp1, *bp2; + int error; +#ifdef DEBUG + xfs_attr_leaf_name_local_t *name_loc; + int namelen1, namelen2; + char *name1, *name2; +#endif /* DEBUG */ + + /* + * Read the block containing the "old" attr + */ + error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, + XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp1 != NULL); + + /* + * Read the block containing the "new" attr, if it is different + */ + if (args->blkno2 != args->blkno) { + error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, + -1, &bp2, XFS_ATTR_FORK); + if (error) { + return(error); + } + ASSERT(bp2 != NULL); + } else { + bp2 = bp1; + } + + leaf1 = bp1->data; + ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT)); + ASSERT(args->index >= 0); + entry1 = &leaf1->entries[ args->index ]; + + leaf2 = bp2->data; + ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT)); + ASSERT(args->index2 >= 0); + entry2 = &leaf2->entries[ args->index2 ]; + +#ifdef DEBUG + if (entry1->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index); + namelen1 = name_loc->namelen; + name1 = (char *)name_loc->nameval; + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index); + namelen1 = name_rmt->namelen; + name1 = (char *)name_rmt->name; + } + if (entry2->flags & XFS_ATTR_LOCAL) { + name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2); + namelen2 = name_loc->namelen; + name2 = (char *)name_loc->nameval; + } else { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2); + namelen2 = name_rmt->namelen; + name2 = (char *)name_rmt->name; + } + ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT)); + ASSERT(namelen1 == namelen2); + ASSERT(memcmp(name1, name2, namelen1) == 0); +#endif /* DEBUG */ + + ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE); + ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); + + entry1->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); + if (args->rmtblkno) { + ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index); + INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno); + INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen); + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); + } + + entry2->flags |= XFS_ATTR_INCOMPLETE; + xfs_da_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); + if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_da_log_buf(args->trans, bp2, + XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); + } + xfs_da_buf_done(bp1); + if (bp1 != bp2) + xfs_da_buf_done(bp2); + + /* + * Commit the flag value change and start the next trans in series. + */ + error = xfs_attr_rolltrans(&args->trans, args->dp); + + return(error); +} + +/*======================================================================== + * Indiscriminately delete the entire attribute fork + *========================================================================*/ + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) +{ + xfs_da_blkinfo_t *info; + xfs_daddr_t blkno; + xfs_dabuf_t *bp; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return(error); + blkno = xfs_da_blkno(bp); + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->data; + if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) { + error = xfs_attr_node_inactive(trans, dp, bp, 1); + } else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) { + error = xfs_attr_leaf_inactive(trans, dp, bp); + } else { + error = XFS_ERROR(EIO); + xfs_da_brelse(*trans, bp); + } + if (error) + return(error); + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return(error); + xfs_da_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_attr_rolltrans(trans, dp); + + return (error); +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, count, i; + xfs_dabuf_t *child_bp; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_da_brelse(*trans, bp); /* no locks for later trans */ + return(XFS_ERROR(EIO)); + } + + node = bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) + == XFS_DA_NODE_MAGIC); + parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ + count = INT_GET(node->hdr.count, ARCH_CONVERT); + if (!count) { + xfs_da_brelse(*trans, bp); + return(0); + } + child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT); + xfs_da_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (child_bp) { + /* save for re-read later */ + child_blkno = xfs_da_blkno(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->data; + if (INT_GET(info->magic, ARCH_CONVERT) + == XFS_DA_NODE_MAGIC) { + error = xfs_attr_node_inactive(trans, dp, + child_bp, level+1); + } else if (INT_GET(info->magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC) { + error = xfs_attr_leaf_inactive(trans, dp, + child_bp); + } else { + error = XFS_ERROR(EIO); + xfs_da_brelse(*trans, child_bp); + } + if (error) + return(error); + + /* + * Remove the subsidiary block from the cache + * and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return(error); + xfs_da_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if ((i+1) < count) { + error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return(error); + child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT); + xfs_da_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + if ((error = xfs_attr_rolltrans(trans, dp))) + return (error); + } + + return(0); +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +int +xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) +{ + xfs_attr_leafblock_t *leaf; + xfs_attr_leaf_entry_t *entry; + xfs_attr_leaf_name_remote_t *name_rmt; + xfs_attr_inactive_list_t *list, *lp; + int error, count, size, tmp, i; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) + == XFS_ATTR_LEAF_MAGIC); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = &leaf->entries[0]; + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) { + if ( INT_GET(entry->nameidx, ARCH_CONVERT) + && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_da_brelse(*trans, bp); + return(0); + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = &leaf->entries[0]; + for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) { + if ( INT_GET(entry->nameidx, ARCH_CONVERT) + && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); + if (name_rmt->valueblk) { + /* both on-disk, don't endian flip twice */ + lp->valueblk = name_rmt->valueblk; + INT_SET(lp->valuelen, ARCH_CONVERT, + XFS_B_TO_FSB(dp->i_mount, + INT_GET(name_rmt->valuelen, + ARCH_CONVERT))); + lp++; + } + } + } + xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr_leaf_freextent(trans, dp, + INT_GET(lp->valueblk, + ARCH_CONVERT), + INT_GET(lp->valuelen, + ARCH_CONVERT)); + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free((xfs_caddr_t)list, size); + return(error); +} + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +int +xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dablk_t blkno, int blkcnt) +{ + xfs_bmbt_irec_t map; + xfs_dablk_t tblkno; + int tblkcnt, dblkcnt, nmap, error; + xfs_daddr_t dblkno; + xfs_buf_t *bp; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + NULL, 0, &map, &nmap, NULL); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, XFS_BUF_LOCK); + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + if ((error = xfs_attr_rolltrans(trans, dp))) + return (error); + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return(0); +} + + +/* + * Roll from one trans in the sequence of PERMANENT transactions to the next. + */ +int +xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp) +{ + xfs_trans_t *trans; + unsigned int logres, count; + int error; + + /* + * Ensure that the inode is always logged. + */ + trans = *transp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + + /* + * Copy the critical parameters from one trans to the next. + */ + logres = trans->t_log_res; + count = trans->t_log_count; + *transp = xfs_trans_dup(trans); + + /* + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. + */ + if ((error = xfs_trans_commit(trans, 0, NULL))) + return (error); + + trans = *transp; + + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + error = xfs_trans_reserve(trans, 0, logres, 0, + XFS_TRANS_PERM_LOG_RES, count); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (!error) { + xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); + xfs_trans_ihold(trans, dp); + } + return (error); + +} diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h new file mode 100644 index 00000000000..b1480e0b334 --- /dev/null +++ b/fs/xfs/xfs_attr_leaf.h @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2000, 2002-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ATTR_LEAF_H__ +#define __XFS_ATTR_LEAF_H__ + +/* + * Attribute storage layout, internal structure, access macros, etc. + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + */ + +struct attrlist; +struct attrlist_cursor_kern; +struct attrnames; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/*======================================================================== + * Attribute structure when equal to XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This is the structure of the leaf nodes in the Btree. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store a "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leafblock { + struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __uint16_t count; /* count of active leaf_entry's */ + __uint16_t usedbytes; /* num bytes of names/values stored */ + __uint16_t firstused; /* first used byte in name area */ + __uint8_t holes; /* != 0 if blk needs compaction */ + __uint8_t pad1; + struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __uint16_t base; /* base of free region */ + __uint16_t size; /* length of free region */ + } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */ + } hdr; + struct xfs_attr_leaf_entry { /* sorted on key, not name */ + xfs_dahash_t hashval; /* hash value of name */ + __uint16_t nameidx; /* index into buffer of name/value */ + __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __uint8_t pad2; /* unused pad byte */ + } entries[1]; /* variable sized array */ + struct xfs_attr_leaf_name_local { + __uint16_t valuelen; /* number of bytes in value */ + __uint8_t namelen; /* length of name bytes */ + __uint8_t nameval[1]; /* name/value bytes */ + } namelist; /* grows from bottom of buf */ + struct xfs_attr_leaf_name_remote { + xfs_dablk_t valueblk; /* block number of value bytes */ + __uint32_t valuelen; /* number of bytes in value */ + __uint8_t namelen; /* length of name bytes */ + __uint8_t name[1]; /* name bytes */ + } valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; +typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t; +typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t; +typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t; +typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t; +typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t; + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. + */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE) +xfs_attr_leaf_name_remote_t * +xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx); +#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \ + xfs_attr_leaf_name_remote(leafp,idx) +#else +#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) /* remote name struct ptr */ \ + ((xfs_attr_leaf_name_remote_t *) \ + &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ]) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL) +xfs_attr_leaf_name_local_t * +xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx); +#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \ + xfs_attr_leaf_name_local(leafp,idx) +#else +#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) /* local name struct ptr */ \ + ((xfs_attr_leaf_name_local_t *) \ + &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ]) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME) +char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx); +#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx) +#else +#define XFS_ATTR_LEAF_NAME(leafp,idx) /* generic name struct ptr */ \ + (&((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ]) +#endif + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE) +int xfs_attr_leaf_entsize_remote(int nlen); +#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \ + xfs_attr_leaf_entsize_remote(nlen) +#else +#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) /* space for remote struct */ \ + (((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL) +int xfs_attr_leaf_entsize_local(int nlen, int vlen); +#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \ + xfs_attr_leaf_entsize_local(nlen,vlen) +#else +#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) /* space for local struct */ \ + (((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX) +int xfs_attr_leaf_entsize_local_max(int bsize); +#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \ + xfs_attr_leaf_entsize_local_max(bsize) +#else +#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) /* max local struct size */ \ + (((bsize) >> 1) + ((bsize) >> 2)) +#endif + + +/*======================================================================== + * Structure used to pass context around among the routines. + *========================================================================*/ + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor;/* position in list */ + struct attrlist *alist; /* output buffer */ + int count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize;/* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch;/* T/F: resynch with cursor */ +} xfs_attr_list_context_t; + +/* + * Used to keep a list of "remote value" extents when unlinking an inode. + */ +typedef struct xfs_attr_inactive_list { + xfs_dablk_t valueblk; /* block number of value bytes */ + int valuelen; /* number of bytes in value */ +} xfs_attr_inactive_list_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when dirsize < XFS_LITINO(mp). + */ +int xfs_attr_shortform_create(struct xfs_da_args *args); +int xfs_attr_shortform_add(struct xfs_da_args *add); +int xfs_attr_shortform_lookup(struct xfs_da_args *args); +int xfs_attr_shortform_getvalue(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_remove(struct xfs_da_args *remove); +int xfs_attr_shortform_list(struct xfs_attr_list_context *context); +int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); + +/* + * Internal routines when dirsize == XFS_LBSIZE(mp). + */ +int xfs_attr_leaf_to_node(struct xfs_da_args *args); +int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, + struct xfs_da_args *args); +int xfs_attr_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr_leaf_setflag(struct xfs_da_args *args); +int xfs_attr_leaf_flipflags(xfs_da_args_t *args); + +/* + * Routines used for growing the Btree. + */ +int xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, + struct xfs_dabuf **bpp); +int xfs_attr_leaf_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, + struct xfs_da_args *args); +int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); +int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, + struct xfs_attr_list_context *context); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr_leaf_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); +int xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp, + struct xfs_dabuf *bp, int level); +int xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp, + struct xfs_dabuf *bp); +int xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp, + xfs_dablk_t blkno, int blkcnt); + +/* + * Utility routines. + */ +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize, + int *local); +int xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index); +int xfs_attr_put_listent(struct xfs_attr_list_context *context, + struct attrnames *, char *name, int namelen, + int valuelen); +int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp); + +#endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h new file mode 100644 index 00000000000..ef7d2942d30 --- /dev/null +++ b/fs/xfs/xfs_attr_sf.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ATTR_SF_H__ +#define __XFS_ATTR_SF_H__ + +/* + * Attribute storage when stored inside the inode. + * + * Small attribute lists are packed as tightly as possible so as + * to fit into the literal area of the inode. + */ + +struct xfs_inode; + +/* + * Entries are packed toward the top as tight as possible. + */ +typedef struct xfs_attr_shortform { + struct xfs_attr_sf_hdr { /* constant-structure header block */ + __uint16_t totsize; /* total bytes in shortform list */ + __uint8_t count; /* count of active entries */ + } hdr; + struct xfs_attr_sf_entry { + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t valuelen; /* actual length of value (no NULL) */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + __uint8_t nameval[1]; /* name & value bytes concatenated */ + } list[1]; /* variable sized array */ +} xfs_attr_shortform_t; +typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; +typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t; + +/* + * We generate this then sort it, attr_list() must return things in hash-order. + */ +typedef struct xfs_attr_sf_sort { + __uint8_t entno; /* entry number in original list */ + __uint8_t namelen; /* length of name value (no null) */ + __uint8_t valuelen; /* length of value */ + __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ + xfs_dahash_t hash; /* this entry's hash value */ + char *name; /* name value, pointer into buffer */ +} xfs_attr_sf_sort_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME) +int xfs_attr_sf_entsize_byname(int nlen, int vlen); +#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) \ + xfs_attr_sf_entsize_byname(nlen,vlen) +#else +#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ + ((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)) +#endif +#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \ + ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1) +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE) +int xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep); +#define XFS_ATTR_SF_ENTSIZE(sfep) xfs_attr_sf_entsize(sfep) +#else +#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \ + ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_NEXTENTRY) +xfs_attr_sf_entry_t *xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep); +#define XFS_ATTR_SF_NEXTENTRY(sfep) xfs_attr_sf_nextentry(sfep) +#else +#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ + ((xfs_attr_sf_entry_t *) \ + ((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_TOTSIZE) +int xfs_attr_sf_totsize(struct xfs_inode *dp); +#define XFS_ATTR_SF_TOTSIZE(dp) xfs_attr_sf_totsize(dp) +#else +#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \ + (INT_GET(((xfs_attr_shortform_t *)((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT)) +#endif + +#if defined(XFS_ATTR_TRACE) +/* + * Kernel tracing support for attribute lists + */ +struct xfs_attr_list_context; +struct xfs_da_intnode; +struct xfs_da_node_entry; +struct xfs_attr_leafblock; + +#define XFS_ATTR_TRACE_SIZE 4096 /* size of global trace buffer */ +extern ktrace_t *xfs_attr_trace_buf; + +/* + * Trace record types. + */ +#define XFS_ATTR_KTRACE_L_C 1 /* context */ +#define XFS_ATTR_KTRACE_L_CN 2 /* context, node */ +#define XFS_ATTR_KTRACE_L_CB 3 /* context, btree */ +#define XFS_ATTR_KTRACE_L_CL 4 /* context, leaf */ + +void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context); +void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, + struct xfs_da_intnode *node); +void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, + struct xfs_da_node_entry *btree); +void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, + struct xfs_attr_leafblock *leaf); +void xfs_attr_trace_enter(int type, char *where, + __psunsigned_t a2, __psunsigned_t a3, + __psunsigned_t a4, __psunsigned_t a5, + __psunsigned_t a6, __psunsigned_t a7, + __psunsigned_t a8, __psunsigned_t a9, + __psunsigned_t a10, __psunsigned_t a11, + __psunsigned_t a12, __psunsigned_t a13, + __psunsigned_t a14, __psunsigned_t a15); +#else +#define xfs_attr_trace_l_c(w,c) +#define xfs_attr_trace_l_cn(w,c,n) +#define xfs_attr_trace_l_cb(w,c,b) +#define xfs_attr_trace_l_cl(w,c,l) +#endif /* XFS_ATTR_TRACE */ + +#endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c new file mode 100644 index 00000000000..16088e175ec --- /dev/null +++ b/fs/xfs/xfs_behavior.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + */ +#include "xfs.h" + +/* + * Source file used to associate/disassociate behaviors with virtualized + * objects. See xfs_behavior.h for more information about behaviors, etc. + * + * The implementation is split between functions in this file and macros + * in xfs_behavior.h. + */ + +/* + * Insert a new behavior descriptor into a behavior chain. + * + * The behavior chain is ordered based on the 'position' number which + * lives in the first field of the ops vector (higher numbers first). + * + * Attemps to insert duplicate ops result in an EINVAL return code. + * Otherwise, return 0 to indicate success. + */ +int +bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp) +{ + bhv_desc_t *curdesc, *prev; + int position; + + /* + * Validate the position value of the new behavior. + */ + position = BHV_POSITION(bdp); + ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP); + + /* + * Find location to insert behavior. Check for duplicates. + */ + prev = NULL; + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + /* Check for duplication. */ + if (curdesc->bd_ops == bdp->bd_ops) { + ASSERT(0); + return EINVAL; + } + + /* Find correct position */ + if (position >= BHV_POSITION(curdesc)) { + ASSERT(position != BHV_POSITION(curdesc)); + break; /* found it */ + } + + prev = curdesc; + } + + if (prev == NULL) { + /* insert at front of chain */ + bdp->bd_next = bhp->bh_first; + bhp->bh_first = bdp; + } else { + /* insert after prev */ + bdp->bd_next = prev->bd_next; + prev->bd_next = bdp; + } + + return 0; +} + +/* + * Remove a behavior descriptor from a position in a behavior chain; + * the postition is guaranteed not to be the first position. + * Should only be called by the bhv_remove() macro. + */ +void +bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) +{ + bhv_desc_t *curdesc, *prev; + + ASSERT(bhp->bh_first != NULL); + ASSERT(bhp->bh_first->bd_next != NULL); + + prev = bhp->bh_first; + for (curdesc = bhp->bh_first->bd_next; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc == bdp) + break; /* found it */ + prev = curdesc; + } + + ASSERT(curdesc == bdp); + prev->bd_next = bdp->bd_next; /* remove from after prev */ +} + +/* + * Look for a specific ops vector on the specified behavior chain. + * Return the associated behavior descriptor. Or NULL, if not found. + */ +bhv_desc_t * +bhv_lookup(bhv_head_t *bhp, void *ops) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc->bd_ops == ops) + return curdesc; + } + + return NULL; +} + +/* + * Looks for the first behavior within a specified range of positions. + * Return the associated behavior descriptor. Or NULL, if none found. + */ +bhv_desc_t * +bhv_lookup_range(bhv_head_t *bhp, int low, int high) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + int position = BHV_POSITION(curdesc); + + if (position <= high) { + if (position >= low) + return curdesc; + return NULL; + } + } + + return NULL; +} + +/* + * Return the base behavior in the chain, or NULL if the chain + * is empty. + * + * The caller has not read locked the behavior chain, so acquire the + * lock before traversing the chain. + */ +bhv_desc_t * +bhv_base(bhv_head_t *bhp) +{ + bhv_desc_t *curdesc; + + for (curdesc = bhp->bh_first; + curdesc != NULL; + curdesc = curdesc->bd_next) { + + if (curdesc->bd_next == NULL) { + return curdesc; + } + } + + return NULL; +} + +void +bhv_head_init( + bhv_head_t *bhp, + char *name) +{ + bhp->bh_first = NULL; +} + +void +bhv_insert_initial( + bhv_head_t *bhp, + bhv_desc_t *bdp) +{ + ASSERT(bhp->bh_first == NULL); + (bhp)->bh_first = bdp; +} + +void +bhv_head_destroy( + bhv_head_t *bhp) +{ + ASSERT(bhp->bh_first == NULL); +} diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h new file mode 100644 index 00000000000..d5ed5a84392 --- /dev/null +++ b/fs/xfs/xfs_behavior.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BEHAVIOR_H__ +#define __XFS_BEHAVIOR_H__ + +/* + * Header file used to associate behaviors with virtualized objects. + * + * A virtualized object is an internal, virtualized representation of + * OS entities such as persistent files, processes, or sockets. Examples + * of virtualized objects include vnodes, vprocs, and vsockets. Often + * a virtualized object is referred to simply as an "object." + * + * A behavior is essentially an implementation layer associated with + * an object. Multiple behaviors for an object are chained together, + * the order of chaining determining the order of invocation. Each + * behavior of a given object implements the same set of interfaces + * (e.g., the VOP interfaces). + * + * Behaviors may be dynamically inserted into an object's behavior chain, + * such that the addition is transparent to consumers that already have + * references to the object. Typically, a given behavior will be inserted + * at a particular location in the behavior chain. Insertion of new + * behaviors is synchronized with operations-in-progress (oip's) so that + * the oip's always see a consistent view of the chain. + * + * The term "interpostion" is used to refer to the act of inserting + * a behavior such that it interposes on (i.e., is inserted in front + * of) a particular other behavior. A key example of this is when a + * system implementing distributed single system image wishes to + * interpose a distribution layer (providing distributed coherency) + * in front of an object that is otherwise only accessed locally. + * + * Note that the traditional vnode/inode combination is simply a virtualized + * object that has exactly one associated behavior. + * + * Behavior synchronization is logic which is necessary under certain + * circumstances that there is no conflict between ongoing operations + * traversing the behavior chain and those dunamically modifying the + * behavior chain. Because behavior synchronization adds extra overhead + * to virtual operation invocation, we want to restrict, as much as + * we can, the requirement for this extra code, to those situations + * in which it is truly necessary. + * + * Behavior synchronization is needed whenever there's at least one class + * of object in the system for which: + * 1) multiple behaviors for a given object are supported, + * -- AND -- + * 2a) insertion of a new behavior can happen dynamically at any time during + * the life of an active object, + * -- AND -- + * 3a) insertion of a new behavior needs to synchronize with existing + * ops-in-progress. + * -- OR -- + * 3b) multiple different behaviors can be dynamically inserted at + * any time during the life of an active object + * -- OR -- + * 3c) removal of a behavior can occur at any time during the life of + * an active object. + * -- OR -- + * 2b) removal of a behavior can occur at any time during the life of an + * active object + * + */ + +struct bhv_head_lock; + +/* + * Behavior head. Head of the chain of behaviors. + * Contained within each virtualized object data structure. + */ +typedef struct bhv_head { + struct bhv_desc *bh_first; /* first behavior in chain */ + struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */ +} bhv_head_t; + +/* + * Behavior descriptor. Descriptor associated with each behavior. + * Contained within the behavior's private data structure. + */ +typedef struct bhv_desc { + void *bd_pdata; /* private data for this behavior */ + void *bd_vobj; /* virtual object associated with */ + void *bd_ops; /* ops for this behavior */ + struct bhv_desc *bd_next; /* next behavior in chain */ +} bhv_desc_t; + +/* + * Behavior identity field. A behavior's identity determines the position + * where it lives within a behavior chain, and it's always the first field + * of the behavior's ops vector. The optional id field further identifies the + * subsystem responsible for the behavior. + */ +typedef struct bhv_identity { + __u16 bi_id; /* owning subsystem id */ + __u16 bi_position; /* position in chain */ +} bhv_identity_t; + +typedef bhv_identity_t bhv_position_t; + +#define BHV_IDENTITY_INIT(id,pos) {id, pos} +#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos) + +/* + * Define boundaries of position values. + */ +#define BHV_POSITION_INVALID 0 /* invalid position number */ +#define BHV_POSITION_BASE 1 /* base (last) implementation layer */ +#define BHV_POSITION_TOP 63 /* top (first) implementation layer */ + +/* + * Plumbing macros. + */ +#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first) +#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next) +#define BHV_NEXTNULL(bdp) ((bdp)->bd_next) +#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj) +#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj) +#define BHV_PDATA(bdp) (bdp)->bd_pdata +#define BHV_OPS(bdp) (bdp)->bd_ops +#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops) +#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position) + +extern void bhv_head_init(bhv_head_t *, char *); +extern void bhv_head_destroy(bhv_head_t *); +extern int bhv_insert(bhv_head_t *, bhv_desc_t *); +extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); + +/* + * Initialize a new behavior descriptor. + * Arguments: + * bdp - pointer to behavior descriptor + * pdata - pointer to behavior's private data + * vobj - pointer to associated virtual object + * ops - pointer to ops for this behavior + */ +#define bhv_desc_init(bdp, pdata, vobj, ops) \ + { \ + (bdp)->bd_pdata = pdata; \ + (bdp)->bd_vobj = vobj; \ + (bdp)->bd_ops = ops; \ + (bdp)->bd_next = NULL; \ + } + +/* + * Remove a behavior descriptor from a behavior chain. + */ +#define bhv_remove(bhp, bdp) \ + { \ + if ((bhp)->bh_first == (bdp)) { \ + /* \ + * Remove from front of chain. \ + * Atomic wrt oip's. \ + */ \ + (bhp)->bh_first = (bdp)->bd_next; \ + } else { \ + /* remove from non-front of chain */ \ + bhv_remove_not_first(bhp, bdp); \ + } \ + (bdp)->bd_vobj = NULL; \ + } + +/* + * Behavior module prototypes. + */ +extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); +extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); +extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); +extern bhv_desc_t * bhv_base(bhv_head_t *bhp); + +/* No bhv locking on Linux */ +#define bhv_lookup_unlocked bhv_lookup +#define bhv_base_unlocked bhv_base + +#endif /* __XFS_BEHAVIOR_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c new file mode 100644 index 00000000000..a20a6c3dc13 --- /dev/null +++ b/fs/xfs/xfs_bit.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * XFS bit manipulation routines, used in non-realtime code. + */ + +#include "xfs.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + + +#ifndef HAVE_ARCH_HIGHBIT +/* + * Index of high bit number in byte, -1 for none set, 0..7 otherwise. + */ +const char xfs_highbit[256] = { + -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ + 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ + 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */ + 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */ + 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */ + 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */ + 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */ + 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */ + 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */ + 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */ + 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */ + 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */ + 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */ + 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */ + 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */ + 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */ + 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */ + 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */ +}; +#endif + +/* + * Count of bits set in byte, 0..8. + */ +static const char xfs_countbit[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */ + 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */ + 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */ + 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */ + 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */ + 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */ + 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */ + 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */ + 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */ + 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */ + 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */ + 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */ + 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */ + 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */ + 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */ + 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */ + 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */ + 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */ + 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */ + 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */ + 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */ + 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */ + 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */ + 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */ + 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */ +}; + +/* + * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. + */ +inline int +xfs_highbit32( + __uint32_t v) +{ +#ifdef HAVE_ARCH_HIGHBIT + return highbit32(v); +#else + int i; + + if (v & 0xffff0000) + if (v & 0xff000000) + i = 24; + else + i = 16; + else if (v & 0x0000ffff) + if (v & 0x0000ff00) + i = 8; + else + i = 0; + else + return -1; + return i + xfs_highbit[(v >> i) & 0xff]; +#endif +} + +/* + * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set. + */ +int +xfs_lowbit64( + __uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; + + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w && (n = ffs(w))) + n += 32; + } + return n - 1; +} + +/* + * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set. + */ +int +xfs_highbit64( + __uint64_t v) +{ + __uint32_t h = (__uint32_t)(v >> 32); + + if (h) + return xfs_highbit32(h) + 32; + return xfs_highbit32((__uint32_t)v); +} + + +/* + * Count the number of bits set in the bitmap starting with bit + * start_bit. Size is the size of the bitmap in words. + * + * Do the counting by mapping a byte value to the number of set + * bits for that value using the xfs_countbit array, i.e. + * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1, + * xfs_countbit[3] == 2, etc. + */ +int +xfs_count_bits(uint *map, uint size, uint start_bit) +{ + register int bits; + register unsigned char *bytep; + register unsigned char *end_map; + int byte_bit; + + bits = 0; + end_map = (char*)(map + size); + bytep = (char*)(map + (start_bit & ~0x7)); + byte_bit = start_bit & 0x7; + + /* + * If the caller fell off the end of the map, return 0. + */ + if (bytep >= end_map) { + return (0); + } + + /* + * If start_bit is not byte aligned, then process the + * first byte separately. + */ + if (byte_bit != 0) { + /* + * Shift off the bits we don't want to look at, + * before indexing into xfs_countbit. + */ + bits += xfs_countbit[(*bytep >> byte_bit)]; + bytep++; + } + + /* + * Count the bits in each byte until the end of the bitmap. + */ + while (bytep < end_map) { + bits += xfs_countbit[*bytep]; + bytep++; + } + + return (bits); +} + +/* + * Count the number of contiguous bits set in the bitmap starting with bit + * start_bit. Size is the size of the bitmap in words. + */ +int +xfs_contig_bits(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = 0; + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + ASSERT(start_bit < size); + size -= start_bit & ~(NBWORD - 1); + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to one first offset bits prior to start */ + tmp |= (~0U >> (NBWORD-start_bit)); + if (tmp != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != ~0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return result - start_bit; +found: + return result + ffz(tmp) - start_bit; +} + +/* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + * + * Size is the number of words, not bytes, in the bitmap. + */ +int xfs_next_bit(uint *map, uint size, uint start_bit) +{ + uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT); + uint result = start_bit & ~(NBWORD - 1); + uint tmp; + + size <<= BIT_TO_WORD_SHIFT; + + if (start_bit >= size) + return -1; + size -= result; + start_bit &= (NBWORD - 1); + if (start_bit) { + tmp = *p++; + /* set to zero first offset bits prior to start */ + tmp &= (~0U << start_bit); + if (tmp != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + while (size) { + if ((tmp = *p++) != 0U) + goto found; + result += NBWORD; + size -= NBWORD; + } + return -1; +found: + return result + ffs(tmp) - 1; +} diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h new file mode 100644 index 00000000000..1e7f57ddf7a --- /dev/null +++ b/fs/xfs/xfs_bit.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BIT_H__ +#define __XFS_BIT_H__ + +/* + * XFS bit manipulation routines. + */ + +/* + * masks with n high/low bits set, 32-bit values & 64-bit values + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32HI) +__uint32_t xfs_mask32hi(int n); +#define XFS_MASK32HI(n) xfs_mask32hi(n) +#else +#define XFS_MASK32HI(n) ((__uint32_t)-1 << (32 - (n))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64HI) +__uint64_t xfs_mask64hi(int n); +#define XFS_MASK64HI(n) xfs_mask64hi(n) +#else +#define XFS_MASK64HI(n) ((__uint64_t)-1 << (64 - (n))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32LO) +__uint32_t xfs_mask32lo(int n); +#define XFS_MASK32LO(n) xfs_mask32lo(n) +#else +#define XFS_MASK32LO(n) (((__uint32_t)1 << (n)) - 1) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64LO) +__uint64_t xfs_mask64lo(int n); +#define XFS_MASK64LO(n) xfs_mask64lo(n) +#else +#define XFS_MASK64LO(n) (((__uint64_t)1 << (n)) - 1) +#endif + +/* Get high bit set out of 32-bit argument, -1 if none set */ +extern int xfs_highbit32(__uint32_t v); + +/* Get low bit set out of 64-bit argument, -1 if none set */ +extern int xfs_lowbit64(__uint64_t v); + +/* Get high bit set out of 64-bit argument, -1 if none set */ +extern int xfs_highbit64(__uint64_t); + +/* Count set bits in map starting with start_bit */ +extern int xfs_count_bits(uint *map, uint size, uint start_bit); + +/* Count continuous one bits in map starting with start_bit */ +extern int xfs_contig_bits(uint *map, uint size, uint start_bit); + +/* Find next set bit in map */ +extern int xfs_next_bit(uint *map, uint size, uint start_bit); + +#endif /* __XFS_BIT_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c new file mode 100644 index 00000000000..de316241866 --- /dev/null +++ b/fs/xfs/xfs_bmap.c @@ -0,0 +1,6246 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_bit.h" +#include "xfs_rw.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_buf_item.h" + + +#ifdef DEBUG +STATIC void +xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); +#endif + +kmem_zone_t *xfs_bmap_free_item_zone; + +/* + * Prototypes for internal bmap routines. + */ + + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags); /* inode logging flags */ + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after allocating space (or doing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp,/* inode logging flags */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp); /* inode logging flags */ + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int /* error */ +xfs_bmap_alloc( + xfs_bmalloca_t *ap); /* bmap alloc argument struct */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the extent list is already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +#ifdef DEBUG +/* + * Check that the extents list for the inode ip is in the right order. + */ +STATIC void +xfs_bmap_check_extents( + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork); /* data or attr fork */ +#endif + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current trans pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp,/* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd); /* OK to allocate reserved blocks */ + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free); /* list item to be freed */ + +/* + * Remove count entries from the extents array for inode "ip", starting + * at index "idx". Copies the remaining items down over the deleted ones, + * and gives back the excess memory. + */ +STATIC void +xfs_bmap_delete_exlist( + xfs_inode_t *ip, /* incode inode pointer */ + xfs_extnum_t idx, /* starting delete index */ + xfs_extnum_t count, /* count of items to delete */ + int whichfork); /* data or attr fork */ + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Insert new item(s) in the extent list for inode "ip". + * Count new items are inserted at offset idx. + */ +STATIC void +xfs_bmap_insert_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int whichfork); /* data or attr fork */ + +/* + * Convert a local file to an extents file. + * This code is sort of bogus, since the file data needs to get + * logged so it won't be lost. The bmap-level manipulations are ok, though. + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork); /* data or attr fork */ + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int whichfork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ + +#ifdef XFS_BMAP_TRACE +/* + * Add a bmap trace buffer entry. Base routine for the others. + */ +STATIC void +xfs_bmap_trace_addentry( + int opcode, /* operation */ + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(ies) */ + xfs_extnum_t cnt, /* count of entries, 1 or 2 */ + xfs_bmbt_rec_t *r1, /* first record */ + xfs_bmbt_rec_t *r2, /* second record or null */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist. + */ +STATIC void +xfs_bmap_trace_delete( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(entries) deleted */ + xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or + * reading in the extents list from the disk (in the btree). + */ +STATIC void +xfs_bmap_trace_insert( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(entries) inserted */ + xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ + xfs_bmbt_irec_t *r1, /* inserted record 1 */ + xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry after updating an extent list entry in place. + */ +STATIC void +xfs_bmap_trace_post_update( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry updated */ + int whichfork); /* data or attr fork */ + +/* + * Add bmap trace entry prior to updating an extent list entry in place. + */ +STATIC void +xfs_bmap_trace_pre_update( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry to be updated */ + int whichfork); /* data or attr fork */ + +#else +#define xfs_bmap_trace_delete(f,d,ip,i,c,w) +#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w) +#define xfs_bmap_trace_post_update(f,d,ip,i,w) +#define xfs_bmap_trace_pre_update(f,d,ip,i,w) +#endif /* XFS_BMAP_TRACE */ + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len); /* delayed extent length */ + +#ifdef DEBUG +/* + * Perform various validation checks on the values being returned + * from xfs_bmapi(). + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap); +#else +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +#if defined(XFS_RW_TRACE) +STATIC void +xfs_bunmap_trace( + xfs_inode_t *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + inst_t *ra); +#else +#define xfs_bunmap_trace(ip, bno, len, flags, ra) +#endif /* XFS_RW_TRACE */ + +STATIC int +xfs_bmap_count_tree( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_fsblock_t blockno, + int levelin, + int *count); + +STATIC int +xfs_bmap_count_leaves( + xfs_bmbt_rec_t *frp, + int numrecs, + int *count); + +/* + * Bmap internal routines. + */ + +/* + * Called from xfs_bmap_add_attrfork to handle btree format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int error; /* error return value */ + xfs_mount_t *mp; /* file system mount struct */ + int stat; /* newroot status */ + + mp = ip->i_mount; + if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + *flags |= XFS_ILOG_DBROOT; + else { + cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, + XFS_DATA_FORK); + cur->bc_private.b.flist = flist; + cur->bc_private.b.firstblock = *firstblock; + if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + goto error0; + ASSERT(stat == 1); /* must be at least one entry */ + if ((error = xfs_bmbt_newroot(cur, flags, &stat))) + goto error0; + if (stat == 0) { + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return XFS_ERROR(ENOSPC); + } + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + } + return 0; +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + int error; /* error return value */ + + if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + return 0; + cur = NULL; + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + flags, XFS_DATA_FORK); + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_da_args_t dargs; /* args for dir/attr code */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount structure pointer */ + + if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + return 0; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + mp = ip->i_mount; + memset(&dargs, 0, sizeof(dargs)); + dargs.dp = ip; + dargs.firstblock = firstblock; + dargs.flist = flist; + dargs.total = mp->m_dirblkfsbs; + dargs.whichfork = XFS_DATA_FORK; + dargs.trans = tp; + error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs); + } else + error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK); + return error; +} + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after allocating space (or doing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_add_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to use reserved data blocks */ +{ + xfs_btree_cur_t *cur; /* btree cursor or null */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent"; +#endif + xfs_ifork_t *ifp; /* inode fork ptr */ + int logflags; /* returned value */ + xfs_extnum_t nextents; /* number of extents in file now */ + + XFS_STATS_INC(xs_add_exlist); + cur = *curp; + ifp = XFS_IFORK_PTR(ip, whichfork); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(idx <= nextents); + da_old = da_new = 0; + error = 0; + /* + * This is the first extent added to a new/empty file. + * Special case this one, so other routines get to assume there are + * already extents in the list. + */ + if (nextents == 0) { + xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new, + NULL, whichfork); + xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork); + ASSERT(cur == NULL); + ifp->if_lastex = 0; + if (!ISNULLSTARTBLOCK(new->br_startblock)) { + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + } else + logflags = 0; + } + /* + * Any kind of new delayed allocation goes here. + */ + else if (ISNULLSTARTBLOCK(new->br_startblock)) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new, + &logflags, rsvd))) + goto done; + } + /* + * Real allocation off the end of the file. + */ + else if (idx == nextents) { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, + &logflags, whichfork))) + goto done; + } else { + xfs_bmbt_irec_t prev; /* old extent at offset idx */ + + /* + * Get the record referred to by idx. + */ + xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev); + /* + * If it's a real allocation record, and the new allocation ends + * after the start of the referred to record, then we're filling + * in a delayed or unwritten allocation with a real one, or + * converting real back to unwritten. + */ + if (!ISNULLSTARTBLOCK(new->br_startblock) && + new->br_startoff + new->br_blockcount > prev.br_startoff) { + if (prev.br_state != XFS_EXT_UNWRITTEN && + ISNULLSTARTBLOCK(prev.br_startblock)) { + da_old = STARTBLOCKVAL(prev.br_startblock); + if (cur) + ASSERT(cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL); + if ((error = xfs_bmap_add_extent_delay_real(ip, + idx, &cur, new, &da_new, first, flist, + &logflags, rsvd))) + goto done; + } else if (new->br_state == XFS_EXT_NORM) { + ASSERT(new->br_state == XFS_EXT_NORM); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) + goto done; + } else { + ASSERT(new->br_state == XFS_EXT_UNWRITTEN); + if ((error = xfs_bmap_add_extent_unwritten_real( + ip, idx, &cur, new, &logflags))) + goto done; + } + ASSERT(*curp == cur || *curp == NULL); + } + /* + * Otherwise we're filling in a hole with an allocation. + */ + else { + if (cur) + ASSERT((cur->bc_private.b.flags & + XFS_BTCUR_BPRV_WASDEL) == 0); + if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, + new, &logflags, whichfork))) + goto done; + } + } + + ASSERT(*curp == cur || *curp == NULL); + /* + * Convert to a btree if necessary. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first, + flist, &cur, da_old > 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto done; + } + /* + * Adjust for changes in reserved delayed indirect blocks. + * Nothing to do for disk quotas here. + */ + if (da_old || da_new) { + xfs_filblks_t nblks; + + nblks = da_new; + if (cur) + nblks += cur->bc_private.b.allocated; + ASSERT(nblks <= da_old); + if (nblks < da_old) + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, + (int)(da_old - nblks), rsvd); + } + /* + * Clear out the allocated field, done with it now in any case. + */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } +done: +#ifdef DEBUG + if (!error) + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); +#endif + *logflagsp = logflags; + return error; +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a delayed + * allocation to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to use reserved data block allocation */ +{ + xfs_bmbt_rec_t *base; /* base of extent entry list */ + xfs_btree_cur_t *cur; /* btree cursor */ + int diff; /* temp value */ + xfs_bmbt_rec_t *ep; /* extent entry for idx */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_delay_real"; +#endif + int i; /* temp state */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + xfs_filblks_t temp; /* value for dnew calculations */ + xfs_filblks_t temp2; /* value for dnew calculations */ + int tmp_rval; /* partial logging flags */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_FILLING, RIGHT_FILLING, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) +#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE \ + (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) + + /* + * Set up a bunch of variables to make the tests simpler. + */ + cur = *curp; + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_get_all(ep, &PREV); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* + * Set flags determining what part of the previous delayed allocation + * extent is being replaced by a real allocation. + */ + STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); + STATE_SET(RIGHT_FILLING, + PREV.br_startoff + PREV.br_blockcount == new_endoff); + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &LEFT); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + } + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { + xfs_bmbt_get_all(ep + 1, &RIGHT); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + } + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == + RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != + MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)); + error = 0; + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (SWITCH_STATE) { + + case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + /* + * Filling in all of a previously delayed allocation extent. + * The left and right neighbors are both contiguous with new. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + /* + * Filling in all of a previously delayed allocation extent. + * The left neighbor is contiguous, the right is not. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount, LEFT.br_state))) + goto done; + } + *dnew = 0; + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + /* + * Filling in all of a previously delayed allocation extent. + * The right neighbor is contiguous, the left is not. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK); + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + new->br_startblock, + PREV.br_blockcount + + RIGHT.br_blockcount, PREV.br_state))) + goto done; + } + *dnew = 0; + break; + + case MASK2(LEFT_FILLING, RIGHT_FILLING): + /* + * Filling in all of a previously delayed allocation extent. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + *dnew = 0; + break; + + case MASK2(LEFT_FILLING, LEFT_CONTIG): + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is contiguous. + */ + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + ip->i_df.if_lastex = idx - 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + new->br_blockcount, + LEFT.br_state))) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + *dnew = temp; + break; + + case MASK(LEFT_FILLING): + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is not contiguous. + */ + xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_startoff(ep, new_endoff); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + base = ip->i_df.if_u1.if_extents; + ep = &base[idx + 1]; + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1, + XFS_DATA_FORK); + *dnew = temp; + break; + + case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is contiguous with the new allocation. + */ + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + RIGHT.br_state); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + RIGHT.br_blockcount, + RIGHT.br_state))) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + *dnew = temp; + break; + + case MASK(RIGHT_FILLING): + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is not contiguous. + */ + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, + new, NULL, XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + STARTBLOCKVAL(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); + *dnew = temp; + break; + + case 0: + /* + * Filling in the middle part of a previous delayed allocation. + * Contiguity is impossible here. + * This case is avoided almost all the time. + */ + temp = new->br_startoff - PREV.br_startoff; + xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, temp); + r[0] = *new; + r[1].br_startoff = new_endoff; + temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_blockcount = temp2; + xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_nextents > ip->i_df.if_ext_max) { + error = xfs_bmap_extents_to_btree(ip->i_transp, ip, + first, flist, &cur, 1, &tmp_rval, + XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = xfs_bmap_worst_indlen(ip, temp); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - + (cur ? cur->bc_private.b.allocated : 0)); + if (diff > 0 && + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) { + /* + * Ick gross gag me with a spoon. + */ + ASSERT(0); /* want to see if this ever happens! */ + while (diff > 0) { + if (temp) { + temp--; + diff--; + if (!diff || + !xfs_mod_incore_sb(ip->i_mount, + XFS_SBS_FDBLOCKS, -diff, rsvd)) + break; + } + if (temp2) { + temp2--; + diff--; + if (!diff || + !xfs_mod_incore_sb(ip->i_mount, + XFS_SBS_FDBLOCKS, -diff, rsvd)) + break; + } + } + } + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2)); + xfs_bmap_trace_post_update(fname, "0", ip, idx + 2, + XFS_DATA_FORK); + *dnew = temp + temp2; + break; + + case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK2(LEFT_FILLING, RIGHT_CONTIG): + case MASK2(RIGHT_FILLING, LEFT_CONTIG): + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + case MASK(LEFT_CONTIG): + case MASK(RIGHT_CONTIG): + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +#undef MASK +#undef MASK2 +#undef MASK3 +#undef MASK4 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting an unwritten + * allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp) /* inode logging flags */ +{ + xfs_bmbt_rec_t *base; /* base of extent entry list */ + xfs_btree_cur_t *cur; /* btree cursor */ + xfs_bmbt_rec_t *ep; /* extent entry for idx */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_unwritten_real"; +#endif + int i; /* temp state */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_exntst_t newext; /* new extent state */ + xfs_exntst_t oldext; /* old extent state */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_FILLING, RIGHT_FILLING, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) +#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE \ + (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) + + /* + * Set up a bunch of variables to make the tests simpler. + */ + error = 0; + cur = *curp; + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + xfs_bmbt_get_all(ep, &PREV); + newext = new->br_state; + oldext = (newext == XFS_EXT_UNWRITTEN) ? + XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + ASSERT(PREV.br_state == oldext); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); + STATE_SET(RIGHT_FILLING, + PREV.br_startoff + PREV.br_blockcount == new_endoff); + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &LEFT); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + } + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { + xfs_bmbt_get_all(ep + 1, &RIGHT); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + } + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == + RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != + MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)); + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (SWITCH_STATE) { + + case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + ip->i_d.di_nextents -= 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + PREV.br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount, + LEFT.br_state))) + goto done; + } + break; + + case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmbt_set_state(ep, newext); + xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case MASK2(LEFT_FILLING, RIGHT_FILLING): + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_state(ep, newext); + xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + newext))) + goto done; + } + break; + + case MASK2(LEFT_FILLING, LEFT_CONTIG): + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + goto done; + if (xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + new->br_blockcount, + LEFT.br_state)) + goto done; + } + break; + + case MASK(LEFT_FILLING): + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. + */ + xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); + xfs_bmbt_set_startoff(ep, new_endoff); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK); + xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + cur->bc_rec.b = *new; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + break; + + case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, + XFS_DATA_FORK); + xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, newext); + xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_increment(cur, 0, &i))) + goto done; + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case MASK(RIGHT_FILLING): + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); + xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, + new, NULL, XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + ASSERT(i == 0); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep, + new->br_startoff - PREV.br_startoff); + xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + r[0] = *new; + r[1].br_startoff = new_endoff; + r[1].br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_startblock = new->br_startblock + new->br_blockcount; + r[1].br_state = oldext; + xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK); + ip->i_df.if_lastex = idx + 1; + ip->i_d.di_nextents += 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + /* new right extent - oldext */ + if ((error = xfs_bmbt_update(cur, r[1].br_startoff, + r[1].br_startblock, r[1].br_blockcount, + r[1].br_state))) + goto done; + /* new left extent - oldext */ + PREV.br_blockcount = + new->br_startoff - PREV.br_startoff; + cur->bc_rec.b = PREV; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + if ((error = xfs_bmbt_increment(cur, 0, &i))) + goto done; + ASSERT(i == 1); + /* new middle extent - newext */ + cur->bc_rec.b = *new; + if ((error = xfs_bmbt_insert(cur, &i))) + goto done; + ASSERT(i == 1); + } + break; + + case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + case MASK2(LEFT_FILLING, RIGHT_CONTIG): + case MASK2(RIGHT_FILLING, LEFT_CONTIG): + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + case MASK(LEFT_CONTIG): + case MASK(RIGHT_CONTIG): + /* + * These cases are all impossible. + */ + ASSERT(0); + } + *curp = cur; +done: + *logflagsp = rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +#undef MASK +#undef MASK2 +#undef MASK3 +#undef MASK4 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a delayed allocation. + */ +/*ARGSUSED*/ +STATIC int /* error */ +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp, /* inode logging flags */ + int rsvd) /* OK to allocate reserved blocks */ +{ + xfs_bmbt_rec_t *base; /* base of extent entry list */ + xfs_bmbt_rec_t *ep; /* extent list entry for idx */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_hole_delay"; +#endif + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + xfs_filblks_t temp; /* temp for indirect calculations */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) + + base = ip->i_df.if_u1.if_extents; + ep = &base[idx]; + state = 0; + ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); + /* + * Check and set flags if this segment has a left neighbor + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &left); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + } + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, &right); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + } + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN); + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!STATE_TEST(LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))); + /* + * Switch out based on the contiguity flags. + */ + switch (SWITCH_STATE) { + + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent list entry. + */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, temp); + oldlen = STARTBLOCKVAL(left.br_startblock) + + STARTBLOCKVAL(new->br_startblock) + + STARTBLOCKVAL(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen)); + xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1, + XFS_DATA_FORK); + xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + break; + + case MASK(LEFT_CONTIG): + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, + XFS_DATA_FORK); + xfs_bmbt_set_blockcount(ep - 1, temp); + oldlen = STARTBLOCKVAL(left.br_startblock) + + STARTBLOCKVAL(new->br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen)); + xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, + XFS_DATA_FORK); + ip->i_df.if_lastex = idx - 1; + break; + + case MASK(RIGHT_CONTIG): + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK); + temp = new->br_blockcount + right.br_blockcount; + oldlen = STARTBLOCKVAL(new->br_startblock) + + STARTBLOCKVAL(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_allf(ep, new->br_startoff, + NULLSTARTBLOCK((int)newlen), temp, right.br_state); + xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, + XFS_DATA_FORK); + xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK); + ip->i_df.if_lastex = idx; + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, + (int)(oldlen - newlen), rsvd); + /* + * Nothing to do for disk quota accounting here. + */ + } + *logflagsp = 0; + return 0; +#undef MASK +#undef MASK2 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE +} + +/* + * Called by xfs_bmap_add_extent to handle cases converting a hole + * to a real allocation. + */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* extent number to update/insert */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to put in extent list */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */ + int error; /* error return value */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_add_extent_hole_real"; +#endif + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + enum { /* bit number definitions for state */ + LEFT_CONTIG, RIGHT_CONTIG, + LEFT_DELAY, RIGHT_DELAY, + LEFT_VALID, RIGHT_VALID + }; + +#define MASK(b) (1 << (b)) +#define MASK2(a,b) (MASK(a) | MASK(b)) +#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) +#define STATE_TEST(b) (state & MASK(b)) +#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ + ((state &= ~MASK(b)), 0)) +#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); + ep = &ifp->if_u1.if_extents[idx]; + state = 0; + /* + * Check and set flags if this segment has a left neighbor. + */ + if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + xfs_bmbt_get_all(ep - 1, &left); + STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + } + /* + * Check and set flags if this segment has a current value. + * Not true if we're inserting into the "hole" at eof. + */ + if (STATE_SET_TEST(RIGHT_VALID, + idx < + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, &right); + STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + } + /* + * We're inserting a real allocation between "left" and "right". + * Set the contiguity flags. Don't let extents get too large. + */ + STATE_SET(LEFT_CONTIG, + STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN); + STATE_SET(RIGHT_CONTIG, + STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == + right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!STATE_TEST(LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)); + + /* + * Select which case we're in here, and implement it. + */ + switch (SWITCH_STATE) { + + case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + /* + * New allocation is contiguous with real allocations on the + * left and on the right. + * Merge all three into a single extent list entry. + */ + xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, + whichfork); + xfs_bmbt_set_blockcount(ep - 1, + left.br_blockcount + new->br_blockcount + + right.br_blockcount); + xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, + whichfork); + xfs_bmap_trace_delete(fname, "LC|RC", ip, + idx, 1, whichfork); + xfs_bmap_delete_exlist(ip, idx, 1, whichfork); + ifp->if_lastex = idx - 1; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff, + right.br_startblock, right.br_blockcount, &i))) + return error; + ASSERT(i == 1); + if ((error = xfs_bmbt_delete(cur, &i))) + return error; + ASSERT(i == 1); + if ((error = xfs_bmbt_decrement(cur, 0, &i))) + return error; + ASSERT(i == 1); + error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + new->br_blockcount + + right.br_blockcount, left.br_state); + return error; + + case MASK(LEFT_CONTIG): + /* + * New allocation is contiguous with a real allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork); + xfs_bmbt_set_blockcount(ep - 1, + left.br_blockcount + new->br_blockcount); + xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork); + ifp->if_lastex = idx - 1; + if (cur == NULL) { + *logflagsp = XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = 0; + if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff, + left.br_startblock, left.br_blockcount, &i))) + return error; + ASSERT(i == 1); + error = xfs_bmbt_update(cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + new->br_blockcount, + left.br_state); + return error; + + case MASK(RIGHT_CONTIG): + /* + * New allocation is contiguous with a real allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork); + xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork); + ifp->if_lastex = idx; + if (cur == NULL) { + *logflagsp = XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = 0; + if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff, + right.br_startblock, right.br_blockcount, &i))) + return error; + ASSERT(i == 1); + error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + return error; + + case 0: + /* + * New allocation is not contiguous with another + * real allocation. + * Insert a new entry. + */ + xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, + whichfork); + xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork); + ifp->if_lastex = idx; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + return 0; + } + *logflagsp = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, &i))) + return error; + ASSERT(i == 0); + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_bmbt_insert(cur, &i))) + return error; + ASSERT(i == 1); + return 0; + } +#undef MASK +#undef MASK2 +#undef STATE_SET +#undef STATE_TEST +#undef STATE_SET_TEST +#undef SWITCH_STATE + /* NOTREACHED */ + ASSERT(0); + return 0; /* keep gcc quite */ +} + +#define XFS_ALLOC_GAP_UNITS 4 + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. + */ +STATIC int /* error */ +xfs_bmap_alloc( + xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +{ + xfs_fsblock_t adjust; /* adjustment to block numbers */ + xfs_alloctype_t atype=0; /* type for allocation routines */ + int error; /* error return value */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_mount_t *mp; /* mount point structure */ + int nullfb; /* true if ap->firstblock isn't set */ + int rt; /* true if inode is realtime */ +#ifdef __KERNEL__ + xfs_extlen_t prod=0; /* product factor for allocators */ + xfs_extlen_t ralen=0; /* realtime allocation length */ +#endif + +#define ISVALID(x,y) \ + (rt ? \ + (x) < mp->m_sb.sb_rblocks : \ + XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) + + /* + * Set up variables. + */ + mp = ap->ip->i_mount; + nullfb = ap->firstblock == NULLFSBLOCK; + rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); +#ifdef __KERNEL__ + if (rt) { + xfs_extlen_t extsz; /* file extent size for rt */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_extlen_t orig_alen; /* original ap->alen */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t orig_off; /* original ap->off */ + xfs_extlen_t mod_off; /* modulus calculations */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_rtblock_t rtx; /* realtime extent number */ + xfs_extlen_t temp; /* temp for rt calculations */ + + /* + * Set prod to match the realtime extent size. + */ + if (!(extsz = ap->ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + prod = extsz / mp->m_sb.sb_rextsize; + orig_off = ap->off; + orig_alen = ap->alen; + orig_end = orig_off + orig_alen; + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment. + */ + mod_off = do_mod(orig_off, extsz); + if (mod_off) { + ap->alen += mod_off; + ap->off -= mod_off; + } + /* + * Same adjustment for the end of the requested area. + */ + if ((temp = (ap->alen % extsz))) + ap->alen += extsz - temp; + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + prevo = + ap->prevp->br_startoff == NULLFILEOFF ? + 0 : + (ap->prevp->br_startoff + + ap->prevp->br_blockcount); + if (ap->off != orig_off && ap->off < prevo) + ap->off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ? + NULLFILEOFF : ap->gotp->br_startoff; + if (!ap->eof && + ap->off + ap->alen != orig_end && + ap->off + ap->alen > nexto) + ap->off = nexto > ap->alen ? nexto - ap->alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if ((ap->off != orig_off && ap->off < prevo) || + (ap->off + ap->alen != orig_end && + ap->off + ap->alen > nexto)) { + ap->off = prevo; + ap->alen = nexto - prevo; + } + /* + * If the result isn't a multiple of rtextents we need to + * remove blocks until it is. + */ + if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. + */ + if (orig_off < ap->off || + orig_end > ap->off + ap->alen || + ap->alen - temp < orig_alen) + return XFS_ERROR(EINVAL); + /* + * Try to fix it by moving the start up. + */ + if (ap->off + temp <= orig_off) { + ap->alen -= temp; + ap->off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (ap->off + ap->alen - temp >= orig_end) + ap->alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + ap->alen -= orig_off - ap->off; + ap->off = orig_off; + ap->alen -= ap->alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < ap->off || orig_end > ap->off + ap->alen) + return XFS_ERROR(EINVAL); + } + ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->off, extsz) || ap->alen % extsz) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->alen / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->off == 0) { + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->rval = rtx * mp->m_sb.sb_rextsize; + } else + ap->rval = 0; + } +#else + if (rt) + ap->rval = 0; +#endif /* __KERNEL__ */ + else if (nullfb) + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + else + ap->rval = ap->firstblock; + /* + * If allocating at eof, and there's a previous real block, + * try to use it's last block as our starting point. + */ + if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && + !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, + ap->prevp->br_startblock)) { + ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + /* + * Adjust for the gap between prevp and us. + */ + adjust = ap->off - + (ap->prevp->br_startoff + ap->prevp->br_blockcount); + if (adjust && + ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) + ap->rval += adjust; + } + /* + * If not at eof, then compare the two neighbor blocks. + * Figure out whether either one gives us a good starting point, + * and pick the better one. + */ + else if (!ap->eof) { + xfs_fsblock_t gotbno; /* right side block number */ + xfs_fsblock_t gotdiff=0; /* right side difference */ + xfs_fsblock_t prevbno; /* left side block number */ + xfs_fsblock_t prevdiff=0; /* left side difference */ + + /* + * If there's a previous (left) block, select a requested + * start block based on it. + */ + if (ap->prevp->br_startoff != NULLFILEOFF && + !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + (prevbno = ap->prevp->br_startblock + + ap->prevp->br_blockcount) && + ISVALID(prevbno, ap->prevp->br_startblock)) { + /* + * Calculate gap to end of previous block. + */ + adjust = prevdiff = ap->off - + (ap->prevp->br_startoff + + ap->prevp->br_blockcount); + /* + * Figure the startblock based on the previous block's + * end and the gap size. + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the end of the previous block. + */ + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISVALID(prevbno + prevdiff, + ap->prevp->br_startblock)) + prevbno += adjust; + else + prevdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) + prevbno = NULLFSBLOCK; + } + /* + * No previous block or can't follow it, just default. + */ + else + prevbno = NULLFSBLOCK; + /* + * If there's a following (right) block, select a requested + * start block based on it. + */ + if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { + /* + * Calculate gap to start of next block. + */ + adjust = gotdiff = ap->gotp->br_startoff - ap->off; + /* + * Figure the startblock based on the next block's + * start and the gap size. + */ + gotbno = ap->gotp->br_startblock; + /* + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the start of the next block + * offset by our length. + */ + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + ISVALID(gotbno - gotdiff, gotbno)) + gotbno -= adjust; + else if (ISVALID(gotbno - ap->alen, gotbno)) { + gotbno -= ap->alen; + gotdiff += adjust - ap->alen; + } else + gotdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) + gotbno = NULLFSBLOCK; + } + /* + * No next block, just default. + */ + else + gotbno = NULLFSBLOCK; + /* + * If both valid, pick the better one, else the only good + * one, else ap->rval is already set (to 0 or the inode block). + */ + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + else if (prevbno != NULLFSBLOCK) + ap->rval = prevbno; + else if (gotbno != NULLFSBLOCK) + ap->rval = gotbno; + } + /* + * If allowed, use ap->rval; otherwise must use firstblock since + * it's in the right allocation group. + */ + if (nullfb || rt || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + ; + else + ap->rval = ap->firstblock; + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + if (rt) { +#ifndef __KERNEL__ + ASSERT(0); +#else + xfs_rtblock_t rtb; + + atype = ap->rval == 0 ? + XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->rval, mp->m_sb.sb_rextsize); + rtb = ap->rval; + ap->alen = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, + ap->alen, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->rval = rtb; + if (ap->rval != NULLFSBLOCK) { + ap->rval *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->alen = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, + (long) ralen); + } else + ap->alen = 0; +#endif /* __KERNEL__ */ + } + /* + * Normal allocation, done through xfs_alloc_vextent. + */ + else { + xfs_agnumber_t ag; + xfs_alloc_arg_t args; + xfs_extlen_t blen; + xfs_extlen_t delta; + int isaligned; + xfs_extlen_t longest; + xfs_extlen_t need; + xfs_extlen_t nextminlen=0; + int notinit; + xfs_perag_t *pag; + xfs_agnumber_t startag; + int tryagain; + + tryagain = isaligned = 0; + args.tp = ap->tp; + args.mp = mp; + args.fsbno = ap->rval; + args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); + blen = 0; + if (nullfb) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.total = ap->total; + /* + * Find the longest available space. + * We're going to try for the whole allocation at once. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); + notinit = 0; + down_read(&mp->m_peraglock); + while (blen < ap->alen) { + pag = &mp->m_perag[ag]; + if (!pag->pagf_init && + (error = xfs_alloc_pagf_init(mp, args.tp, + ag, XFS_ALLOC_FLAG_TRYLOCK))) { + up_read(&mp->m_peraglock); + return error; + } + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? + need - pag->pagf_flcount : 0; + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || + pag->pagf_longest > 0); + if (blen < longest) + blen = longest; + } else + notinit = 1; + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + } + up_read(&mp->m_peraglock); + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || blen < ap->minlen) + args.minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (blen < ap->alen) + args.minlen = blen; + /* + * Otherwise we've seen an extent as big as alen, + * use that as the minimum. + */ + else + args.minlen = ap->alen; + } else if (ap->low) { + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = args.minlen = ap->minlen; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.total = ap->total; + args.minlen = ap->minlen; + } + if (ap->ip->i_d.di_extsize) { + args.prod = ap->ip->i_d.di_extsize; + if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } else if (mp->m_sb.sb_blocksize >= NBPP) { + args.prod = 1; + args.mod = 0; + } else { + args.prod = NBPP >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } + /* + * If we are not low on available data blocks, and the + * underlying logical volume manager is a stripe, and + * the file offset is zero then try to allocate data + * blocks on stripe unit boundary. + * NOTE: ap->aeof is only set if the allocation length + * is >= the stripe unit and the allocation offset is + * at the end of file. + */ + if (!ap->low && ap->aeof) { + if (!ap->off) { + args.alignment = mp->m_dalign; + atype = args.type; + isaligned = 1; + /* + * Adjust for alignment + */ + if (blen > args.alignment && blen <= ap->alen) + args.minlen = blen - args.alignment; + args.minalignslop = 0; + } else { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.alignment = 1; + /* + * Compute the minlen+alignment for the + * next case. Set slop so that the value + * of minlen+alignment+slop doesn't go up + * between the calls. + */ + if (blen > mp->m_dalign && blen <= ap->alen) + nextminlen = blen - mp->m_dalign; + else + nextminlen = args.minlen; + if (nextminlen + mp->m_dalign > args.minlen + 1) + args.minalignslop = + nextminlen + mp->m_dalign - + args.minlen - 1; + else + args.minalignslop = 0; + } + } else { + args.alignment = 1; + args.minalignslop = 0; + } + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.isfl = 0; + args.userdata = ap->userdata; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { + /* + * Exact allocation failed. Now try with alignment + * turned on. + */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = mp->m_dalign; + args.minlen = nextminlen; + args.minalignslop = 0; + isaligned = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (isaligned && args.fsbno == NULLFSBLOCK) { + /* + * allocation failed, so turn off alignment and + * try again. + */ + args.type = atype; + args.fsbno = ap->rval; + args.alignment = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb && + args.minlen > ap->minlen) { + args.minlen = ap->minlen; + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = ap->rval; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb) { + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = ap->minlen; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + ap->low = 1; + } + if (args.fsbno != NULLFSBLOCK) { + ap->firstblock = ap->rval = args.fsbno; + ASSERT(nullfb || fb_agno == args.agno || + (ap->low && fb_agno < args.agno)); + ap->alen = args.len; + ap->ip->i_d.di_nblocks += args.len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args.len; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT, + (long) args.len); + } else { + ap->rval = NULLFSBLOCK; + ap->alen = 0; + } + } + return 0; +#undef ISVALID +} + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the extent list is already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + xfs_bmbt_block_t *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + xfs_bmbt_ptr_t *pp; /* ptr to block address */ + xfs_bmbt_block_t *rblock;/* root btree block */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) == 1); + ASSERT(INT_GET(rblock->bb_numrecs, ARCH_CONVERT) == 1); + ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); + mp = ip->i_mount; + pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1))) + return error; +#endif + cbno = INT_GET(*pp, ARCH_CONVERT); + if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, + XFS_BMAP_BTREE_REF))) + return error; + cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); + if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + return 0; +} + +/* + * Called by xfs_bmapi to update extent list structure and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extent list */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + int rsvd) /* OK to allocate reserved blocks */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_del_extent"; +#endif + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + xfs_extnum_t nextents; /* number of extents in list */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + + XFS_STATS_INC(xs_del_exlist); + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(idx >= 0 && idx < nextents); + ASSERT(del->br_blockcount > 0); + ep = &ifp->if_u1.if_extents[idx]; + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = ISNULLSTARTBLOCK(got.br_startblock); + ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + if ((error = xfs_rtfree_extent(ip->i_transp, bno, + (xfs_extlen_t)len))) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + ASSERT(i == 1); + } + da_old = da_new = 0; + } else { + da_old = STARTBLOCKVAL(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork); + xfs_bmap_delete_exlist(ip, idx, 1, whichfork); + ifp->if_lastex = idx; + if (delay) + break; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= XFS_ILOG_FEXT(whichfork); + break; + } + if ((error = xfs_bmbt_delete(cur, &i))) + goto done; + ASSERT(i == 1); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "2", ip, idx, + whichfork); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork); + if (!cur) { + flags |= XFS_ILOG_FEXT(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork); + xfs_bmbt_set_blockcount(ep, temp); + ifp->if_lastex = idx; + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmap_trace_post_update(fname, "1", ip, idx, + whichfork); + da_new = temp; + break; + } + xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork); + if (!cur) { + flags |= XFS_ILOG_FEXT(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_bmbt_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_bmbt_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + ASSERT(i == 1); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. + */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + ASSERT(i == 1); + } else + flags |= XFS_ILOG_FEXT(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = NULLSTARTBLOCK((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + NULLSTARTBLOCK((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + NULLSTARTBLOCK((int)temp2); + } + } + } + xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork); + xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL, + whichfork); + xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork); + ifp->if_lastex = idx + 1; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new), + rsvd); +done: + *logflagsp = flags; + return error; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + +/* + * Remove count entries from the extents array for inode "ip", starting + * at index "idx". Copies the remaining items down over the deleted ones, + * and gives back the excess memory. + */ +STATIC void +xfs_bmap_delete_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting delete index */ + xfs_extnum_t count, /* count of items to delete */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extent list */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extents in list after */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + base = ifp->if_u1.if_extents; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count; + memmove(&base[idx], &base[idx + count], + (nextents - idx) * sizeof(*base)); + xfs_iext_realloc(ip, -count, whichfork); +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_block_t *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + xfs_bmbt_block_t *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_t *ep; /* extent list pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent list index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* extent list size */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + /* + * Fill in the root. + */ + block = ifp->if_broot; + INT_SET(block->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC); + INT_SET(block->bb_level, ARCH_CONVERT, 1); + INT_SET(block->bb_numrecs, ARCH_CONVERT, 1); + INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLDFSBNO); + INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLDFSBNO); + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + mp = ip->i_mount; + cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, + whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + args.tp = tp; + args.mp = mp; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.total = args.minleft = args.alignment = args.mod = args.isfl = + args.minalignslop = 0; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. + */ + ablock = XFS_BUF_TO_BMBT_BLOCK(abp); + INT_SET(ablock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC); + ablock->bb_level = 0; + INT_SET(ablock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO); + INT_SET(ablock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO); + arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (ep = ifp->if_u1.if_extents, cnt = i = 0; i < nextents; i++, ep++) { + if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { + arp->l0 = INT_GET(ep->l0, ARCH_CONVERT); + arp->l1 = INT_GET(ep->l1, ARCH_CONVERT); + arp++; cnt++; + } + } + INT_SET(ablock->bb_numrecs, ARCH_CONVERT, cnt); + ASSERT(INT_GET(ablock->bb_numrecs, ARCH_CONVERT) == XFS_IFORK_NEXTENTS(ip, whichfork)); + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); + INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); + INT_SET(*pp, ARCH_CONVERT, args.fsbno); + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_bmbt_log_recs(cur, abp, 1, INT_GET(ablock->bb_numrecs, ARCH_CONVERT)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); + return 0; +} + +/* + * Insert new item(s) in the extent list for inode "ip". + * Count new items are inserted at offset idx. + */ +STATIC void +xfs_bmap_insert_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* extent list base */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* extent list size */ + xfs_extnum_t to; /* extent list index */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_realloc(ip, count, whichfork); + base = ifp->if_u1.if_extents; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + memmove(&base[idx + count], &base[idx], + (nextents - (idx + count)) * sizeof(*base)); + for (to = idx; to < idx + count; to++, new++) + xfs_bmbt_set_all(&base[to], new); +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int flags; /* logging flags returned */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_local_to_extents"; +#endif + xfs_ifork_t *ifp; /* inode fork pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. + */ + ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && + whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + flags = 0; + error = 0; + if (ifp->if_bytes) { + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent list block */ + xfs_bmbt_rec_t *ep; /* extent list pointer */ + + args.tp = tp; + args.mp = ip->i_mount; + ASSERT(ifp->if_flags & XFS_IFINLINE); + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.mod = args.minleft = args.alignment = args.wasdel = + args.isfl = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + if ((error = xfs_alloc_vextent(&args))) + goto done; + /* + * Can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, + ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_iext_realloc(ip, 1, whichfork); + ep = ifp->if_u1.if_extents; + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= XFS_ILOG_FEXT(whichfork); + } else + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + flags |= XFS_ILOG_CORE; +done: + *logflagsp = flags; + return error; +} + +xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmap_do_search_extents( + xfs_bmbt_rec_t *base, /* base of extent list */ + xfs_extnum_t lastx, /* last extent index used */ + xfs_extnum_t nextents, /* extent list size */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + xfs_bmbt_irec_t got; /* extent list entry, decoded */ + int high; /* high index of binary search */ + int low; /* low index of binary search */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + got.br_startoff = 0xffa5a5a5a5a5a5a5LL; + got.br_blockcount = 0xa55a5a5a5a5a5a5aLL; + got.br_state = XFS_EXT_INVALID; + +#if XFS_BIG_BLKNOS + got.br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + got.br_startblock = 0xffffa5a5; +#endif + + if (lastx != NULLEXTNUM && lastx < nextents) + ep = base + lastx; + else + ep = NULL; + prevp->br_startoff = NULLFILEOFF; + if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) && + bno < got.br_startoff + + (got.br_blockcount = xfs_bmbt_get_blockcount(ep))) + *eofp = 0; + else if (ep && lastx < nextents - 1 && + bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) && + bno < got.br_startoff + + (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) { + lastx++; + ep++; + *eofp = 0; + } else if (nextents == 0) + *eofp = 1; + else if (bno == 0 && + (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) { + ep = base; + lastx = 0; + got.br_blockcount = xfs_bmbt_get_blockcount(ep); + *eofp = 0; + } else { + /* binary search the extents array */ + low = 0; + high = nextents - 1; + while (low <= high) { + XFS_STATS_INC(xs_cmp_exlist); + lastx = (low + high) >> 1; + ep = base + lastx; + got.br_startoff = xfs_bmbt_get_startoff(ep); + got.br_blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < got.br_startoff) + high = lastx - 1; + else if (bno >= got.br_startoff + got.br_blockcount) + low = lastx + 1; + else { + got.br_startblock = xfs_bmbt_get_startblock(ep); + got.br_state = xfs_bmbt_get_state(ep); + *eofp = 0; + *lastxp = lastx; + *gotp = got; + return ep; + } + } + if (bno >= got.br_startoff + got.br_blockcount) { + lastx++; + if (lastx == nextents) { + *eofp = 1; + got.br_startblock = xfs_bmbt_get_startblock(ep); + got.br_state = xfs_bmbt_get_state(ep); + *prevp = got; + ep = NULL; + } else { + *eofp = 0; + xfs_bmbt_get_all(ep, prevp); + ep++; + got.br_startoff = xfs_bmbt_get_startoff(ep); + got.br_blockcount = xfs_bmbt_get_blockcount(ep); + } + } else { + *eofp = 0; + if (ep > base) + xfs_bmbt_get_all(ep - 1, prevp); + } + } + if (ep) { + got.br_startblock = xfs_bmbt_get_startblock(ep); + got.br_state = xfs_bmbt_get_state(ep); + } + *lastxp = lastx; + *gotp = got; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int whichfork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_t *base; /* base of extent list */ + xfs_extnum_t lastx; /* last extent index used */ + xfs_extnum_t nextents; /* extent list size */ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + int rt; /* realtime flag */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, whichfork); + lastx = ifp->if_lastex; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + base = &ifp->if_u1.if_extents[0]; + + ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp, + lastxp, gotp, prevp); + rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME; + if(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM)) { + cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " + "start_block : %llx start_off : %llx blkcnt : %llx " + "extent-state : %x \n", + (ip->i_mount)->m_fsname,(long long)ip->i_ino, + gotp->br_startblock, gotp->br_startoff, + gotp->br_blockcount,gotp->br_state); + } + return ep; +} + + +#ifdef XFS_BMAP_TRACE +ktrace_t *xfs_bmap_trace_buf; + +/* + * Add a bmap trace buffer entry. Base routine for the others. + */ +STATIC void +xfs_bmap_trace_addentry( + int opcode, /* operation */ + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(ies) */ + xfs_extnum_t cnt, /* count of entries, 1 or 2 */ + xfs_bmbt_rec_t *r1, /* first record */ + xfs_bmbt_rec_t *r2, /* second record or null */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t tr2; + + ASSERT(cnt == 1 || cnt == 2); + ASSERT(r1 != NULL); + if (cnt == 1) { + ASSERT(r2 == NULL); + r2 = &tr2; + memset(&tr2, 0, sizeof(tr2)); + } else + ASSERT(r2 != NULL); + ktrace_enter(xfs_bmap_trace_buf, + (void *)(__psint_t)(opcode | (whichfork << 16)), + (void *)fname, (void *)desc, (void *)ip, + (void *)(__psint_t)idx, + (void *)(__psint_t)cnt, + (void *)(__psunsigned_t)(ip->i_ino >> 32), + (void *)(__psunsigned_t)(unsigned)ip->i_ino, + (void *)(__psunsigned_t)(r1->l0 >> 32), + (void *)(__psunsigned_t)(unsigned)(r1->l0), + (void *)(__psunsigned_t)(r1->l1 >> 32), + (void *)(__psunsigned_t)(unsigned)(r1->l1), + (void *)(__psunsigned_t)(r2->l0 >> 32), + (void *)(__psunsigned_t)(unsigned)(r2->l0), + (void *)(__psunsigned_t)(r2->l1 >> 32), + (void *)(__psunsigned_t)(unsigned)(r2->l1) + ); + ASSERT(ip->i_xtrace); + ktrace_enter(ip->i_xtrace, + (void *)(__psint_t)(opcode | (whichfork << 16)), + (void *)fname, (void *)desc, (void *)ip, + (void *)(__psint_t)idx, + (void *)(__psint_t)cnt, + (void *)(__psunsigned_t)(ip->i_ino >> 32), + (void *)(__psunsigned_t)(unsigned)ip->i_ino, + (void *)(__psunsigned_t)(r1->l0 >> 32), + (void *)(__psunsigned_t)(unsigned)(r1->l0), + (void *)(__psunsigned_t)(r1->l1 >> 32), + (void *)(__psunsigned_t)(unsigned)(r1->l1), + (void *)(__psunsigned_t)(r2->l0 >> 32), + (void *)(__psunsigned_t)(unsigned)(r2->l0), + (void *)(__psunsigned_t)(r2->l1 >> 32), + (void *)(__psunsigned_t)(unsigned)(r2->l1) + ); +} + +/* + * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist. + */ +STATIC void +xfs_bmap_trace_delete( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(entries) deleted */ + xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ + int whichfork) /* data or attr fork */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx, + cnt, &ifp->if_u1.if_extents[idx], + cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL, + whichfork); +} + +/* + * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or + * reading in the extents list from the disk (in the btree). + */ +STATIC void +xfs_bmap_trace_insert( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry(entries) inserted */ + xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ + xfs_bmbt_irec_t *r1, /* inserted record 1 */ + xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t tr1; /* compressed record 1 */ + xfs_bmbt_rec_t tr2; /* compressed record 2 if needed */ + + xfs_bmbt_set_all(&tr1, r1); + if (cnt == 2) { + ASSERT(r2 != NULL); + xfs_bmbt_set_all(&tr2, r2); + } else { + ASSERT(cnt == 1); + ASSERT(r2 == NULL); + } + xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx, + cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork); +} + +/* + * Add bmap trace entry after updating an extent list entry in place. + */ +STATIC void +xfs_bmap_trace_post_update( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry updated */ + int whichfork) /* data or attr fork */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx, + 1, &ifp->if_u1.if_extents[idx], NULL, whichfork); +} + +/* + * Add bmap trace entry prior to updating an extent list entry in place. + */ +STATIC void +xfs_bmap_trace_pre_update( + char *fname, /* function name */ + char *desc, /* operation description */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index of entry to be updated */ + int whichfork) /* data or attr fork */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1, + &ifp->if_u1.if_extents[idx], NULL, whichfork); +} +#endif /* XFS_BMAP_TRACE */ + +/* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". + */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +#if defined(XFS_RW_TRACE) +STATIC void +xfs_bunmap_trace( + xfs_inode_t *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + inst_t *ra) +{ + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(__psint_t)XFS_BUNMAPI, + (void *)ip, + (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff), + (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff), + (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff), + (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff), + (void *)(__psint_t)len, + (void *)(__psint_t)flags, + (void *)(unsigned long)current_cpu(), + (void *)ra, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0); +} +#endif + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int rsvd) /* OK to allocated reserved blocks in trans */ +{ + int blks; /* space reservation */ + int committed; /* xaction was committed */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent list */ + int logflags; /* logging flags */ + xfs_mount_t *mp; /* mount structure */ + unsigned long s; /* spinlock spl value */ + xfs_trans_t *tp; /* transaction pointer */ + + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + if (XFS_IFORK_Q(ip)) + return 0; + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) + goto error0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + if (XFS_IFORK_Q(ip)) + goto error1; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + VN_HOLD(XFS_ITOV(ip)); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = mp->m_attroffset >> 3; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto error1; + } + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + XFS_BMAP_INIT(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto error2; + if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) { + s = XFS_SB_LOCK(mp); + if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) { + XFS_SB_VERSION_ADDATTR(&mp->m_sb); + XFS_SB_UNLOCK(mp, s); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM); + } else + XFS_SB_UNLOCK(mp, s); + } + if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) + goto error2; + error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; +error2: + xfs_bmap_cancel(&flist); +error1: + ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + return error; +} + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +/* ARGSUSED */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!ISNULLSTARTBLOCK(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ + + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + */ + maxleafents = (whichfork == XFS_DATA_FORK) ? MAXEXTNUM : MAXAEXTNUM; + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + sz = (whichfork == XFS_DATA_FORK) ? + mp->m_attroffset : + mp->m_sb.sb_inodesize - mp->m_attroffset; + maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. This is to allow the caller to make the first + * transaction a synchronous one so that the pointers to the data being + * broken in this transaction will be permanent before the data is actually + * freed. This is necessary to prevent blocks from being reallocated + * and written to before the free and reallocation are actually permanent. + * We do not just make the first transaction synchronous here, because + * there are more efficient ways to gain the same protection in some cases + * (see the file truncation code). + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +/*ARGSUSED*/ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + xfs_fsblock_t firstblock, /* controlled ag for allocs */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent list item */ + unsigned int logres; /* new log reservation */ + unsigned int logcount; /* new log count */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + logres = ntp->t_log_res; + logcount = ntp->t_log_count; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0, NULL); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) { + return error; + } + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, + logcount))) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + XFS_CORRUPT_INCORE : + XFS_METADATA_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extent array */ + xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + base = &ifp->if_u1.if_extents[0]; + for (lastaddr = 0, max = lowest, ep = base; + ep < &base[nextents]; + ep++) { + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_offset( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extent array */ + xfs_bmbt_rec_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (!nextents) { + *last_block = 0; + return 0; + } + base = &ifp->if_u1.if_extents[0]; + ASSERT(base != NULL); + ep = &base[nextents - 1]; + *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = ifp->if_u1.if_extents; + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_block_t *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_bmap_read_extents"; +#endif + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_bmbt_ptr_t *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + xfs_bmbt_rec_t *trp; /* target record pointer */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0); + level = INT_GET(block->bb_level, ARCH_CONVERT); + pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); + bno = INT_GET(*pp, ARCH_CONVERT); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + XFS_BMAP_SANITY_CHECK(mp, block, level), + error0); + if (level == 0) + break; + pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, + 1, mp->m_bmap_dmxr[1]); + XFS_WANT_CORRUPTED_GOTO( + XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), + error0); + bno = INT_GET(*pp, ARCH_CONVERT); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(*trp); + trp = ifp->if_u1.if_extents; + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent list. + */ + for (;;) { + xfs_bmbt_rec_t *frp, *temp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, (btree extents). Unmount and run xfs_repair.", + (unsigned long long) ip->i_ino); + XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + XFS_BMAP_SANITY_CHECK(mp, block, 0), + error0); + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1); + /* + * Copy records into the extent list. + */ + frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, + block, 1, mp->m_bmap_dmxr[0]); + temp = trp; + for (j = 0; j < num_recs; j++, frp++, trp++) { + trp->l0 = INT_GET(frp->l0, ARCH_CONVERT); + trp->l1 = INT_GET(frp->l1, ARCH_CONVERT); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(temp, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + i += num_recs; + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + } + ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp)); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_bmap_trace_exlist(fname, ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); +} + +#ifdef XFS_BMAP_TRACE +/* + * Add bmap trace insert entries for all the contents of the extent list. + */ +void +xfs_bmap_trace_exlist( + char *fname, /* function name */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extent list */ + xfs_bmbt_rec_t *ep; /* current entry in extent list */ + xfs_extnum_t idx; /* extent list entry number */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t s; /* extent list record */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base)); + base = ifp->if_u1.if_extents; + for (idx = 0, ep = base; idx < cnt; idx++, ep++) { + xfs_bmbt_get_all(ep, &s); + xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL, + whichfork); + } +} +#endif + +#ifdef DEBUG +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the callers original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extent beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} +#endif /* DEBUG */ + + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + xfs_bmbt_irec_t *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist) /* i/o: list extents to free */ +{ + xfs_fsblock_t abno; /* allocated block number */ + xfs_extlen_t alen; /* allocated extent length */ + xfs_fileoff_t aoff; /* allocated file offset */ + xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ + char contig; /* allocation must be one extent */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + char delay; /* this request is for delayed alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* we've hit the end of extent list */ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + int error; /* error return */ + char exact; /* don't do all of wasdelayed extent */ + xfs_bmbt_irec_t got; /* current extent list record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extlen_t indlen; /* indirect blocks length */ + char inhole; /* current location is hole in file */ + xfs_extnum_t lastx; /* last useful extent number */ + int logflags; /* flags for transaction logging */ + xfs_extlen_t minleft; /* min blocks left after allocation */ + xfs_extlen_t minlen; /* min allocation size */ + xfs_mount_t *mp; /* xfs mount structure */ + int n; /* current extent index */ + int nallocs; /* number of extents alloc\'d */ + xfs_extnum_t nextents; /* number of extents in file */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_bmbt_irec_t prev; /* previous extent list record */ + char stateless; /* ignore state flag set */ + int tmp_logflags; /* temp flags holder */ + char trim; /* output trimmed to match range */ + char userdata; /* allocating non-metadata */ + char wasdelay; /* old extent was delayed */ + int whichfork; /* data or attr fork */ + char wr; /* this is a write request */ + char rsvd; /* OK to allocate reserved blocks */ +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + xfs_bmbt_irec_t *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + mp = ip->i_mount; + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) + XFS_STATS_INC(xs_blk_mapw); + else + XFS_STATS_INC(xs_blk_mapr); + delay = (flags & XFS_BMAPI_DELAY) != 0; + trim = (flags & XFS_BMAPI_ENTIRE) == 0; + userdata = (flags & XFS_BMAPI_METADATA) == 0; + exact = (flags & XFS_BMAPI_EXACT) != 0; + rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; + contig = (flags & XFS_BMAPI_CONTIG) != 0; + /* + * stateless is used to combine extents which + * differ only due to the state of the extents. + * This technique is used from xfs_getbmap() + * when the caller does not wish to see the + * separation (which is the default). + * + * This technique is also used when writing a + * buffer which has been partially written, + * (usually by being flushed during a chunkread), + * to ensure one write takes place. This also + * prevents a change in the xfs inode extents at + * this time, intentionally. This change occurs + * on completion of the write operation, in + * xfs_strat_comp(), where the xfs_bmapi() call + * is transactioned, and the extents combined. + */ + stateless = (flags & XFS_BMAPI_IGSTATE) != 0; + if (stateless && wr) /* if writing unwritten space, no */ + wr = 0; /* allocations are allowed */ + ASSERT(wr || !delay); + logflags = 0; + nallocs = 0; + cur = NULL; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + ASSERT(wr && tp); + if ((error = xfs_bmap_local_to_extents(tp, ip, + firstblock, total, &logflags, whichfork))) + goto error0; + } + if (wr && *firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + minleft = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1; + else + minleft = 1; + } else + minleft = 0; + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + goto error0; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + n = 0; + end = bno + len; + obno = bno; + bma.ip = NULL; + while (bno < end && n < *nmap) { + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof && !wr) + got.br_startoff = end; + inhole = eof || got.br_startoff > bno; + wasdelay = wr && !inhole && !delay && + ISNULLSTARTBLOCK(got.br_startblock); + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (wr && (inhole || wasdelay)) { + /* + * For the wasdelay case, we could also just + * allocate the stuff asked for in this bmap call + * but that wouldn't be as good. + */ + if (wasdelay && !exact) { + alen = (xfs_extlen_t)got.br_blockcount; + aoff = got.br_startoff; + if (lastx != NULLEXTNUM && lastx) { + ep = &ifp->if_u1.if_extents[lastx - 1]; + xfs_bmbt_get_all(ep, &prev); + } + } else if (wasdelay) { + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(len, + (got.br_startoff + + got.br_blockcount) - bno); + aoff = bno; + } else { + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = (xfs_extlen_t) + XFS_FILBLKS_MIN(alen, + got.br_startoff - bno); + aoff = bno; + } + minlen = contig ? alen : 1; + if (delay) { + indlen = (xfs_extlen_t) + xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + /* + * Make a transaction-less quota reservation for + * delayed allocation blocks. This number gets + * adjusted later. + * We return EDQUOT if we haven't allocated + * blks already inside this loop; + */ + if (XFS_TRANS_RESERVE_BLKQUOTA( + mp, NULL, ip, (long)alen)) { + if (n == 0) { + *nmap = 0; + ASSERT(cur == NULL); + return XFS_ERROR(EDQUOT); + } + break; + } + + /* + * Split changing sb for alen and indlen since + * they could be coming from different places. + */ + if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + xfs_extlen_t extsz; + xfs_extlen_t ralen; + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + ralen = roundup(alen, extsz); + ralen = ralen / mp->m_sb.sb_rextsize; + if (xfs_mod_incore_sb(mp, + XFS_SBS_FREXTENTS, + -(ralen), rsvd)) { + if (XFS_IS_QUOTA_ON(ip->i_mount)) + XFS_TRANS_UNRESERVE_BLKQUOTA( + mp, NULL, ip, + (long)alen); + break; + } + } else { + if (xfs_mod_incore_sb(mp, + XFS_SBS_FDBLOCKS, + -(alen), rsvd)) { + if (XFS_IS_QUOTA_ON(ip->i_mount)) + XFS_TRANS_UNRESERVE_BLKQUOTA( + mp, NULL, ip, + (long)alen); + break; + } + } + + if (xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, + -(indlen), rsvd)) { + XFS_TRANS_UNRESERVE_BLKQUOTA( + mp, NULL, ip, (long)alen); + break; + } + ip->i_delayed_blks += alen; + abno = NULLSTARTBLOCK(indlen); + } else { + /* + * If first time, allocate and fill in + * once-only bma fields. + */ + if (bma.ip == NULL) { + bma.tp = tp; + bma.ip = ip; + bma.prevp = &prev; + bma.gotp = &got; + bma.total = total; + bma.userdata = 0; + } + /* Indicate if this is the first user data + * in the file, or just any user data. + */ + if (userdata) { + bma.userdata = (aoff == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : + XFS_ALLOC_USERDATA; + } + /* + * Fill in changeable bma fields. + */ + bma.eof = eof; + bma.firstblock = *firstblock; + bma.alen = alen; + bma.off = aoff; + bma.wasdel = wasdelay; + bma.minlen = minlen; + bma.low = flist->xbf_low; + bma.minleft = minleft; + /* + * Only want to do the alignment at the + * eof if it is userdata and allocation length + * is larger than a stripe unit. + */ + if (mp->m_dalign && alen >= mp->m_dalign && + userdata && whichfork == XFS_DATA_FORK) { + if ((error = xfs_bmap_isaeof(ip, aoff, + whichfork, &bma.aeof))) + goto error0; + } else + bma.aeof = 0; + /* + * Call allocator. + */ + if ((error = xfs_bmap_alloc(&bma))) + goto error0; + /* + * Copy out result fields. + */ + abno = bma.rval; + if ((flist->xbf_low = bma.low)) + minleft = 0; + alen = bma.alen; + aoff = bma.off; + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, bma.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, bma.firstblock))); + *firstblock = bma.firstblock; + if (cur) + cur->bc_private.b.firstblock = + *firstblock; + if (abno == NULLFSBLOCK) + break; + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_btree_init_cursor(mp, + tp, NULL, 0, XFS_BTNUM_BMAP, + ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + nallocs++; + } + if (cur) + cur->bc_private.b.flags = + wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; + got.br_startoff = aoff; + got.br_startblock = abno; + got.br_blockcount = alen; + got.br_state = XFS_EXT_NORM; /* assume normal */ + /* + * Determine state of extent, and the filesystem. + * A wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. + */ + if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) + got.br_state = XFS_EXT_UNWRITTEN; + } + error = xfs_bmap_add_extent(ip, lastx, &cur, &got, + firstblock, flist, &tmp_logflags, whichfork, + rsvd); + logflags |= tmp_logflags; + if (error) + goto error0; + lastx = ifp->if_lastex; + ep = &ifp->if_u1.if_extents[lastx]; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= aoff); + ASSERT(got.br_startoff + got.br_blockcount >= + aoff + alen); +#ifdef DEBUG + if (delay) { + ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); + ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); + } + ASSERT(got.br_state == XFS_EXT_NORM || + got.br_state == XFS_EXT_UNWRITTEN); +#endif + /* + * Fall down into the found allocated space case. + */ + } else if (inhole) { + /* + * Reading in a hole. + */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + /* + * Then deal with the allocated space we found. + */ + ASSERT(ep != NULL); + if (trim && (got.br_startoff + got.br_blockcount > obno)) { + if (obno > bno) + bno = obno; + ASSERT((bno >= obno) || (n == 0)); + ASSERT(bno < end); + mval->br_startoff = bno; + if (ISNULLSTARTBLOCK(got.br_startblock)) { + ASSERT(!wr || delay); + mval->br_startblock = DELAYSTARTBLOCK; + } else + mval->br_startblock = + got.br_startblock + + (bno - got.br_startoff); + /* + * Return the minimum of what we got and what we + * asked for for the length. We can use the len + * variable here because it is modified below + * and we could have been there before coming + * here if the first part of the allocation + * didn't overlap what was asked for. + */ + mval->br_blockcount = + XFS_FILBLKS_MIN(end - bno, got.br_blockcount - + (bno - got.br_startoff)); + mval->br_state = got.br_state; + ASSERT(mval->br_blockcount <= len); + } else { + *mval = got; + if (ISNULLSTARTBLOCK(mval->br_startblock)) { + ASSERT(!wr || delay); + mval->br_startblock = DELAYSTARTBLOCK; + } + } + + /* + * Check if writing previously allocated but + * unwritten extents. + */ + if (wr && mval->br_state == XFS_EXT_UNWRITTEN && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !cur) { + cur = xfs_btree_init_cursor(mp, + tp, NULL, 0, XFS_BTNUM_BMAP, + ip, whichfork); + cur->bc_private.b.firstblock = + *firstblock; + cur->bc_private.b.flist = flist; + } + mval->br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent(ip, lastx, &cur, mval, + firstblock, flist, &tmp_logflags, whichfork, + rsvd); + logflags |= tmp_logflags; + if (error) + goto error0; + lastx = ifp->if_lastex; + ep = &ifp->if_u1.if_extents[lastx]; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmbt_get_all(ep, &got); + /* + * We may have combined previously unwritten + * space with written space, so generate + * another request. + */ + if (mval->br_blockcount < len) + continue; + } + + ASSERT(!trim || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT(!trim || (mval->br_blockcount <= len) || + (mval->br_startoff < obno)); + bno = mval->br_startoff + mval->br_blockcount; + len = end - bno; + if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == + mval[-1].br_startblock + mval[-1].br_blockcount && + (stateless || mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + n++; + } + /* + * If we're done, stop now. Stop when we've allocated + * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || nallocs >= *nmap) + break; + /* + * Else go on to the next record. + */ + ep++; + lastx++; + if (lastx >= nextents) { + eof = 1; + prev = got; + } else + xfs_bmbt_get_all(ep, &got); + } + ifp->if_lastex = lastx; + *nmap = n; + /* + * Transform from btree to extents, give it cur. + */ + if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(wr && cur); + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + error = 0; + +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent list if we've converted to btree format. + */ + if ((logflags & XFS_ILOG_FEXT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_FEXT(whichfork); + else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_FBROOT(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. + */ + if (logflags) { + ASSERT(tp && wr); + xfs_trans_log_inode(tp, ip, logflags); + } + if (cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, + cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, + cur->bc_private.b.firstblock))); + *firstblock = cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* + * Map file blocks to filesystem blocks, simple version. + * One block (extent) only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno) /* starting file offs. mapped */ +{ + int eof; /* we've hit the end of extent list */ + int error; /* error return */ + xfs_bmbt_irec_t got; /* current extent list record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last useful extent number */ + xfs_bmbt_irec_t prev; /* previous extent list record */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return XFS_ERROR(EIO); + XFS_STATS_INC(xs_blk_mapr); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + /* + * Reading past eof, act as though there's a hole + * up to end. + */ + if (eof || got.br_startoff > bno) { + *fsb = NULLFSBLOCK; + return 0; + } + ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); + ASSERT(bno < got.br_startoff + got.br_blockcount); + *fsb = got.br_startblock + (bno - got.br_startoff); + ifp->if_lastex = lastx; + return 0; +} + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + xfs_trans_t *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* misc flags */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done) /* set if not done yet */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_irec_t del; /* extent being deleted */ + int eof; /* is deleting at eof */ + xfs_bmbt_rec_t *ep; /* extent list entry pointer */ + int error; /* error return value */ + xfs_extnum_t extno; /* extent number in list */ + xfs_bmbt_irec_t got; /* current extent list entry */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int isrt; /* freeing in rt area */ + xfs_extnum_t lastx; /* last extent index used */ + int logflags; /* transaction logging flags */ + xfs_extlen_t mod; /* rt extent offset */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* size of extent list */ + xfs_bmbt_irec_t prev; /* previous extent list entry */ + xfs_fileoff_t start; /* first file offset deleted */ + int tmp_logflags; /* partial logging flags */ + int wasdel; /* was a delayed alloc extent */ + int whichfork; /* data or attribute fork */ + int rsvd; /* OK to allocate reserved blocks */ + xfs_fsblock_t sum; + + xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; + ASSERT(len > 0); + ASSERT(nexts >= 0); + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *done = 1; + return 0; + } + XFS_STATS_INC(xs_blk_unmap); + isrt = (whichfork == XFS_DATA_FORK) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); + start = bno; + bno = start + len - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + /* + * Check to see if the given block number is past the end of the + * file, back up to the last block if so... + */ + if (eof) { + ep = &ifp->if_u1.if_extents[--lastx]; + xfs_bmbt_get_all(ep, &got); + bno = got.br_startoff + got.br_blockcount - 1; + } + logflags = 0; + if (ifp->if_flags & XFS_IFBROOT) { + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, + whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else + cur = NULL; + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + (nexts == 0 || extno < nexts)) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. + */ + if (got.br_startoff > bno) { + if (--lastx < 0) + break; + ep--; + xfs_bmbt_get_all(ep, &got); + } + /* + * Is the last block of this extent before the range + * we're supposed to delete? If so, we're done. + */ + bno = XFS_FILEOFF_MIN(bno, + got.br_startoff + got.br_blockcount - 1); + if (bno < start) + break; + /* + * Then deal with the (possibly delayed) allocated space + * we found. + */ + ASSERT(ep != NULL); + del = got; + wasdel = ISNULLSTARTBLOCK(del.br_startblock); + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; + if (!wasdel) + del.br_startblock += start - got.br_startoff; + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent not lined up at the end. + * The extent could have been split into written + * and unwritten pieces, or we could just be + * unmapping part of it. But we can't really + * get rid of part of a realtime extent. + */ + if (del.br_state == XFS_EXT_UNWRITTEN || + !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + /* + * This piece is unwritten, or we're not + * using unwritten extents. Skip over it. + */ + ASSERT(bno >= mod); + bno -= mod > del.br_blockcount ? + del.br_blockcount : mod; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(--ep, &got); + } + continue; + } + /* + * It's written, turn it unwritten. + * This is better than zeroing it. + */ + ASSERT(del.br_state == XFS_EXT_NORM); + ASSERT(xfs_trans_get_block_res(tp) > 0); + /* + * If this spans a realtime extent boundary, + * chop it back to the start of the one we end at. + */ + if (del.br_blockcount > mod) { + del.br_startoff += del.br_blockcount - mod; + del.br_startblock += del.br_blockcount - mod; + del.br_blockcount = mod; + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, lastx, &cur, &del, + firstblock, flist, &logflags, XFS_DATA_FORK, 0); + if (error) + goto error0; + goto nodelete; + } + if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent is lined up at the end but not + * at the front. We'll get rid of full extents if + * we can. + */ + mod = mp->m_sb.sb_rextsize - mod; + if (del.br_blockcount > mod) { + del.br_blockcount -= mod; + del.br_startoff += mod; + del.br_startblock += mod; + } else if ((del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + xfs_trans_get_block_res(tp) == 0)) || + !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + /* + * Can't make it unwritten. There isn't + * a full extent here so just skip it. + */ + ASSERT(bno >= del.br_blockcount); + bno -= del.br_blockcount; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(--ep, &got); + } + continue; + } else if (del.br_state == XFS_EXT_UNWRITTEN) { + /* + * This one is already unwritten. + * It must have a written left neighbor. + * Unwrite the killed part of that one and + * try again. + */ + ASSERT(lastx > 0); + xfs_bmbt_get_all(ep - 1, &prev); + ASSERT(prev.br_state == XFS_EXT_NORM); + ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); + ASSERT(del.br_startblock == + prev.br_startblock + prev.br_blockcount); + if (prev.br_startoff < start) { + mod = start - prev.br_startoff; + prev.br_blockcount -= mod; + prev.br_startblock += mod; + prev.br_startoff = start; + } + prev.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, lastx - 1, &cur, + &prev, firstblock, flist, &logflags, + XFS_DATA_FORK, 0); + if (error) + goto error0; + goto nodelete; + } else { + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent(ip, lastx, &cur, + &del, firstblock, flist, &logflags, + XFS_DATA_FORK, 0); + if (error) + goto error0; + goto nodelete; + } + } + if (wasdel) { + ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, + (int)del.br_blockcount, rsvd); + /* Unreserve our quota space */ + XFS_TRANS_RESERVE_QUOTA_NBLKS( + mp, NULL, ip, -((long)del.br_blockcount), 0, + isrt ? XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS); + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. + * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. + */ + if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, + &tmp_logflags, whichfork, rsvd); + logflags |= tmp_logflags; + if (error) + goto error0; + bno = del.br_startoff - 1; +nodelete: + lastx = ifp->if_lastex; + /* + * If not done go on to the next (previous) record. + * Reset ep in case the extents array was re-alloced. + */ + ep = &ifp->if_u1.if_extents[lastx]; + if (bno != (xfs_fileoff_t)-1 && bno >= start) { + if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || + xfs_bmbt_get_startoff(ep) > bno) { + lastx--; + ep--; + } + if (lastx >= 0) + xfs_bmbt_get_all(ep, &got); + extno++; + } + } + ifp->if_lastex = lastx; + *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Convert to a btree if necessary. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from btree to extents, give it cur + */ + else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + ASSERT(cur != NULL); + error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from extents to local? + */ + ASSERT(ifp->if_ext_max == + XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent list if we've converted to btree format. + */ + if ((logflags & XFS_ILOG_FEXT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_FEXT(whichfork); + else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_FBROOT(whichfork); + /* + * Log inode even in the error case, if the transaction + * is dirty we'll need to shut down the filesystem. + */ + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + if (!error) { + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Fcntl interface to xfs_bmapi. + */ +int /* error code */ +xfs_getbmap( + bhv_desc_t *bdp, /* XFS behavior descriptor*/ + struct getbmap *bmv, /* user bmap structure */ + void __user *ap, /* pointer to user's array */ + int interface) /* interface flags */ +{ + __int64_t bmvend; /* last block requested */ + int error; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + xfs_inode_t *ip; /* xfs incore inode pointer */ + vnode_t *vp; /* corresponding vnode */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmap out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int sh_unwritten; /* true, if unwritten */ + /* extents listed separately */ + int bmapi_flags; /* flags for xfs_bmapi */ + __int32_t oflags; /* getbmapx bmv_oflags field */ + + vp = BHV_TO_VNODE(bdp); + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; + sh_unwritten = (interface & BMV_IF_PREALLOC) != 0; + + /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not + * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ + * bit is set for the file, generate a read event in order + * that the DMAPI application may do its thing before we return + * the extents. Usually this means restoring user file data to + * regions of the file that look like holes. + * + * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify + * BMV_IF_NO_DMAPI_READ so that read events are generated. + * If this were not true, callers of ioctl( XFS_IOC_GETBMAP ) + * could misinterpret holes in a DMAPI file as true holes, + * when in fact they may represent offline user data. + */ + if ( (interface & BMV_IF_NO_DMAPI_READ) == 0 + && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) + && whichfork == XFS_DATA_FORK) { + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); + if (error) + return XFS_ERROR(error); + } + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + if (whichfork == XFS_DATA_FORK) { + if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) { + prealloced = 1; + fixlen = XFS_MAXIOFFSET(mp); + } else { + prealloced = 0; + fixlen = ip->i_d.di_size; + } + } else { + prealloced = 0; + fixlen = 1LL << 32; + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset), + (__int64_t)0); + } else if (bmv->bmv_length < 0) + return XFS_ERROR(EINVAL); + if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } + nex = bmv->bmv_count - 1; + if (nex <= 0) + return XFS_ERROR(EINVAL); + bmvend = bmv->bmv_offset + bmv->bmv_length; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) { + /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ + VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error); + } + + ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); + + lock = xfs_ilock_map_shared(ip); + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. + */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | + ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP); + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { + error = 0; + goto unlock_and_return; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? subnex : nexleft; + error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + bmapi_flags, NULL, 0, map, &nmap, NULL); + if (error) + goto unlock_and_return; + ASSERT(nmap <= subnex); + + for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + nexleft--; + oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? + BMV_OF_PREALLOC : 0; + out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); + out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + if (prealloced && + map[i].br_startblock == HOLESTARTBLOCK && + out.bmv_offset + out.bmv_length == bmvend) { + /* + * came to hole at end of file + */ + goto unlock_and_return; + } else { + out.bmv_block = + (map[i].br_startblock == HOLESTARTBLOCK) ? + -1 : + XFS_FSB_TO_DB(ip, map[i].br_startblock); + + /* return either getbmap/getbmapx structure. */ + if (interface & BMV_IF_EXTENDED) { + struct getbmapx outx; + + GETBMAP_CONVERT(out,outx); + outx.bmv_oflags = oflags; + outx.bmv_unused1 = outx.bmv_unused2 = 0; + if (copy_to_user(ap, &outx, + sizeof(outx))) { + error = XFS_ERROR(EFAULT); + goto unlock_and_return; + } + } else { + if (copy_to_user(ap, &out, + sizeof(out))) { + error = XFS_ERROR(EFAULT); + goto unlock_and_return; + } + } + bmv->bmv_offset = + out.bmv_offset + out.bmv_length; + bmv->bmv_length = MAX((__int64_t)0, + (__int64_t)(bmvend - bmv->bmv_offset)); + bmv->bmv_entries++; + ap = (interface & BMV_IF_EXTENDED) ? + (void __user *) + ((struct getbmapx __user *)ap + 1) : + (void __user *) + ((struct getbmap __user *)ap + 1); + } + } + } while (nmap && nexleft && bmv->bmv_length); + +unlock_and_return: + xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + kmem_free(map, subnex * sizeof(*map)); + + return error; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + */ +int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + char *aeof) /* return value */ +{ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */ + xfs_extnum_t nextents; /* size of extent list */ + xfs_bmbt_irec_t s; /* expanded extent list entry */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *aeof = 1; + return 0; + } + /* + * Go to the last extent + */ + lastrec = &ifp->if_u1.if_extents[nextents - 1]; + xfs_bmbt_get_all(lastrec, &s); + /* + * Check we are allocating in the last extent (for delayed allocations) + * or past the last extent for non-delayed allocations. + */ + *aeof = (off >= s.br_startoff && + off < s.br_startoff + s.br_blockcount && + ISNULLSTARTBLOCK(s.br_startblock)) || + off >= s.br_startoff + s.br_blockcount; + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. + */ +int /* error */ +xfs_bmap_eof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t endoff, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + int *eof) /* result value */ +{ + xfs_fsblock_t blockcount; /* extent block count */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */ + xfs_extnum_t nextents; /* size of extent list */ + xfs_fileoff_t startoff; /* extent starting file offset */ + + ASSERT(whichfork == XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(NULL, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *eof = 1; + return 0; + } + /* + * Go to the last extent + */ + lastrec = &ifp->if_u1.if_extents[nextents - 1]; + startoff = xfs_bmbt_get_startoff(lastrec); + blockcount = xfs_bmbt_get_blockcount(lastrec); + *eof = endoff >= startoff + blockcount; + return 0; +} + +#ifdef DEBUG +/* + * Check that the extents list for the inode ip is in the right order. + */ +STATIC void +xfs_bmap_check_extents( + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_t *base; /* base of extents list */ + xfs_bmbt_rec_t *ep; /* current extent entry */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extents in list */ + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + base = ifp->if_u1.if_extents; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (ep = base; ep < &base[nextents - 1]; ep++) { + xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, + (void *)(ep + 1)); + } +} + +STATIC +xfs_buf_t * +xfs_bmap_get_bp( + xfs_btree_cur_t *cur, + xfs_fsblock_t bno) +{ + int i; + xfs_buf_t *bp; + + if (!cur) + return(NULL); + + bp = NULL; + for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + bp = cur->bc_bufs[i]; + if (!bp) break; + if (XFS_BUF_ADDR(bp) == bno) + break; /* Found it */ + } + if (i == XFS_BTREE_MAXLEVELS) + bp = NULL; + + if (!bp) { /* Chase down all the log items to see if the bp is there */ + xfs_log_item_chunk_t *licp; + xfs_trans_t *tp; + + tp = cur->bc_tp; + licp = &tp->t_items; + while (!bp && licp != NULL) { + if (XFS_LIC_ARE_ALL_FREE(licp)) { + licp = licp->lic_next; + continue; + } + for (i = 0; i < licp->lic_unused; i++) { + xfs_log_item_desc_t *lidp; + xfs_log_item_t *lip; + xfs_buf_log_item_t *bip; + xfs_buf_t *lbp; + + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + lidp = XFS_LIC_SLOT(licp, i); + lip = lidp->lid_item; + if (lip->li_type != XFS_LI_BUF) + continue; + + bip = (xfs_buf_log_item_t *)lip; + lbp = bip->bli_buf; + + if (XFS_BUF_ADDR(lbp) == bno) { + bp = lbp; + break; /* Found it */ + } + } + licp = licp->lic_next; + } + } + return(bp); +} + +void +xfs_check_block( + xfs_bmbt_block_t *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0); + + prevp = NULL; + for( i = 1; i <= INT_GET(block->bb_numrecs, ARCH_CONVERT);i++) { + dmxr = mp->m_bmap_dmxr[0]; + + if (root) { + keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz); + } else { + keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, i, dmxr); + } + + if (prevp) { + xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + + if (root) { + pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz); + } else { + pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, i, dmxr); + } + for (j = i+1; j <= INT_GET(block->bb_numrecs, ARCH_CONVERT); j++) { + if (root) { + thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); + } else { + thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, j, dmxr); + } + if (INT_GET(*thispa, ARCH_CONVERT) == + INT_GET(*pp, ARCH_CONVERT)) { + cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", + __FUNCTION__, j, i, + INT_GET(*thispa, ARCH_CONVERT)); + panic("%s: ptrs are equal in node\n", + __FUNCTION__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_block_t *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_bmbt_ptr_t *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep, *lastp; /* extent pointers in block entry */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0); + level = INT_GET(block->bb_level, ARCH_CONVERT); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); + bno = INT_GET(*pp, ARCH_CONVERT); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + XFS_BMAP_SANITY_CHECK(mp, block, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, + 1, mp->m_bmap_dmxr[1]); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0); + bno = INT_GET(*pp, ARCH_CONVERT); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + lastp = NULL; + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, + block, 1, mp->m_bmap_dmxr[0]); + + for (ep = frp;ep < frp + (num_recs - 1); ep++) { + if (lastp) { + xfs_btree_check_rec(XFS_BTNUM_BMAP, + (void *)lastp, (void *)ep); + } + xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, + (void *)(ep + 1)); + } + lastp = frp + num_recs - 1; /* For the next iteration */ + + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (bp) { + bp_release = 0; + } else { + bp_release = 1; + } + if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + goto error_norelse; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + cmn_err(CE_WARN, "%s: at error0", __FUNCTION__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", + i, __FUNCTION__); + panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__); + return; +} +#endif + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + xfs_bmbt_block_t *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_bmbt_ptr_t *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + if (unlikely(xfs_bmap_count_leaves(ifp->if_u1.if_extents, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0); + level = INT_GET(block->bb_level, ARCH_CONVERT); + pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); + bno = INT_GET(*pp, ARCH_CONVERT); + + if (unlikely(xfs_bmap_count_tree(mp, tp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks is use. + */ +int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + xfs_bmbt_ptr_t *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + xfs_bmbt_block_t *block, *nextblock; + int numrecs; + xfs_bmbt_rec_t *frp; + + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + + if (--level) { + /* Not at node above leafs, count this level of nodes */ + nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + while (nextbno != NULLFSBLOCK) { + if ((error = xfs_btree_read_bufl(mp, tp, nextbno, + 0, &nbp, XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); + nextbno = INT_GET(nextblock->bb_rightsib, ARCH_CONVERT); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + bno = INT_GET(*pp, ARCH_CONVERT); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, + xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]); + if (unlikely(xfs_bmap_count_leaves(frp, numrecs, count) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(2)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF))) + return error; + *count += 1; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + } + } + return 0; +} + +/* + * Count leaf blocks given a pointer to an extent list. + */ +int +xfs_bmap_count_leaves( + xfs_bmbt_rec_t *frp, + int numrecs, + int *count) +{ + int b; + + for ( b = 1; b <= numrecs; b++, frp++) + *count += xfs_bmbt_disk_get_blockcount(frp); + return 0; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h new file mode 100644 index 00000000000..f1bc22fb26a --- /dev/null +++ b/fs/xfs/xfs_bmap.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BMAP_H__ +#define __XFS_BMAP_H__ + +struct getbmap; +struct xfs_bmbt_irec; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +typedef struct xfs_bmap_free_item +{ + xfs_fsblock_t xbfi_startblock;/* starting fs block number */ + xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ + struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ +} xfs_bmap_free_item_t; + +/* + * Header for free extent list. + */ +typedef struct xfs_bmap_free +{ + xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + int xbf_count; /* count of items on list */ + int xbf_low; /* kludge: alloc in low mode */ +} xfs_bmap_free_t; + +#define XFS_BMAP_MAX_NMAP 4 + +/* + * Flags for xfs_bmapi + */ +#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ +#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ +#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ +#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ +#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ +#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ +#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ +#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ + /* combine contig. space */ +#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAPI_AFLAG) +int xfs_bmapi_aflag(int w); +#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) +#else +#define XFS_BMAPI_AFLAG(w) ((w) == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0) +#endif + +/* + * Special values for xfs_bmbt_irec_t br_startblock field. + */ +#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) +#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_INIT) +void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp); +#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp) +#else +#define XFS_BMAP_INIT(flp,fbp) \ + ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ + (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK) +#endif + +/* + * Argument structure for xfs_bmap_alloc. + */ +typedef struct xfs_bmalloca { + xfs_fsblock_t firstblock; /* i/o first block allocated */ + xfs_fsblock_t rval; /* starting block of new extent */ + xfs_fileoff_t off; /* offset in file filling in */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec *prevp; /* extent before the new one */ + struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ + xfs_extlen_t alen; /* i/o length asked/allocated */ + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* mininum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + char eof; /* set if allocating past last extent */ + char wasdel; /* replacing a delayed allocation */ + char userdata;/* set if is user data */ + char low; /* low on space, using seq'l ags */ + char aeof; /* allocated space at eof */ +} xfs_bmalloca_t; + +#ifdef __KERNEL__ + +#if defined(XFS_BMAP_TRACE) +/* + * Trace operations for bmap extent tracing + */ +#define XFS_BMAP_KTRACE_DELETE 1 +#define XFS_BMAP_KTRACE_INSERT 2 +#define XFS_BMAP_KTRACE_PRE_UP 3 +#define XFS_BMAP_KTRACE_POST_UP 4 + +#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */ +#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */ +extern ktrace_t *xfs_bmap_trace_buf; + +/* + * Add bmap trace insert entries for all the contents of the extent list. + */ +void +xfs_bmap_trace_exlist( + char *fname, /* function name */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in list */ + int whichfork); /* data or attr fork */ +#else +#define xfs_bmap_trace_exlist(f,ip,c,w) +#endif + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + struct xfs_inode *ip, /* incore inode pointer */ + int rsvd); /* flag for reserved block allocation */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + struct xfs_mount *mp); /* mount point structure */ + +/* + * Routine to clean up the free list data structure when + * an error occurs during a transaction. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist); /* free list to clean up */ + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. + */ +void +xfs_bmap_compute_maxlevels( + struct xfs_mount *mp, /* file system mount structure */ + int whichfork); /* data or attr fork */ + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. + * + * Return 1 if the given transaction was committed and a new one allocated, + * and 0 otherwise. + */ +int /* error */ +xfs_bmap_finish( + struct xfs_trans **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + xfs_fsblock_t firstblock, /* controlled a.g. for allocs */ + int *committed); /* xact committed or not */ + +/* + * Returns the file-relative block number of the first unused block in the file. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + */ +int /* error */ +xfs_bmap_first_unused( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *unused, /* unused block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_before( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork); /* data or attr fork */ + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent list. + * Returns 0 for local files, as they do not have an extent list. + */ +int /* error */ +xfs_bmap_last_offset( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *unused, /* last block num */ + int whichfork); /* data or attr fork */ + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int +xfs_bmap_one_block( + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Read in the extents to iu_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. + */ +int /* error */ +xfs_bmap_read_extents( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork); /* data or attr fork */ + +/* + * Map file blocks to filesystem blocks. + * File range is given by the bno/len pair. + * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) + * into a hole or past eof. + * Only allocates blocks from a single allocation group, + * to avoid locking problems. + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int /* error */ +xfs_bmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + xfs_bmap_free_t *flist); /* i/o: list extents to free */ + +/* + * Map file blocks to filesystem blocks, simple version. + * One block only, read-only. + * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. + * For the other flag values, the effect is as if XFS_BMAPI_METADATA + * was set and all the others were clear. + */ +int /* error */ +xfs_bmapi_single( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + xfs_fsblock_t *fsb, /* output: mapped block */ + xfs_fileoff_t bno); /* starting file offs. mapped */ + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. If not all extents in the block range can be removed then + * *done is set. + */ +int /* error */ +xfs_bunmapi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* XFS_BMAPI_... */ + xfs_extnum_t nexts, /* number of extents max */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done); /* set if not done yet */ + +/* + * Fcntl interface to xfs_bmapi. + */ +int /* error code */ +xfs_getbmap( + bhv_desc_t *bdp, /* XFS behavior descriptor*/ + struct getbmap *bmv, /* user bmap structure */ + void __user *ap, /* pointer to user's array */ + int iflags); /* interface flags */ + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + */ +int +xfs_bmap_isaeof( + struct xfs_inode *ip, + xfs_fileoff_t off, + int whichfork, + char *aeof); + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof); + +/* + * Count fsblocks of the given fork. + */ +int +xfs_bmap_count_blocks( + xfs_trans_t *tp, + struct xfs_inode *ip, + int whichfork, + int *count); + +/* + * Check an extent list, which has just been read, for + * any bit in the extent flag field. + */ +int +xfs_check_nostate_extents( + xfs_bmbt_rec_t *ep, + xfs_extnum_t num); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c new file mode 100644 index 00000000000..163305a79fc --- /dev/null +++ b/fs/xfs/xfs_bmap_btree.c @@ -0,0 +1,2807 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" + +#if defined(XFS_BMBT_TRACE) +ktrace_t *xfs_bmbt_trace_buf; +#endif + +/* + * Prototypes for internal btree functions. + */ + + +STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *); +STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *, + xfs_bmbt_key_t *, xfs_btree_cur_t **, int *); +STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int); + + +#if defined(XFS_BMBT_TRACE) + +static char ARGS[] = "args"; +static char ENTRY[] = "entry"; +static char ERROR[] = "error"; +#undef EXIT +static char EXIT[] = "exit"; + +/* + * Add a trace buffer entry for the arguments given to the routine, + * generic form. + */ +STATIC void +xfs_bmbt_trace_enter( + char *func, + xfs_btree_cur_t *cur, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + xfs_inode_t *ip; + int whichfork; + + ip = cur->bc_private.b.ip; + whichfork = cur->bc_private.b.whichfork; + ktrace_enter(xfs_bmbt_trace_buf, + (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), + (void *)func, (void *)s, (void *)ip, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); + ASSERT(ip->i_btrace); + ktrace_enter(ip->i_btrace, + (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), + (void *)func, (void *)s, (void *)ip, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} +/* + * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. + */ +STATIC void +xfs_bmbt_trace_argbi( + char *func, + xfs_btree_cur_t *cur, + xfs_buf_t *b, + int i, + int line) +{ + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line, + (__psunsigned_t)b, i, 0, 0, + 0, 0, 0, 0, + 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 2 integer args. + */ +STATIC void +xfs_bmbt_trace_argbii( + char *func, + xfs_btree_cur_t *cur, + xfs_buf_t *b, + int i0, + int i1, + int line) +{ + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line, + (__psunsigned_t)b, i0, i1, 0, + 0, 0, 0, 0, + 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for 3 block-length args + * and an integer arg. + */ +STATIC void +xfs_bmbt_trace_argfffi( + char *func, + xfs_btree_cur_t *cur, + xfs_dfiloff_t o, + xfs_dfsbno_t b, + xfs_dfilblks_t i, + int j, + int line) +{ + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line, + o >> 32, (int)o, b >> 32, (int)b, + i >> 32, (int)i, (int)j, 0, + 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for one integer arg. + */ +STATIC void +xfs_bmbt_trace_argi( + char *func, + xfs_btree_cur_t *cur, + int i, + int line) +{ + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line, + i, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, key. + */ +STATIC void +xfs_bmbt_trace_argifk( + char *func, + xfs_btree_cur_t *cur, + int i, + xfs_fsblock_t f, + xfs_bmbt_key_t *k, + int line) +{ + xfs_dfsbno_t d; + xfs_dfiloff_t o; + + d = (xfs_dfsbno_t)f; + o = INT_GET(k->br_startoff, ARCH_CONVERT); + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, + i, d >> 32, (int)d, o >> 32, + (int)o, 0, 0, 0, + 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, rec. + */ +STATIC void +xfs_bmbt_trace_argifr( + char *func, + xfs_btree_cur_t *cur, + int i, + xfs_fsblock_t f, + xfs_bmbt_rec_t *r, + int line) +{ + xfs_dfsbno_t b; + xfs_dfilblks_t c; + xfs_dfsbno_t d; + xfs_dfiloff_t o; + xfs_bmbt_irec_t s; + + d = (xfs_dfsbno_t)f; + xfs_bmbt_disk_get_all(r, &s); + o = (xfs_dfiloff_t)s.br_startoff; + b = (xfs_dfsbno_t)s.br_startblock; + c = s.br_blockcount; + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line, + i, d >> 32, (int)d, o >> 32, + (int)o, b >> 32, (int)b, c >> 32, + (int)c, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, key. + */ +STATIC void +xfs_bmbt_trace_argik( + char *func, + xfs_btree_cur_t *cur, + int i, + xfs_bmbt_key_t *k, + int line) +{ + xfs_dfiloff_t o; + + o = INT_GET(k->br_startoff, ARCH_CONVERT); + xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, + i, o >> 32, (int)o, 0, + 0, 0, 0, 0, + 0, 0, 0); +} + +/* + * Add a trace buffer entry for the cursor/operation. + */ +STATIC void +xfs_bmbt_trace_cursor( + char *func, + xfs_btree_cur_t *cur, + char *s, + int line) +{ + xfs_bmbt_rec_t r; + + xfs_bmbt_set_all(&r, &cur->bc_rec.b); + xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line, + (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) | + cur->bc_private.b.allocated, + INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT), + (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1], + (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3], + (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], + (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); +} + +#define XFS_BMBT_TRACE_ARGBI(c,b,i) \ + xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__) +#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ + xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__) +#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ + xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) +#define XFS_BMBT_TRACE_ARGI(c,i) \ + xfs_bmbt_trace_argi(fname, c, i, __LINE__) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \ + xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__) +#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ + xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) +#define XFS_BMBT_TRACE_ARGIK(c,i,k) \ + xfs_bmbt_trace_argik(fname, c, i, k, __LINE__) +#define XFS_BMBT_TRACE_CURSOR(c,s) \ + xfs_bmbt_trace_cursor(fname, c, s, __LINE__) +#else +#define XFS_BMBT_TRACE_ARGBI(c,b,i) +#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) +#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) +#define XFS_BMBT_TRACE_ARGI(c,i) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) +#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) +#define XFS_BMBT_TRACE_ARGIK(c,i,k) +#define XFS_BMBT_TRACE_CURSOR(c,s) +#endif /* XFS_BMBT_TRACE */ + + +/* + * Internal functions. + */ + +/* + * Delete record pointed to by cur/level. + */ +STATIC int /* error */ +xfs_bmbt_delrec( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block; /* bmap btree block */ + xfs_fsblock_t bno; /* fs-relative block number */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_delrec"; +#endif + int i; /* loop counter */ + int j; /* temp state */ + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ + xfs_fsblock_t lbno; /* left sibling block number */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + int lrecs=0; /* left record count */ + xfs_bmbt_rec_t *lrp; /* left record pointer */ + xfs_mount_t *mp; /* file system mount point */ + xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ + int ptr; /* key/record index */ + xfs_fsblock_t rbno; /* right sibling block number */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp; /* right btree key */ + xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */ + xfs_bmbt_ptr_t *rpp; /* right address pointer */ + xfs_bmbt_block_t *rrblock; /* right-right btree block */ + xfs_buf_t *rrbp; /* right-right buffer pointer */ + int rrecs=0; /* right record count */ + xfs_bmbt_rec_t *rrp; /* right record pointer */ + xfs_btree_cur_t *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + int numlrecs, numrrecs; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + ptr = cur->bc_ptrs[level]; + tcur = (xfs_btree_cur_t *)0; + if (ptr == 0) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + block = xfs_bmbt_get_block(cur, level, &bp); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } +#endif + if (ptr > numrecs) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + XFS_STATS_INC(xs_bmbt_delrec); + if (level > 0) { + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); +#ifdef DEBUG + for (i = ptr; i < numrecs; i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + } +#endif + if (ptr < numrecs) { + memmove(&kp[ptr - 1], &kp[ptr], + (numrecs - ptr) * sizeof(*kp)); + memmove(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */ + (numrecs - ptr) * sizeof(*pp)); + xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1); + xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1); + } + } else { + rp = XFS_BMAP_REC_IADDR(block, 1, cur); + if (ptr < numrecs) { + memmove(&rp[ptr - 1], &rp[ptr], + (numrecs - ptr) * sizeof(*rp)); + xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); + } + if (ptr == 1) { + INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp)); + kp = &key; + } + } + numrecs--; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); + /* + * We're at the root level. + * First, shrink the root block in-memory. + * Try to get rid of the next level down. + * If we can't then there's nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + if ((error = xfs_bmbt_killroot(cur))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) { + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT); + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. + */ + if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK && + level == cur->bc_nlevels - 2) { + if ((error = xfs_bmbt_killroot(cur))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK); + if ((error = xfs_btree_dup_cursor(cur, &tcur))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + bno = NULLFSBLOCK; + if (rbno != NULLFSBLOCK) { + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_increment(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + rbp = tcur->bc_bufs[level]; + right = XFS_BUF_TO_BMBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } +#endif + bno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_BMAP_BLOCK_IMINRECS(level, cur)) { + if ((error = xfs_bmbt_lshift(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_BMAP_BLOCK_IMINRECS(level, tcur)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level > 0) { + if ((error = xfs_bmbt_decrement(cur, + level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, + ERROR); + goto error0; + } + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + } + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (lbno != NULLFSBLOCK) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_decrement(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + if (lbno != NULLFSBLOCK) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * decrement to last in block + */ + if ((error = xfs_bmbt_decrement(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + lbp = tcur->bc_bufs[level]; + left = XFS_BUF_TO_BMBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } +#endif + bno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_BMAP_BLOCK_IMINRECS(level, cur)) { + if ((error = xfs_bmbt_rshift(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_BMAP_BLOCK_IMINRECS(level, tcur)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + } + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + mp = cur->bc_mp; + ASSERT(bno != NULLFSBLOCK); + if (lbno != NULLFSBLOCK && + lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + rbno = bno; + right = block; + rbp = bp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + left = XFS_BUF_TO_BMBT_BLOCK(lbp); + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + } else if (rbno != NULLFSBLOCK && + rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= + XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + lbno = bno; + left = block; + lbp = bp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + right = XFS_BUF_TO_BMBT_BLOCK(rbp); + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } else { + if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + numlrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + numrrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur); + lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < numrrecs; i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + } +#endif + memcpy(lkp, rkp, numrrecs * sizeof(*lkp)); + memcpy(lpp, rpp, numrrecs * sizeof(*lpp)); + xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs); + xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); + } else { + lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + memcpy(lrp, rrp, numrrecs * sizeof(*lrp)); + xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); + } + INT_MOD(left->bb_numrecs, ARCH_CONVERT, numrrecs); + left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */ + xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS); + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, + INT_GET(left->bb_rightsib, ARCH_CONVERT), + 0, &rrbp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); + if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno); + xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1, + cur->bc_private.b.flist, mp); + cur->bc_private.b.ip->i_d.di_nblocks--; + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(cur->bc_tp, rbp); + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + goto error0; + } + if (level > 0) + cur->bc_ptrs[level]--; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 2; + return 0; + +error0: + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +#ifdef DEBUG +/* + * Get the data from the pointed-to record. + */ +int +xfs_bmbt_get_rec( + xfs_btree_cur_t *cur, + xfs_fileoff_t *off, + xfs_fsblock_t *bno, + xfs_filblks_t *len, + xfs_exntst_t *state, + int *stat) +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; +#ifdef DEBUG + int error; +#endif + int ptr; + xfs_bmbt_rec_t *rp; + + block = xfs_bmbt_get_block(cur, 0, &bp); + ptr = cur->bc_ptrs[0]; +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) + return error; +#endif + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) { + *stat = 0; + return 0; + } + rp = XFS_BMAP_REC_IADDR(block, ptr, cur); + *off = xfs_bmbt_disk_get_startoff(rp); + *bno = xfs_bmbt_disk_get_startblock(rp); + *len = xfs_bmbt_disk_get_blockcount(rp); + *state = xfs_bmbt_disk_get_state(rp); + *stat = 1; + return 0; +} +#endif + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int /* error */ +xfs_bmbt_insrec( + xfs_btree_cur_t *cur, + int level, + xfs_fsblock_t *bnop, + xfs_bmbt_rec_t *recp, + xfs_btree_cur_t **curp, + int *stat) /* no-go/done/continue */ +{ + xfs_bmbt_block_t *block; /* bmap btree block */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_insrec"; +#endif + int i; /* loop index */ + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ + int logflags; /* inode logging flags */ + xfs_fsblock_t nbno; /* new block number */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + xfs_bmbt_key_t nkey; /* new btree key value */ + xfs_bmbt_rec_t nrec; /* new record count */ + int optr; /* old key/record index */ + xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ + int ptr; /* key/record index */ + xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */ + int numrecs; + + ASSERT(level < cur->bc_nlevels); + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); + ncur = (xfs_btree_cur_t *)0; + INT_SET(key.br_startoff, ARCH_CONVERT, + xfs_bmbt_disk_get_startoff(recp)); + optr = ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + XFS_STATS_INC(xs_bmbt_insrec); + block = xfs_bmbt_get_block(cur, level, &bp); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (ptr <= numrecs) { + if (level == 0) { + rp = XFS_BMAP_REC_IADDR(block, ptr, cur); + xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp); + } else { + kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); + xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp); + } + } +#endif + nbno = NULLFSBLOCK; + if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { + /* + * A root block, that can be made bigger. + */ + xfs_iroot_realloc(cur->bc_private.b.ip, 1, + cur->bc_private.b.whichfork); + block = xfs_bmbt_get_block(cur, level, &bp); + } else if (level == cur->bc_nlevels - 1) { + if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) || + *stat == 0) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + logflags); + block = xfs_bmbt_get_block(cur, level, &bp); + } else { + if ((error = xfs_bmbt_rshift(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (i) { + /* nothing */ + } else { + if ((error = xfs_bmbt_lshift(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (i) { + optr = ptr = cur->bc_ptrs[level]; + } else { + if ((error = xfs_bmbt_split(cur, level, + &nbno, &nkey, &ncur, + &i))) { + XFS_BMBT_TRACE_CURSOR(cur, + ERROR); + return error; + } + if (i) { + block = xfs_bmbt_get_block( + cur, level, &bp); +#ifdef DEBUG + if ((error = + xfs_btree_check_lblock(cur, + block, level, bp))) { + XFS_BMBT_TRACE_CURSOR( + cur, ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + xfs_bmbt_disk_set_allf(&nrec, + nkey.br_startoff, 0, 0, + XFS_EXT_NORM); + } else { + XFS_BMBT_TRACE_CURSOR(cur, + EXIT); + *stat = 0; + return 0; + } + } + } + } + } + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + if (level > 0) { + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); +#ifdef DEBUG + for (i = numrecs; i >= ptr; i--) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + memmove(&kp[ptr], &kp[ptr - 1], + (numrecs - ptr + 1) * sizeof(*kp)); + memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */ + (numrecs - ptr + 1) * sizeof(*pp)); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop, + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + kp[ptr - 1] = key; + INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_bmbt_log_keys(cur, bp, ptr, numrecs); + xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs); + } else { + rp = XFS_BMAP_REC_IADDR(block, 1, cur); + memmove(&rp[ptr], &rp[ptr - 1], + (numrecs - ptr + 1) * sizeof(*rp)); + rp[ptr - 1] = *recp; + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_bmbt_log_recs(cur, bp, ptr, numrecs); + } + xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); +#ifdef DEBUG + if (ptr < numrecs) { + if (level == 0) + xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1, + rp + ptr); + else + xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1, + kp + ptr); + } +#endif + if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + *bnop = nbno; + if (nbno != NULLFSBLOCK) { + *recp = nrec; + *curp = ncur; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_bmbt_killroot( + xfs_btree_cur_t *cur) +{ + xfs_bmbt_block_t *block; + xfs_bmbt_block_t *cblock; + xfs_buf_t *cbp; + xfs_bmbt_key_t *ckp; + xfs_bmbt_ptr_t *cpp; +#ifdef DEBUG + int error; +#endif +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_killroot"; +#endif + int i; + xfs_bmbt_key_t *kp; + xfs_inode_t *ip; + xfs_ifork_t *ifp; + int level; + xfs_bmbt_ptr_t *pp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + level = cur->bc_nlevels - 1; + ASSERT(level >= 1); + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + if (level == 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + block = xfs_bmbt_get_block(cur, level, &cbp); + /* + * Give up if the root has multiple children. + */ + if (INT_GET(block->bb_numrecs, ARCH_CONVERT) != 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + cbp = cur->bc_bufs[level - 1]; + cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); + if (INT_GET(cblock->bb_numrecs, ARCH_CONVERT) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + ASSERT(INT_GET(cblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO); + ASSERT(INT_GET(cblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO); + ip = cur->bc_private.b.ip; + ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork); + ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) == + XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes)); + i = (int)(INT_GET(cblock->bb_numrecs, ARCH_CONVERT) - XFS_BMAP_BLOCK_IMAXRECS(level, cur)); + if (i) { + xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + INT_MOD(block->bb_numrecs, ARCH_CONVERT, i); + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) == INT_GET(cblock->bb_numrecs, ARCH_CONVERT)); + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); + memcpy(kp, ckp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*kp)); + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); + cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + memcpy(pp, cpp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*pp)); + xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1, + cur->bc_private.b.flist, cur->bc_mp); + ip->i_d.di_nblocks--; + XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip, + XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(cur->bc_tp, cbp); + cur->bc_bufs[level - 1] = NULL; + INT_MOD(block->bb_level, ARCH_CONVERT, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_bmbt_log_keys( + xfs_btree_cur_t *cur, + xfs_buf_t *bp, + int kfirst, + int klast) +{ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_keys"; +#endif + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast); + tp = cur->bc_tp; + if (bp) { + xfs_bmbt_block_t *block; + int first; + xfs_bmbt_key_t *kp; + int last; + + block = XFS_BUF_TO_BMBT_BLOCK(bp); + kp = XFS_BMAP_KEY_DADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(tp, bp, first, last); + } else { + xfs_inode_t *ip; + + ip = cur->bc_private.b.ip; + xfs_trans_log_inode(tp, ip, + XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +/* + * Log pointer values from the btree block. + */ +STATIC void +xfs_bmbt_log_ptrs( + xfs_btree_cur_t *cur, + xfs_buf_t *bp, + int pfirst, + int plast) +{ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_ptrs"; +#endif + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast); + tp = cur->bc_tp; + if (bp) { + xfs_bmbt_block_t *block; + int first; + int last; + xfs_bmbt_ptr_t *pp; + + block = XFS_BUF_TO_BMBT_BLOCK(bp); + pp = XFS_BMAP_PTR_DADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(tp, bp, first, last); + } else { + xfs_inode_t *ip; + + ip = cur->bc_private.b.ip; + xfs_trans_log_inode(tp, ip, + XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + */ +STATIC int /* error */ +xfs_bmbt_lookup( + xfs_btree_cur_t *cur, + xfs_lookup_t dir, + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block=NULL; + xfs_buf_t *bp; + xfs_daddr_t d; + xfs_sfiloff_t diff; + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_lookup"; +#endif + xfs_fsblock_t fsbno=0; + int high; + int i; + int keyno=0; + xfs_bmbt_key_t *kkbase=NULL; + xfs_bmbt_key_t *kkp; + xfs_bmbt_rec_t *krbase=NULL; + xfs_bmbt_rec_t *krp; + int level; + int low; + xfs_mount_t *mp; + xfs_bmbt_ptr_t *pp; + xfs_bmbt_irec_t *rp; + xfs_fileoff_t startoff; + xfs_trans_t *tp; + + XFS_STATS_INC(xs_bmbt_lookup); + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, (int)dir); + tp = cur->bc_tp; + mp = cur->bc_mp; + rp = &cur->bc_rec.b; + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + if (level < cur->bc_nlevels - 1) { + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) != d) + bp = (xfs_buf_t *)0; + if (!bp) { + if ((error = xfs_btree_read_bufl(mp, tp, fsbno, + 0, &bp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + xfs_btree_setbuf(cur, level, bp); + block = XFS_BUF_TO_BMBT_BLOCK(bp); + if ((error = xfs_btree_check_lblock(cur, block, + level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } else + block = XFS_BUF_TO_BMBT_BLOCK(bp); + } else + block = xfs_bmbt_get_block(cur, level, &bp); + if (diff == 0) + keyno = 1; + else { + if (level > 0) + kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur); + else + krbase = XFS_BMAP_REC_IADDR(block, 1, cur); + low = 1; + if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) { + ASSERT(level == 0); + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + while (low <= high) { + XFS_STATS_INC(xs_bmbt_compare); + keyno = (low + high) >> 1; + if (level > 0) { + kkp = kkbase + keyno - 1; + startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT); + } else { + krp = krbase + keyno - 1; + startoff = xfs_bmbt_disk_get_startoff(krp); + } + diff = (xfs_sfiloff_t) + (startoff - rp->br_startoff); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + if (level > 0) { + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + fsbno = INT_GET(*pp, ARCH_CONVERT); + cur->bc_ptrs[level] = keyno; + } + } + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + if (dir == XFS_LOOKUP_GE && keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) && + INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + cur->bc_ptrs[0] = keyno; + if ((error = xfs_bmbt_increment(cur, 0, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + } + else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + } else { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); + } + return 0; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_bmbt_lshift( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_lshift"; +#endif +#ifdef DEBUG + int i; /* loop counter */ +#endif + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp=NULL; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + int lrecs; /* left record count */ + xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */ + xfs_mount_t *mp; /* file system mount point */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp=NULL; /* right btree key */ + xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */ + xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ + int rrecs; /* right record count */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + if (level == cur->bc_nlevels - 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + rbp = cur->bc_bufs[level]; + right = XFS_BUF_TO_BMBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + if (cur->bc_ptrs[level] <= 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + mp = cur->bc_mp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, + &lbp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + left = XFS_BUF_TO_BMBT_BLOCK(lbp); + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1; + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + *lkp = *rkp; + xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs); + lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + *lpp = *rpp; /* INT_: direct copy */ + xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs); + } else { + lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + *lrp = *rrp; + xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs); + } + INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs); + xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); +#ifdef DEBUG + if (level > 0) + xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp); + else + xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp); +#endif + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; + INT_SET(right->bb_numrecs, ARCH_CONVERT, rrecs); + xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); + if (level > 0) { +#ifdef DEBUG + for (i = 0; i < rrecs; i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + memmove(rkp, rkp + 1, rrecs * sizeof(*rkp)); + memmove(rpp, rpp + 1, rrecs * sizeof(*rpp)); + xfs_bmbt_log_keys(cur, rbp, 1, rrecs); + xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs); + } else { + memmove(rrp, rrp + 1, rrecs * sizeof(*rrp)); + xfs_bmbt_log_recs(cur, rbp, 1, rrecs); + INT_SET(key.br_startoff, ARCH_CONVERT, + xfs_bmbt_disk_get_startoff(rrp)); + rkp = &key; + } + if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_ptrs[level]--; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_bmbt_rshift( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_rshift"; +#endif + int i; /* loop counter */ + xfs_bmbt_key_t key; /* bmap btree key */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + xfs_bmbt_rec_t *lrp; /* left record pointer */ + xfs_mount_t *mp; /* file system mount point */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp; /* right btree key */ + xfs_bmbt_ptr_t *rpp; /* right address pointer */ + xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + if (level == cur->bc_nlevels - 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_BMBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + mp = cur->bc_mp; + if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, + &rbp, XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + right = XFS_BUF_TO_BMBT_BLOCK(rbp); + if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + lpp = XFS_BMAP_PTR_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + *rkp = *lkp; + *rpp = *lpp; /* INT_: direct copy */ + xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + } else { + lrp = XFS_BMAP_REC_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + *rrp = *lrp; + xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + INT_SET(key.br_startoff, ARCH_CONVERT, + xfs_bmbt_disk_get_startoff(rrp)); + rkp = &key; + } + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1); + xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); +#ifdef DEBUG + if (level > 0) + xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1); + else + xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1); +#endif + xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); + if ((error = xfs_btree_dup_cursor(cur, &tcur))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_increment(tcur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(tcur, ERROR); + goto error1; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) { + XFS_BMBT_TRACE_CURSOR(tcur, ERROR); + goto error1; + } + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +error0: + XFS_BMBT_TRACE_CURSOR(cur, ERROR); +error1: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Determine the extent state. + */ +/* ARGSUSED */ +STATIC xfs_exntst_t +xfs_extent_state( + xfs_filblks_t blks, + int extent_flag) +{ + if (extent_flag) { + ASSERT(blks != 0); /* saved for DMIG */ + return XFS_EXT_UNWRITTEN; + } + return XFS_EXT_NORM; +} + + +/* + * Split cur/level block in half. + * Return new block number and its first record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_bmbt_split( + xfs_btree_cur_t *cur, + int level, + xfs_fsblock_t *bnop, + xfs_bmbt_key_t *keyp, + xfs_btree_cur_t **curp, + int *stat) /* success/failure */ +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_split"; +#endif + int i; /* loop counter */ + xfs_fsblock_t lbno; /* left sibling block number */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_bmbt_block_t *left; /* left btree block */ + xfs_bmbt_key_t *lkp; /* left btree key */ + xfs_bmbt_ptr_t *lpp; /* left address pointer */ + xfs_bmbt_rec_t *lrp; /* left record pointer */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_bmbt_block_t *right; /* right btree block */ + xfs_bmbt_key_t *rkp; /* right btree key */ + xfs_bmbt_ptr_t *rpp; /* right address pointer */ + xfs_bmbt_block_t *rrblock; /* right-right btree block */ + xfs_buf_t *rrbp; /* right-right buffer pointer */ + xfs_bmbt_rec_t *rrp; /* right record pointer */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + lbp = cur->bc_bufs[level]; + lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp)); + left = XFS_BUF_TO_BMBT_BLOCK(lbp); + args.fsbno = cur->bc_private.b.firstblock; + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = lbno; + args.type = XFS_ALLOCTYPE_START_BNO; + } else if (cur->bc_private.b.flist->xbf_low) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.mod = args.minleft = args.alignment = args.total = args.isfl = + args.userdata = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return XFS_ERROR(ENOSPC); + } + if ((error = xfs_alloc_vextent(&args))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0); + right = XFS_BUF_TO_BMBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + INT_SET(right->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC); + right->bb_level = left->bb_level; /* INT_: direct copy */ + INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2)); + if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) && + cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1) + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); + i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1; + if (level > 0) { + lkp = XFS_BMAP_KEY_IADDR(left, i, cur); + lpp = XFS_BMAP_PTR_IADDR(left, i, cur); + rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); + rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); + xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT); + } else { + lrp = XFS_BMAP_REC_IADDR(left, i, cur); + rrp = XFS_BMAP_REC_IADDR(right, 1, cur); + memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp); + } + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT))); + right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */ + INT_SET(left->bb_rightsib, ARCH_CONVERT, args.fsbno); + INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno); + xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + if ((error = xfs_btree_read_bufl(args.mp, args.tp, + INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); + if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.fsbno); + xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + if (level + 1 < cur->bc_nlevels) { + if ((error = xfs_btree_dup_cursor(cur, curp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + (*curp)->bc_ptrs[level + 1]++; + } + *bnop = args.fsbno; + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + + +/* + * Update keys for the record. + */ +STATIC int +xfs_bmbt_updkey( + xfs_btree_cur_t *cur, + xfs_bmbt_key_t *keyp, /* on-disk format */ + int level) +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; +#ifdef DEBUG + int error; +#endif +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_updkey"; +#endif + xfs_bmbt_key_t *kp; + int ptr; + + ASSERT(level >= 1); + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGIK(cur, level, keyp); + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { + block = xfs_bmbt_get_block(cur, level, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); + *kp = *keyp; + xfs_bmbt_log_keys(cur, bp, ptr, ptr); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; +} + +/* + * Convert on-disk form of btree root to in-memory form. + */ +void +xfs_bmdr_to_bmbt( + xfs_bmdr_block_t *dblock, + int dblocklen, + xfs_bmbt_block_t *rblock, + int rblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_bmbt_key_t *tkp; + xfs_bmbt_ptr_t *tpp; + + INT_SET(rblock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC); + rblock->bb_level = dblock->bb_level; /* both in on-disk format */ + ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0); + rblock->bb_numrecs = dblock->bb_numrecs;/* both in on-disk format */ + INT_SET(rblock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO); + INT_SET(rblock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO); + dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); + fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); + fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); + dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ +} + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_bmbt_decrement( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_decrement"; +#endif + xfs_fsblock_t fsbno; + int lev; + xfs_mount_t *mp; + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + ASSERT(level < cur->bc_nlevels); + if (level < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + if (--cur->bc_ptrs[level] > 0) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + block = xfs_bmbt_get_block(cur, level, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + if (lev < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + if (lev == cur->bc_nlevels) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + tp = cur->bc_tp; + mp = cur->bc_mp; + for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { + fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_BMBT_BLOCK(bp); + if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT); + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + +/* + * Delete the record pointed to by cur. + */ +int /* error */ +xfs_bmbt_delete( + xfs_btree_cur_t *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_delete"; +#endif + int i; + int level; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + for (level = 0, i = 2; i == 2; level++) { + if ((error = xfs_bmbt_delrec(cur, level, &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + if ((error = xfs_bmbt_decrement(cur, level, + &i))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + break; + } + } + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = i; + return 0; +} + +/* + * Convert a compressed bmap extent record to an uncompressed form. + * This code must be in sync with the routines xfs_bmbt_get_startoff, + * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. + */ + +STATIC __inline__ void +__xfs_bmbt_get_all( + __uint64_t l0, + __uint64_t l1, + xfs_bmbt_irec_t *s) +{ + int ext_flag; + xfs_exntst_t st; + + ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); + s->br_startoff = ((xfs_fileoff_t)l0 & + XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +#if XFS_BIG_BLKNOS + s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) | + (((xfs_fsblock_t)l1) >> 21); +#else +#ifdef DEBUG + { + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) | + (((xfs_dfsbno_t)l1) >> 21); + ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + s->br_startblock = (xfs_fsblock_t)b; + } +#else /* !DEBUG */ + s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ + s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21)); + /* This is xfs_extent_state() in-line */ + if (ext_flag) { + ASSERT(s->br_blockcount != 0); /* saved for DMIG */ + st = XFS_EXT_UNWRITTEN; + } else + st = XFS_EXT_NORM; + s->br_state = st; +} + +void +xfs_bmbt_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(r->l0, r->l1, s); +} + +/* + * Get the block pointer for the given level of the cursor. + * Fill in the buffer pointer, if applicable. + */ +xfs_bmbt_block_t * +xfs_bmbt_get_block( + xfs_btree_cur_t *cur, + int level, + xfs_buf_t **bpp) +{ + xfs_ifork_t *ifp; + xfs_bmbt_block_t *rval; + + if (level < cur->bc_nlevels - 1) { + *bpp = cur->bc_bufs[level]; + rval = XFS_BUF_TO_BMBT_BLOCK(*bpp); + } else { + *bpp = NULL; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + rval = ifp->if_broot; + } + return rval; +} + +/* + * Extract the blockcount field from an in memory bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_t *r) +{ + return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21)); +} + +/* + * Extract the startblock field from an in memory bmap extent record. + */ +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_t *r) +{ +#if XFS_BIG_BLKNOS + return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) | + (((xfs_fsblock_t)r->l1) >> 21); +#else +#ifdef DEBUG + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) | + (((xfs_dfsbno_t)r->l1) >> 21); + ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + return (xfs_fsblock_t)b; +#else /* !DEBUG */ + return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Extract the startoff field from an in memory bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_t *r) +{ + return ((xfs_fileoff_t)r->l0 & + XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + +xfs_exntst_t +xfs_bmbt_get_state( + xfs_bmbt_rec_t *r) +{ + int ext_flag; + + ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); + return xfs_extent_state(xfs_bmbt_get_blockcount(r), + ext_flag); +} + +#if __BYTE_ORDER != __BIG_ENDIAN +/* Endian flipping versions of the bmbt extraction functions */ +void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __uint64_t l0, l1; + + l0 = INT_GET(r->l0, ARCH_CONVERT); + l1 = INT_GET(r->l1, ARCH_CONVERT); + + __xfs_bmbt_get_all(l0, l1, s); +} + +/* + * Extract the blockcount field from an on disk bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_disk_get_blockcount( + xfs_bmbt_rec_t *r) +{ + return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21)); +} + +/* + * Extract the startblock field from an on disk bmap extent record. + */ +xfs_fsblock_t +xfs_bmbt_disk_get_startblock( + xfs_bmbt_rec_t *r) +{ +#if XFS_BIG_BLKNOS + return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | + (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); +#else +#ifdef DEBUG + xfs_dfsbno_t b; + + b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | + (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); + ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + return (xfs_fsblock_t)b; +#else /* !DEBUG */ + return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); +#endif /* DEBUG */ +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Extract the startoff field from a disk format bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_disk_get_startoff( + xfs_bmbt_rec_t *r) +{ + return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) & + XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + +xfs_exntst_t +xfs_bmbt_disk_get_state( + xfs_bmbt_rec_t *r) +{ + int ext_flag; + + ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN)); + return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r), + ext_flag); +} +#endif + + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_bmbt_increment( + xfs_btree_cur_t *cur, + int level, + int *stat) /* success/failure */ +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_increment"; +#endif + xfs_fsblock_t fsbno; + int lev; + xfs_mount_t *mp; + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGI(cur, level); + ASSERT(level < cur->bc_nlevels); + if (level < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + block = xfs_bmbt_get_block(cur, level, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; + } + if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + block = xfs_bmbt_get_block(cur, lev, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) + break; + if (lev < cur->bc_nlevels - 1) + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + if (lev == cur->bc_nlevels) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + tp = cur->bc_tp; + mp = cur->bc_mp; + for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { + fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, + XFS_BMAP_BTREE_REF))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_BMBT_BLOCK(bp); + if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + cur->bc_ptrs[lev] = 1; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 1; + return 0; +} + +/* + * Insert the current record at the point referenced by cur. + */ +int /* error */ +xfs_bmbt_insert( + xfs_btree_cur_t *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_insert"; +#endif + int i; + int level; + xfs_fsblock_t nbno; + xfs_btree_cur_t *ncur; + xfs_bmbt_rec_t nrec; + xfs_btree_cur_t *pcur; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + level = 0; + nbno = NULLFSBLOCK; + xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); + ncur = (xfs_btree_cur_t *)0; + pcur = cur; + do { + if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, + &i))) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { + cur->bc_nlevels = pcur->bc_nlevels; + cur->bc_private.b.allocated += + pcur->bc_private.b.allocated; + pcur->bc_private.b.allocated = 0; + ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) || + (cur->bc_private.b.ip->i_d.di_flags & + XFS_DIFLAG_REALTIME)); + cur->bc_private.b.firstblock = + pcur->bc_private.b.firstblock; + ASSERT(cur->bc_private.b.flist == + pcur->bc_private.b.flist); + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + if (ncur) { + pcur = ncur; + ncur = (xfs_btree_cur_t *)0; + } + } while (nbno != NULLFSBLOCK); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = i; + return 0; +error0: + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; +} + +/* + * Log fields from the btree block header. + */ +void +xfs_bmbt_log_block( + xfs_btree_cur_t *cur, + xfs_buf_t *bp, + int fields) +{ + int first; +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_block"; +#endif + int last; + xfs_trans_t *tp; + static const short offsets[] = { + offsetof(xfs_bmbt_block_t, bb_magic), + offsetof(xfs_bmbt_block_t, bb_level), + offsetof(xfs_bmbt_block_t, bb_numrecs), + offsetof(xfs_bmbt_block_t, bb_leftsib), + offsetof(xfs_bmbt_block_t, bb_rightsib), + sizeof(xfs_bmbt_block_t) + }; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBI(cur, bp, fields); + tp = cur->bc_tp; + if (bp) { + xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, + &last); + xfs_trans_log_buf(tp, bp, first, last); + } else + xfs_trans_log_inode(tp, cur->bc_private.b.ip, + XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_bmbt_log_recs( + xfs_btree_cur_t *cur, + xfs_buf_t *bp, + int rfirst, + int rlast) +{ + xfs_bmbt_block_t *block; + int first; +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_log_recs"; +#endif + int last; + xfs_bmbt_rec_t *rp; + xfs_trans_t *tp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast); + ASSERT(bp); + tp = cur->bc_tp; + block = XFS_BUF_TO_BMBT_BLOCK(bp); + rp = XFS_BMAP_REC_DADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(tp, bp, first, last); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); +} + +int /* error */ +xfs_bmbt_lookup_eq( + xfs_btree_cur_t *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +int /* error */ +xfs_bmbt_lookup_ge( + xfs_btree_cur_t *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat); +} + +int /* error */ +xfs_bmbt_lookup_le( + xfs_btree_cur_t *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Give the bmap btree a new root block. Copy the old broot contents + * down into a real block and make the broot point to it. + */ +int /* error */ +xfs_bmbt_newroot( + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_block_t *block; /* bmap btree block */ + xfs_buf_t *bp; /* buffer for block */ + xfs_bmbt_block_t *cblock; /* child btree block */ + xfs_bmbt_key_t *ckp; /* child key pointer */ + xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ + int error; /* error return code */ +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_newroot"; +#endif +#ifdef DEBUG + int i; /* loop counter */ +#endif + xfs_bmbt_key_t *kp; /* pointer to bmap btree key */ + int level; /* btree level */ + xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + level = cur->bc_nlevels - 1; + block = xfs_bmbt_get_block(cur, level, &bp); + /* + * Copy the root into a real block. + */ + args.mp = cur->bc_mp; + pp = XFS_BMAP_PTR_IADDR(block, 1, cur); + args.tp = cur->bc_tp; + args.fsbno = cur->bc_private.b.firstblock; + args.mod = args.minleft = args.alignment = args.total = args.isfl = + args.userdata = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (args.fsbno == NULLFSBLOCK) { +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + args.fsbno = INT_GET(*pp, ARCH_CONVERT); + args.type = XFS_ALLOCTYPE_START_BNO; + } else if (args.wasdel) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_NEAR_BNO; + if ((error = xfs_alloc_vextent(&args))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0); + cblock = XFS_BUF_TO_BMBT_BLOCK(bp); + *cblock = *block; + INT_MOD(block->bb_level, ARCH_CONVERT, +1); + INT_SET(block->bb_numrecs, ARCH_CONVERT, 1); + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + kp = XFS_BMAP_KEY_IADDR(block, 1, cur); + ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); + memcpy(ckp, kp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*kp)); + cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + } +#endif + memcpy(cpp, pp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*pp)); +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno, + level))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + INT_SET(*pp, ARCH_CONVERT, args.fsbno); + xfs_iroot_realloc(cur->bc_private.b.ip, 1 - INT_GET(cblock->bb_numrecs, ARCH_CONVERT), + cur->bc_private.b.whichfork); + xfs_btree_setbuf(cur, level, bp); + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS); + xfs_bmbt_log_keys(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT)); + xfs_bmbt_log_ptrs(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT)); + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + *logflags |= + XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); + *stat = 1; + return 0; +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + int extent_flag; + + ASSERT((s->br_state == XFS_EXT_NORM) || + (s->br_state == XFS_EXT_UNWRITTEN)); + extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1; + ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); + ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); +#if XFS_BIG_BLKNOS + ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + ((xfs_bmbt_rec_base_t)s->br_startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); +#else /* !XFS_BIG_BLKNOS */ + if (ISNULLSTARTBLOCK(s->br_startblock)) { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); + r->l1 = XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + } else { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9); + r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set all the fields in a bmap extent record from the arguments. + */ +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t o, + xfs_fsblock_t b, + xfs_filblks_t c, + xfs_exntst_t v) +{ + int extent_flag; + + ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); + extent_flag = (v == XFS_EXT_NORM) ? 0 : 1; + ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); +#if XFS_BIG_BLKNOS + ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9) | + ((xfs_bmbt_rec_base_t)b >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); +#else /* !XFS_BIG_BLKNOS */ + if (ISNULLSTARTBLOCK(b)) { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); + r->l1 = XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + } else { + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9); + r->l1 = ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +#if __BYTE_ORDER != __BIG_ENDIAN +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + int extent_flag; + + ASSERT((s->br_state == XFS_EXT_NORM) || + (s->br_state == XFS_EXT_UNWRITTEN)); + extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1; + ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); + ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); +#if XFS_BIG_BLKNOS + ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + ((xfs_bmbt_rec_base_t)s->br_startblock >> 43)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); +#else /* !XFS_BIG_BLKNOS */ + if (ISNULLSTARTBLOCK(s->br_startblock)) { + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); + INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } else { + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)s->br_startoff << 9)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | + ((xfs_bmbt_rec_base_t)s->br_blockcount & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set all the fields in a disk format bmap extent record from the arguments. + */ +void +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t o, + xfs_fsblock_t b, + xfs_filblks_t c, + xfs_exntst_t v) +{ + int extent_flag; + + ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); + extent_flag = (v == XFS_EXT_NORM) ? 0 : 1; + ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); +#if XFS_BIG_BLKNOS + ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9) | + ((xfs_bmbt_rec_base_t)b >> 43)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); +#else /* !XFS_BIG_BLKNOS */ + if (ISNULLSTARTBLOCK(b)) { + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9) | + (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); + INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } else { + INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)o << 9)); + INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | + ((xfs_bmbt_rec_base_t)c & + (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + } +#endif /* XFS_BIG_BLKNOS */ +} +#endif + +/* + * Set the blockcount field in a bmap extent record. + */ +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_t *r, + xfs_filblks_t v) +{ + ASSERT((v & XFS_MASK64HI(43)) == 0); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) | + (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21)); +} + +/* + * Set the startblock field in a bmap extent record. + */ +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_t *r, + xfs_fsblock_t v) +{ +#if XFS_BIG_BLKNOS + ASSERT((v & XFS_MASK64HI(12)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) | + (xfs_bmbt_rec_base_t)(v >> 43); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) | + (xfs_bmbt_rec_base_t)(v << 21); +#else /* !XFS_BIG_BLKNOS */ + if (ISNULLSTARTBLOCK(v)) { + r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); + r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) | + ((xfs_bmbt_rec_base_t)v << 21) | + (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + } else { + r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9); + r->l1 = ((xfs_bmbt_rec_base_t)v << 21) | + (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + } +#endif /* XFS_BIG_BLKNOS */ +} + +/* + * Set the startoff field in a bmap extent record. + */ +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_t *r, + xfs_fileoff_t v) +{ + ASSERT((v & XFS_MASK64HI(9)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) | + ((xfs_bmbt_rec_base_t)v << 9) | + (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); +} + +/* + * Set the extent state field in a bmap extent record. + */ +void +xfs_bmbt_set_state( + xfs_bmbt_rec_t *r, + xfs_exntst_t v) +{ + ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); + if (v == XFS_EXT_NORM) + r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN); + else + r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN); +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_bmbt_to_bmdr( + xfs_bmbt_block_t *rblock, + int rblocklen, + xfs_bmdr_block_t *dblock, + int dblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_bmbt_key_t *tkp; + xfs_bmbt_ptr_t *tpp; + + ASSERT(INT_GET(rblock->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC); + ASSERT(INT_GET(rblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO); + ASSERT(INT_GET(rblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO); + ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0); + dblock->bb_level = rblock->bb_level; /* both in on-disk format */ + dblock->bb_numrecs = rblock->bb_numrecs;/* both in on-disk format */ + dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); + fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); + tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); + tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ +} + +/* + * Update the record to the passed values. + */ +int +xfs_bmbt_update( + xfs_btree_cur_t *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + xfs_exntst_t state) +{ + xfs_bmbt_block_t *block; + xfs_buf_t *bp; + int error; +#ifdef XFS_BMBT_TRACE + static char fname[] = "xfs_bmbt_update"; +#endif + xfs_bmbt_key_t key; + int ptr; + xfs_bmbt_rec_t *rp; + + XFS_BMBT_TRACE_CURSOR(cur, ENTRY); + XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno, + (xfs_dfilblks_t)len, (int)state); + block = xfs_bmbt_get_block(cur, 0, &bp); +#ifdef DEBUG + if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[0]; + rp = XFS_BMAP_REC_IADDR(block, ptr, cur); + xfs_bmbt_disk_set_allf(rp, off, bno, len, state); + xfs_bmbt_log_recs(cur, bp, ptr, ptr); + if (ptr > 1) { + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; + } + INT_SET(key.br_startoff, ARCH_CONVERT, off); + if ((error = xfs_bmbt_updkey(cur, &key, 1))) { + XFS_BMBT_TRACE_CURSOR(cur, ERROR); + return error; + } + XFS_BMBT_TRACE_CURSOR(cur, EXIT); + return 0; +} + +/* + * Check an extent list, which has just been read, for + * any bit in the extent flag field. ASSERT on debug + * kernels, as this condition should not occur. + * Return an error condition (1) if any flags found, + * otherwise return 0. + */ + +int +xfs_check_nostate_extents( + xfs_bmbt_rec_t *ep, + xfs_extnum_t num) +{ + for (; num > 0; num--, ep++) { + if ((ep->l0 >> + (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { + ASSERT(0); + return 1; + } + } + return 0; +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h new file mode 100644 index 00000000000..843ff12b4bf --- /dev/null +++ b/fs/xfs/xfs_bmap_btree.h @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2000,2002-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BMAP_BTREE_H__ +#define __XFS_BMAP_BTREE_H__ + +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ + +struct xfs_btree_cur; +struct xfs_btree_lblock; +struct xfs_mount; +struct xfs_inode; + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block +{ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * For 32-bit kernels, + * l0:31 is an extent flag (value 1 indicates non-normal). + * l0:0-30 and l1:9-31 are startoff. + * l1:0-8, l2:0-31, and l3:21-31 are startblock. + * l3:0-20 are blockcount. + * For 64-bit kernels, + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ +#define BMBT_EXNTFLAG_BITOFF 0 +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN) +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN) +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITOFF \ + (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN) +#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF) + +#else + +#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ +#define BMBT_EXNTFLAG_BITOFF 63 +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN) +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */ +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */ +#define BMBT_BLOCKCOUNT_BITLEN 21 + +#endif + + +#define BMBT_USE_64 1 + +typedef struct xfs_bmbt_rec_32 +{ + __uint32_t l0, l1, l2, l3; +} xfs_bmbt_rec_32_t; +typedef struct xfs_bmbt_rec_64 +{ + __uint64_t l0, l1; +} xfs_bmbt_rec_64_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) +#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#define DSTARTBLOCKMASK \ + (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLSTARTBLOCK) +int isnullstartblock(xfs_fsblock_t x); +#define ISNULLSTARTBLOCK(x) isnullstartblock(x) +#else +#define ISNULLSTARTBLOCK(x) (((x) & STARTBLOCKMASK) == STARTBLOCKMASK) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLDSTARTBLOCK) +int isnulldstartblock(xfs_dfsbno_t x); +#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x) +#else +#define ISNULLDSTARTBLOCK(x) (((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_NULLSTARTBLOCK) +xfs_fsblock_t nullstartblock(int k); +#define NULLSTARTBLOCK(k) nullstartblock(k) +#else +#define NULLSTARTBLOCK(k) \ + ((ASSERT(k < (1 << STARTBLOCKVALBITS))), (STARTBLOCKMASK | (k))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_STARTBLOCKVAL) +xfs_filblks_t startblockval(xfs_fsblock_t x); +#define STARTBLOCKVAL(x) startblockval(x) +#else +#define STARTBLOCKVAL(x) ((xfs_filblks_t)((x) & ~STARTBLOCKMASK)) +#endif + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Extent state and extent format macros. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTFMT_INODE ) +xfs_exntfmt_t xfs_extfmt_inode(struct xfs_inode *ip); +#define XFS_EXTFMT_INODE(x) xfs_extfmt_inode(x) +#else +#define XFS_EXTFMT_INODE(x) \ + (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \ + XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) +#endif +#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key +{ + xfs_dfiloff_t br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */ + /* btree block header type */ +typedef struct xfs_btree_lblock xfs_bmbt_block_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BMBT_BLOCK) +xfs_bmbt_block_t *xfs_buf_to_bmbt_block(struct xfs_buf *bp); +#define XFS_BUF_TO_BMBT_BLOCK(bp) xfs_buf_to_bmbt_block(bp) +#else +#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)(XFS_BUF_PTR(bp))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_DSIZE) +int xfs_bmap_rblock_dsize(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) xfs_bmap_rblock_dsize(lev,cur) +#else +#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_ISIZE) +int xfs_bmap_rblock_isize(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) xfs_bmap_rblock_isize(lev,cur) +#else +#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ + ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ + (cur)->bc_private.b.whichfork)->if_broot_bytes) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_IBLOCK_SIZE) +int xfs_bmap_iblock_size(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_IBLOCK_SIZE(lev,cur) xfs_bmap_iblock_size(lev,cur) +#else +#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DSIZE) +int xfs_bmap_block_dsize(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_BLOCK_DSIZE(lev,cur) xfs_bmap_block_dsize(lev,cur) +#else +#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \ + ((lev) == (cur)->bc_nlevels - 1 ? \ + XFS_BMAP_RBLOCK_DSIZE(lev,cur) : \ + XFS_BMAP_IBLOCK_SIZE(lev,cur)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_ISIZE) +int xfs_bmap_block_isize(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_BLOCK_ISIZE(lev,cur) xfs_bmap_block_isize(lev,cur) +#else +#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \ + ((lev) == (cur)->bc_nlevels - 1 ? \ + XFS_BMAP_RBLOCK_ISIZE(lev,cur) : \ + XFS_BMAP_IBLOCK_SIZE(lev,cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMAXRECS) +int xfs_bmap_block_dmaxrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) xfs_bmap_block_dmaxrecs(lev,cur) +#else +#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ + ((lev) == (cur)->bc_nlevels - 1 ? \ + XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ + xfs_bmdr, (lev) == 0) : \ + ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMAXRECS) +int xfs_bmap_block_imaxrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) xfs_bmap_block_imaxrecs(lev,cur) +#else +#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \ + ((lev) == (cur)->bc_nlevels - 1 ? \ + XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \ + xfs_bmbt, (lev) == 0) : \ + ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMINRECS) +int xfs_bmap_block_dminrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) xfs_bmap_block_dminrecs(lev,cur) +#else +#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \ + ((lev) == (cur)->bc_nlevels - 1 ? \ + XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ + xfs_bmdr, (lev) == 0) : \ + ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMINRECS) +int xfs_bmap_block_iminrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) xfs_bmap_block_iminrecs(lev,cur) +#else +#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \ + ((lev) == (cur)->bc_nlevels - 1 ? \ + XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \ + xfs_bmbt, (lev) == 0) : \ + ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_DADDR) +xfs_bmbt_rec_t * +xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_BMAP_REC_DADDR(bb,i,cur) xfs_bmap_rec_daddr(bb,i,cur) +#else +#define XFS_BMAP_REC_DADDR(bb,i,cur) \ + XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \ + xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_IADDR) +xfs_bmbt_rec_t * +xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_BMAP_REC_IADDR(bb,i,cur) xfs_bmap_rec_iaddr(bb,i,cur) +#else +#define XFS_BMAP_REC_IADDR(bb,i,cur) \ + XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \ + xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_DADDR) +xfs_bmbt_key_t * +xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_BMAP_KEY_DADDR(bb,i,cur) xfs_bmap_key_daddr(bb,i,cur) +#else +#define XFS_BMAP_KEY_DADDR(bb,i,cur) \ + XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \ + xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_IADDR) +xfs_bmbt_key_t * +xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_BMAP_KEY_IADDR(bb,i,cur) xfs_bmap_key_iaddr(bb,i,cur) +#else +#define XFS_BMAP_KEY_IADDR(bb,i,cur) \ + XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \ + xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_DADDR) +xfs_bmbt_ptr_t * +xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_BMAP_PTR_DADDR(bb,i,cur) xfs_bmap_ptr_daddr(bb,i,cur) +#else +#define XFS_BMAP_PTR_DADDR(bb,i,cur) \ + XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \ + xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_IADDR) +xfs_bmbt_ptr_t * +xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_BMAP_PTR_IADDR(bb,i,cur) xfs_bmap_ptr_iaddr(bb,i,cur) +#else +#define XFS_BMAP_PTR_IADDR(bb,i,cur) \ + XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \ + xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ + INT_GET((bb)->bb_level, ARCH_CONVERT), cur)) +#endif + +/* + * These are to be used when we know the size of the block and + * we don't have a cursor. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_REC_ADDR) +xfs_bmbt_rec_t *xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz); +#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) xfs_bmap_broot_rec_addr(bb,i,sz) +#else +#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ + XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_KEY_ADDR) +xfs_bmbt_key_t *xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz); +#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) xfs_bmap_broot_key_addr(bb,i,sz) +#else +#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \ + XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_PTR_ADDR) +xfs_bmbt_ptr_t *xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz); +#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) xfs_bmap_broot_ptr_addr(bb,i,sz) +#else +#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \ + XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_NUMRECS) +int xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb); +#define XFS_BMAP_BROOT_NUMRECS(bb) xfs_bmap_broot_numrecs(bb) +#else +#define XFS_BMAP_BROOT_NUMRECS(bb) (INT_GET((bb)->bb_numrecs, ARCH_CONVERT)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_MAXRECS) +int xfs_bmap_broot_maxrecs(int sz); +#define XFS_BMAP_BROOT_MAXRECS(sz) xfs_bmap_broot_maxrecs(sz) +#else +#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE_CALC) +int xfs_bmap_broot_space_calc(int nrecs); +#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) xfs_bmap_broot_space_calc(nrecs) +#else +#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ + ((int)(sizeof(xfs_bmbt_block_t) + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE) +int xfs_bmap_broot_space(xfs_bmdr_block_t *bb); +#define XFS_BMAP_BROOT_SPACE(bb) xfs_bmap_broot_space(bb) +#else +#define XFS_BMAP_BROOT_SPACE(bb) \ + XFS_BMAP_BROOT_SPACE_CALC(INT_GET((bb)->bb_numrecs, ARCH_CONVERT)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMDR_SPACE_CALC) +int xfs_bmdr_space_calc(int nrecs); +#define XFS_BMDR_SPACE_CALC(nrecs) xfs_bmdr_space_calc(nrecs) +#else +#define XFS_BMDR_SPACE_CALC(nrecs) \ + ((int)(sizeof(xfs_bmdr_block_t) + \ + ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))) +#endif + +/* + * Maximum number of bmap btree levels. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BM_MAXLEVELS) +int xfs_bm_maxlevels(struct xfs_mount *mp, int w); +#define XFS_BM_MAXLEVELS(mp,w) xfs_bm_maxlevels(mp,w) +#else +#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[w]) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_SANITY_CHECK) +int xfs_bmap_sanity_check(struct xfs_mount *mp, xfs_bmbt_block_t *bb, + int level); +#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \ + xfs_bmap_sanity_check(mp,bb,level) +#else +#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \ + (INT_GET((bb)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC && \ + INT_GET((bb)->bb_level, ARCH_CONVERT) == level && \ + INT_GET((bb)->bb_numrecs, ARCH_CONVERT) > 0 && \ + INT_GET((bb)->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0]) +#endif + + +#ifdef __KERNEL__ + +#if defined(XFS_BMBT_TRACE) +/* + * Trace buffer entry types. + */ +#define XFS_BMBT_KTRACE_ARGBI 1 +#define XFS_BMBT_KTRACE_ARGBII 2 +#define XFS_BMBT_KTRACE_ARGFFFI 3 +#define XFS_BMBT_KTRACE_ARGI 4 +#define XFS_BMBT_KTRACE_ARGIFK 5 +#define XFS_BMBT_KTRACE_ARGIFR 6 +#define XFS_BMBT_KTRACE_ARGIK 7 +#define XFS_BMBT_KTRACE_CUR 8 + +#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */ +#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */ +extern ktrace_t *xfs_bmbt_trace_buf; +#endif + +/* + * Prototypes for xfs_bmap.c to call. + */ + +void +xfs_bmdr_to_bmbt( + xfs_bmdr_block_t *, + int, + xfs_bmbt_block_t *, + int); + +int +xfs_bmbt_decrement( + struct xfs_btree_cur *, + int, + int *); + +int +xfs_bmbt_delete( + struct xfs_btree_cur *, + int *); + +void +xfs_bmbt_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s); + +xfs_bmbt_block_t * +xfs_bmbt_get_block( + struct xfs_btree_cur *cur, + int level, + struct xfs_buf **bpp); + +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_t *r); + +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_t *r); + +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_t *r); + +xfs_exntst_t +xfs_bmbt_get_state( + xfs_bmbt_rec_t *r); + +#if __BYTE_ORDER != __BIG_ENDIAN +void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s); + +xfs_exntst_t +xfs_bmbt_disk_get_state( + xfs_bmbt_rec_t *r); + +xfs_filblks_t +xfs_bmbt_disk_get_blockcount( + xfs_bmbt_rec_t *r); + +xfs_fsblock_t +xfs_bmbt_disk_get_startblock( + xfs_bmbt_rec_t *r); + +xfs_fileoff_t +xfs_bmbt_disk_get_startoff( + xfs_bmbt_rec_t *r); + +#else +#define xfs_bmbt_disk_get_all(r, s) \ + xfs_bmbt_get_all(r, s) +#define xfs_bmbt_disk_get_state(r) \ + xfs_bmbt_get_state(r) +#define xfs_bmbt_disk_get_blockcount(r) \ + xfs_bmbt_get_blockcount(r) +#define xfs_bmbt_disk_get_startblock(r) \ + xfs_bmbt_get_blockcount(r) +#define xfs_bmbt_disk_get_startoff(r) \ + xfs_bmbt_get_startoff(r) +#endif + +int +xfs_bmbt_increment( + struct xfs_btree_cur *, + int, + int *); + +int +xfs_bmbt_insert( + struct xfs_btree_cur *, + int *); + +void +xfs_bmbt_log_block( + struct xfs_btree_cur *, + struct xfs_buf *, + int); + +void +xfs_bmbt_log_recs( + struct xfs_btree_cur *, + struct xfs_buf *, + int, + int); + +int +xfs_bmbt_lookup_eq( + struct xfs_btree_cur *, + xfs_fileoff_t, + xfs_fsblock_t, + xfs_filblks_t, + int *); + +int +xfs_bmbt_lookup_ge( + struct xfs_btree_cur *, + xfs_fileoff_t, + xfs_fsblock_t, + xfs_filblks_t, + int *); + +int +xfs_bmbt_lookup_le( + struct xfs_btree_cur *, + xfs_fileoff_t, + xfs_fsblock_t, + xfs_filblks_t, + int *); + +/* + * Give the bmap btree a new root block. Copy the old broot contents + * down into a real block and make the broot point to it. + */ +int /* error */ +xfs_bmbt_newroot( + struct xfs_btree_cur *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat); /* return status - 0 fail */ + +void +xfs_bmbt_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s); + +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t o, + xfs_fsblock_t b, + xfs_filblks_t c, + xfs_exntst_t v); + +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_t *r, + xfs_filblks_t v); + +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_t *r, + xfs_fsblock_t v); + +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_t *r, + xfs_fileoff_t v); + +void +xfs_bmbt_set_state( + xfs_bmbt_rec_t *r, + xfs_exntst_t v); + +#if __BYTE_ORDER != __BIG_ENDIAN +void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s); + +void +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t o, + xfs_fsblock_t b, + xfs_filblks_t c, + xfs_exntst_t v); +#else +#define xfs_bmbt_disk_set_all(r, s) \ + xfs_bmbt_set_all(r, s) +#define xfs_bmbt_disk_set_allf(r, o, b, c, v) \ + xfs_bmbt_set_allf(r, o, b, c, v) +#endif + +void +xfs_bmbt_to_bmdr( + xfs_bmbt_block_t *, + int, + xfs_bmdr_block_t *, + int); + +int +xfs_bmbt_update( + struct xfs_btree_cur *, + xfs_fileoff_t, + xfs_fsblock_t, + xfs_filblks_t, + xfs_exntst_t); + +#ifdef DEBUG +/* + * Get the data from the pointed-to record. + */ +int +xfs_bmbt_get_rec( + struct xfs_btree_cur *, + xfs_fileoff_t *, + xfs_fsblock_t *, + xfs_filblks_t *, + xfs_exntst_t *, + int *); +#endif + + +/* + * Search an extent list for the extent which includes block + * bno. + */ +xfs_bmbt_rec_t * +xfs_bmap_do_search_extents( + xfs_bmbt_rec_t *, + xfs_extnum_t, + xfs_extnum_t, + xfs_fileoff_t, + int *, + xfs_extnum_t *, + xfs_bmbt_irec_t *, + xfs_bmbt_irec_t *); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c new file mode 100644 index 00000000000..9dd22dd9548 --- /dev/null +++ b/fs/xfs/xfs_btree.c @@ -0,0 +1,949 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains common code for the space manager's btree implementations. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_error.h" + +/* + * Cursor allocation zone. + */ +kmem_zone_t *xfs_btree_cur_zone; + +/* + * Btree magic numbers. + */ +const __uint32_t xfs_magics[XFS_BTNUM_MAX] = +{ + XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +}; + +/* + * Prototypes for internal routines. + */ + +/* + * Checking routine: return maxrecs for the block. + */ +STATIC int /* number of records fitting in block */ +xfs_btree_maxrecs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_block_t *block);/* generic btree block pointer */ + +/* + * Internal routines. + */ + +/* + * Checking routine: return maxrecs for the block. + */ +STATIC int /* number of records fitting in block */ +xfs_btree_maxrecs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_block_t *block) /* generic btree block pointer */ +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + return (int)XFS_ALLOC_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur); + case XFS_BTNUM_BMAP: + return (int)XFS_BMAP_BLOCK_IMAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur); + case XFS_BTNUM_INO: + return (int)XFS_INOBT_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur); + default: + ASSERT(0); + return 0; + } +} + +/* + * External routines. + */ + +#ifdef DEBUG +/* + * Debug routine: check that block header is ok. + */ +void +xfs_btree_check_block( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_block_t *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + xfs_buf_t *bp) /* buffer containing block, if any */ +{ + if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) + xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level, + bp); + else + xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level, + bp); +} + +/* + * Debug routine: check that keys are in the right order. + */ +void +xfs_btree_check_key( + xfs_btnum_t btnum, /* btree identifier */ + void *ak1, /* pointer to left (lower) key */ + void *ak2) /* pointer to right (higher) key */ +{ + switch (btnum) { + case XFS_BTNUM_BNO: { + xfs_alloc_key_t *k1; + xfs_alloc_key_t *k2; + + k1 = ak1; + k2 = ak2; + ASSERT(INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT)); + break; + } + case XFS_BTNUM_CNT: { + xfs_alloc_key_t *k1; + xfs_alloc_key_t *k2; + + k1 = ak1; + k2 = ak2; + ASSERT(INT_GET(k1->ar_blockcount, ARCH_CONVERT) < INT_GET(k2->ar_blockcount, ARCH_CONVERT) || + (INT_GET(k1->ar_blockcount, ARCH_CONVERT) == INT_GET(k2->ar_blockcount, ARCH_CONVERT) && + INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT))); + break; + } + case XFS_BTNUM_BMAP: { + xfs_bmbt_key_t *k1; + xfs_bmbt_key_t *k2; + + k1 = ak1; + k2 = ak2; + ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT)); + break; + } + case XFS_BTNUM_INO: { + xfs_inobt_key_t *k1; + xfs_inobt_key_t *k2; + + k1 = ak1; + k2 = ak2; + ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT)); + break; + } + default: + ASSERT(0); + } +} +#endif /* DEBUG */ + +/* + * Checking routine: check that long form block header is ok. + */ +/* ARGSUSED */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_lblock_t *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + xfs_buf_t *bp) /* buffer for block, if any */ +{ + int lblock_ok; /* block passes checks */ + xfs_mount_t *mp; /* file system mount point */ + + mp = cur->bc_mp; + lblock_ok = + INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] && + INT_GET(block->bb_level, ARCH_CONVERT) == level && + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= + xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && + block->bb_leftsib && + (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_leftsib, ARCH_CONVERT))) && + block->bb_rightsib && + (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_rightsib, ARCH_CONVERT))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + if (bp) + xfs_buftrace("LBTREE ERROR", bp); + XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Checking routine: check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_dfsbno_t ptr, /* btree block disk address */ + int level) /* btree block level */ +{ + xfs_mount_t *mp; /* file system mount point */ + + mp = cur->bc_mp; + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + ptr != NULLDFSBNO && + XFS_FSB_SANITY_CHECK(mp, ptr)); + return 0; +} + +#ifdef DEBUG +/* + * Debug routine: check that records are in the right order. + */ +void +xfs_btree_check_rec( + xfs_btnum_t btnum, /* btree identifier */ + void *ar1, /* pointer to left (lower) record */ + void *ar2) /* pointer to right (higher) record */ +{ + switch (btnum) { + case XFS_BTNUM_BNO: { + xfs_alloc_rec_t *r1; + xfs_alloc_rec_t *r2; + + r1 = ar1; + r2 = ar2; + ASSERT(INT_GET(r1->ar_startblock, ARCH_CONVERT) + INT_GET(r1->ar_blockcount, ARCH_CONVERT) <= + INT_GET(r2->ar_startblock, ARCH_CONVERT)); + break; + } + case XFS_BTNUM_CNT: { + xfs_alloc_rec_t *r1; + xfs_alloc_rec_t *r2; + + r1 = ar1; + r2 = ar2; + ASSERT(INT_GET(r1->ar_blockcount, ARCH_CONVERT) < INT_GET(r2->ar_blockcount, ARCH_CONVERT) || + (INT_GET(r1->ar_blockcount, ARCH_CONVERT) == INT_GET(r2->ar_blockcount, ARCH_CONVERT) && + INT_GET(r1->ar_startblock, ARCH_CONVERT) < INT_GET(r2->ar_startblock, ARCH_CONVERT))); + break; + } + case XFS_BTNUM_BMAP: { + xfs_bmbt_rec_t *r1; + xfs_bmbt_rec_t *r2; + + r1 = ar1; + r2 = ar2; + ASSERT(xfs_bmbt_disk_get_startoff(r1) + + xfs_bmbt_disk_get_blockcount(r1) <= + xfs_bmbt_disk_get_startoff(r2)); + break; + } + case XFS_BTNUM_INO: { + xfs_inobt_rec_t *r1; + xfs_inobt_rec_t *r2; + + r1 = ar1; + r2 = ar2; + ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <= + INT_GET(r2->ir_startino, ARCH_CONVERT)); + break; + } + default: + ASSERT(0); + } +} +#endif /* DEBUG */ + +/* + * Checking routine: check that block header is ok. + */ +/* ARGSUSED */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_sblock_t *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + xfs_buf_t *bp) /* buffer containing block */ +{ + xfs_buf_t *agbp; /* buffer for ag. freespace struct */ + xfs_agf_t *agf; /* ag. freespace structure */ + xfs_agblock_t agflen; /* native ag. freespace length */ + int sblock_ok; /* block passes checks */ + + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + agflen = INT_GET(agf->agf_length, ARCH_CONVERT); + sblock_ok = + INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] && + INT_GET(block->bb_level, ARCH_CONVERT) == level && + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= + xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && + (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK || + INT_GET(block->bb_leftsib, ARCH_CONVERT) < agflen) && + block->bb_leftsib && + (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK || + INT_GET(block->bb_rightsib, ARCH_CONVERT) < agflen) && + block->bb_rightsib; + if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + XFS_ERRTAG_BTREE_CHECK_SBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + if (bp) + xfs_buftrace("SBTREE ERROR", bp); + XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, + cur->bc_mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Checking routine: check that (short) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t ptr, /* btree block disk address */ + int level) /* btree block level */ +{ + xfs_buf_t *agbp; /* buffer for ag. freespace struct */ + xfs_agf_t *agf; /* ag. freespace structure */ + + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + ptr != NULLAGBLOCK && ptr != 0 && + ptr < INT_GET(agf->agf_length, ARCH_CONVERT)); + return 0; +} + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error) /* del because of error */ +{ + int i; /* btree level */ + + /* + * Clear the buffer pointers, and release the buffers. + * If we're doing this in the face of an error, we + * need to make sure to inspect all of the entries + * in the bc_bufs array for buffers to be unlocked. + * This is because some of the btree code works from + * level n down to 0, and if we get an error along + * the way we won't have initialized all the entries + * down to 0. + */ + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) + xfs_btree_setbuf(cur, i, NULL); + else if (!error) + break; + } + /* + * Can't free a bmap cursor without having dealt with the + * allocated indirect blocks' accounting. + */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_private.b.allocated == 0); + /* + * Free the cursor. + */ + kmem_zone_free(xfs_btree_cur_zone, cur); +} + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur) /* output cursor */ +{ + xfs_buf_t *bp; /* btree block's buffer pointer */ + int error; /* error return value */ + int i; /* level number of btree block */ + xfs_mount_t *mp; /* mount structure for filesystem */ + xfs_btree_cur_t *new; /* new cursor value */ + xfs_trans_t *tp; /* transaction pointer, can be NULL */ + + tp = cur->bc_tp; + mp = cur->bc_mp; + /* + * Allocate a new cursor like the old one. + */ + new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp, + cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + /* + * Copy the record currently in the cursor. + */ + new->bc_rec = cur->bc_rec; + /* + * For each level current, re-get the buffer and copy the ptr value. + */ + for (i = 0; i < new->bc_nlevels; i++) { + new->bc_ptrs[i] = cur->bc_ptrs[i]; + new->bc_ra[i] = cur->bc_ra[i]; + if ((bp = cur->bc_bufs[i])) { + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + xfs_btree_del_cursor(new, error); + *ncur = NULL; + return error; + } + new->bc_bufs[i] = bp; + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + } else + new->bc_bufs[i] = NULL; + } + /* + * For bmap btrees, copy the firstblock, flist, and flags values, + * since init cursor doesn't get them. + */ + if (new->bc_btnum == XFS_BTNUM_BMAP) { + new->bc_private.b.firstblock = cur->bc_private.b.firstblock; + new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.flags = cur->bc_private.b.flags; + } + *ncur = new; + return 0; +} + +/* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_h.bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be a bmap btree root or from a buffer. + */ +xfs_btree_block_t * /* generic btree block pointer */ +xfs_btree_get_block( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree */ + xfs_buf_t **bpp) /* buffer containing the block */ +{ + xfs_btree_block_t *block; /* return value */ + xfs_buf_t *bp; /* return buffer */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int whichfork; /* data or attr fork */ + + if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) { + whichfork = cur->bc_private.b.whichfork; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork); + block = (xfs_btree_block_t *)ifp->if_broot; + bp = NULL; + } else { + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(block != NULL); + *bpp = bp; + return block; +} + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +xfs_buf_t * /* buffer for fsbno */ +xfs_btree_get_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +xfs_buf_t * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_buf_t *bp; /* buffer pointer (return value) */ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + ASSERT(bp); + ASSERT(!XFS_BUF_GETERROR(bp)); + return bp; +} + +/* + * Allocate a new btree cursor. + * The cursor is either for allocation (A) or bmap (B) or inodes (I). + */ +xfs_btree_cur_t * /* new btree cursor */ +xfs_btree_init_cursor( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* (A only) buffer for agf structure */ + /* (I only) buffer for agi structure */ + xfs_agnumber_t agno, /* (AI only) allocation group number */ + xfs_btnum_t btnum, /* btree identifier */ + xfs_inode_t *ip, /* (B only) inode owning the btree */ + int whichfork) /* (B only) data or attr fork */ +{ + xfs_agf_t *agf; /* (A) allocation group freespace */ + xfs_agi_t *agi; /* (I) allocation group inodespace */ + xfs_btree_cur_t *cur; /* return value */ + xfs_ifork_t *ifp; /* (I) inode fork pointer */ + int nlevels=0; /* number of levels in the btree */ + + ASSERT(xfs_btree_cur_zone != NULL); + /* + * Allocate a new cursor. + */ + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + /* + * Deduce the number of btree levels from the arguments. + */ + switch (btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + agf = XFS_BUF_TO_AGF(agbp); + nlevels = INT_GET(agf->agf_levels[btnum], ARCH_CONVERT); + break; + case XFS_BTNUM_BMAP: + ifp = XFS_IFORK_PTR(ip, whichfork); + nlevels = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1; + break; + case XFS_BTNUM_INO: + agi = XFS_BUF_TO_AGI(agbp); + nlevels = INT_GET(agi->agi_level, ARCH_CONVERT); + break; + default: + ASSERT(0); + } + /* + * Fill in the common fields. + */ + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = nlevels; + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + /* + * Fill in private fields. + */ + switch (btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + /* + * Allocation btree fields. + */ + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + break; + case XFS_BTNUM_BMAP: + /* + * Bmap btree fields. + */ + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + break; + case XFS_BTNUM_INO: + /* + * Inode allocation btree fields. + */ + cur->bc_private.i.agbp = agbp; + cur->bc_private.i.agno = agno; + break; + default: + ASSERT(0); + } + return cur; +} + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) + return INT_GET(block->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO; + else + return INT_GET(block->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK; +} + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + xfs_btree_block_t *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_h.bb_numrecs) + return 0; + /* + * Set the ptr value to numrecs, that's the last record/key. + */ + cur->bc_ptrs[level] = INT_GET(block->bb_h.bb_numrecs, ARCH_CONVERT); + return 1; +} + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets, /* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last) /* output: last byte offset */ +{ + int i; /* current bit number */ + __int64_t imask; /* mask for current bit number */ + + ASSERT(fields != 0); + /* + * Find the lowest bit, so the first byte offset. + */ + for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + if (imask & fields) { + *first = offsets[i]; + break; + } + } + /* + * Find the highest bit, so the last byte offset. + */ + for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + if (imask & fields) { + *last = offsets[i + 1] - 1; + break; + } + } +} + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + xfs_buf_t **bpp, /* buffer for fsbno */ + int refval) /* ref count value for buffer */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp))) { + return error; + } + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (bp != NULL) { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + } + *bpp = bp; + return 0; +} + +/* + * Get a buffer for the block, return it read in. + * Short-form addressing. + */ +int /* error */ +xfs_btree_read_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock, /* lock flags for read_buf */ + xfs_buf_t **bpp, /* buffer for agno/agbno */ + int refval) /* ref count value for buffer */ +{ + xfs_buf_t *bp; /* return value */ + xfs_daddr_t d; /* real disk block address */ + int error; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp))) { + return error; + } + ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + if (bp != NULL) { + switch (refval) { + case XFS_ALLOC_BTREE_REF: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); + break; + case XFS_INO_BTREE_REF: + XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); + break; + } + } + *bpp = bp; + return 0; +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); +} + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. + */ +/* ARGSUSED */ +void +xfs_btree_reada_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count) /* count of filesystem blocks */ +{ + xfs_daddr_t d; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); +} + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +int +xfs_btree_readahead_core( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + xfs_alloc_block_t *a; + xfs_bmbt_block_t *b; + xfs_inobt_block_t *i; + int rval = 0; + + ASSERT(cur->bc_bufs[lev] != NULL); + cur->bc_ra[lev] |= lr; + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); + if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(a->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + INT_GET(a->bb_leftsib, ARCH_CONVERT), 1); + rval++; + } + if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(a->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + INT_GET(a->bb_rightsib, ARCH_CONVERT), 1); + rval++; + } + break; + case XFS_BTNUM_BMAP: + b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]); + if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(b->bb_leftsib, ARCH_CONVERT) != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_leftsib, ARCH_CONVERT), 1); + rval++; + } + if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(b->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_rightsib, ARCH_CONVERT), 1); + rval++; + } + break; + case XFS_BTNUM_INO: + i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); + if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(i->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + INT_GET(i->bb_leftsib, ARCH_CONVERT), 1); + rval++; + } + if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(i->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, + INT_GET(i->bb_rightsib, ARCH_CONVERT), 1); + rval++; + } + break; + default: + ASSERT(0); + } + return rval; +} + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + xfs_buf_t *bp) /* new buffer to set */ +{ + xfs_btree_block_t *b; /* btree block */ + xfs_buf_t *obp; /* old buffer pointer */ + + obp = cur->bc_bufs[lev]; + if (obp) + xfs_trans_brelse(cur->bc_tp, obp); + cur->bc_bufs[lev] = bp; + cur->bc_ra[lev] = 0; + if (!bp) + return; + b = XFS_BUF_TO_BLOCK(bp); + if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) { + if (INT_GET(b->bb_u.l.bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (INT_GET(b->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } else { + if (INT_GET(b->bb_u.s.bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (INT_GET(b->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h new file mode 100644 index 00000000000..93872bba41f --- /dev/null +++ b/fs/xfs/xfs_btree.h @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BTREE_H__ +#define __XFS_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) + +/* + * Short form header: space allocation btrees. + */ +typedef struct xfs_btree_sblock +{ + __uint32_t bb_magic; /* magic number for block type */ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ + xfs_agblock_t bb_leftsib; /* left sibling block or NULLAGBLOCK */ + xfs_agblock_t bb_rightsib; /* right sibling block or NULLAGBLOCK */ +} xfs_btree_sblock_t; + +/* + * Long form header: bmap btrees. + */ +typedef struct xfs_btree_lblock +{ + __uint32_t bb_magic; /* magic number for block type */ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ + xfs_dfsbno_t bb_leftsib; /* left sibling block or NULLDFSBNO */ + xfs_dfsbno_t bb_rightsib; /* right sibling block or NULLDFSBNO */ +} xfs_btree_lblock_t; + +/* + * Combined header and structure, used by common code. + */ +typedef struct xfs_btree_hdr +{ + __uint32_t bb_magic; /* magic number for block type */ + __uint16_t bb_level; /* 0 is a leaf */ + __uint16_t bb_numrecs; /* current # of data records */ +} xfs_btree_hdr_t; + +typedef struct xfs_btree_block +{ + xfs_btree_hdr_t bb_h; /* header */ + union { + struct { + xfs_agblock_t bb_leftsib; + xfs_agblock_t bb_rightsib; + } s; /* short form pointers */ + struct { + xfs_dfsbno_t bb_leftsib; + xfs_dfsbno_t bb_rightsib; + } l; /* long form pointers */ + } bb_u; /* rest */ +} xfs_btree_block_t; + +/* + * For logging record fields. + */ +#define XFS_BB_MAGIC 0x01 +#define XFS_BB_LEVEL 0x02 +#define XFS_BB_NUMRECS 0x04 +#define XFS_BB_LEFTSIB 0x08 +#define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_NUM_BITS 5 +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) + +/* + * Boolean to select which form of xfs_btree_block_t.bb_u to use. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BTREE_LONG_PTRS) +int xfs_btree_long_ptrs(xfs_btnum_t btnum); +#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP) +#else +#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP) +#endif + +/* + * Magic numbers for btree blocks. + */ +extern const __uint32_t xfs_magics[]; + +/* + * Maximum and minimum records in a btree block. + * Given block size, type prefix, and leaf flag (0 or 1). + * The divisor below is equivalent to lf ? (e1) : (e2) but that produces + * compiler warnings. + */ +#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ + ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ + (((lf) * (uint)sizeof(t ## _rec_t)) + \ + ((1 - (lf)) * \ + ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) +#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ + (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) + +/* + * Record, key, and pointer address calculation macros. + * Given block size, type prefix, block pointer, and index of requested entry + * (first entry numbered 1). + */ +#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \ + ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ + ((i) - 1) * sizeof(t ## _rec_t))) +#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \ + ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ + ((i) - 1) * sizeof(t ## _key_t))) +#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \ + ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ + (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) + +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ + +/* + * Btree cursor structure. + * This collects all information needed by the btree code in one place. + */ +typedef struct xfs_btree_cur +{ + struct xfs_trans *bc_tp; /* transaction we're in, if any */ + struct xfs_mount *bc_mp; /* file system mount struct */ + union { + xfs_alloc_rec_t a; + xfs_bmbt_irec_t b; + xfs_inobt_rec_t i; + } bc_rec; /* current insert/search record value */ + struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ + int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ + __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ +#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ + __uint8_t bc_nlevels; /* number of levels in the tree */ + __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ + union { + struct { /* needed for BNO, CNT */ + struct xfs_buf *agbp; /* agf buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } a; + struct { /* needed for BMAP */ + struct xfs_inode *ip; /* pointer to our inode */ + struct xfs_bmap_free *flist; /* list to free after */ + xfs_fsblock_t firstblock; /* 1st blk allocated */ + int allocated; /* count of alloced */ + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ + } b; + struct { /* needed for INO */ + struct xfs_buf *agbp; /* agi buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } i; + } bc_private; /* per-btree type data */ +} xfs_btree_cur_t; + +#define XFS_BTREE_NOERROR 0 +#define XFS_BTREE_ERROR 1 + +/* + * Convert from buffer to btree block header. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BLOCK) +xfs_btree_block_t *xfs_buf_to_block(struct xfs_buf *bp); +#define XFS_BUF_TO_BLOCK(bp) xfs_buf_to_block(bp) +#else +#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)(XFS_BUF_PTR(bp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_LBLOCK) +xfs_btree_lblock_t *xfs_buf_to_lblock(struct xfs_buf *bp); +#define XFS_BUF_TO_LBLOCK(bp) xfs_buf_to_lblock(bp) +#else +#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)(XFS_BUF_PTR(bp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBLOCK) +xfs_btree_sblock_t *xfs_buf_to_sblock(struct xfs_buf *bp); +#define XFS_BUF_TO_SBLOCK(bp) xfs_buf_to_sblock(bp) +#else +#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)(XFS_BUF_PTR(bp))) +#endif + +#ifdef __KERNEL__ + +#ifdef DEBUG +/* + * Debug routine: check that block header is ok. + */ +void +xfs_btree_check_block( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_block_t *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Debug routine: check that keys are in the right order. + */ +void +xfs_btree_check_key( + xfs_btnum_t btnum, /* btree identifier */ + void *ak1, /* pointer to left (lower) key */ + void *ak2); /* pointer to right (higher) key */ + +/* + * Debug routine: check that records are in the right order. + */ +void +xfs_btree_check_rec( + xfs_btnum_t btnum, /* btree identifier */ + void *ar1, /* pointer to left (lower) record */ + void *ar2); /* pointer to right (higher) record */ +#else +#define xfs_btree_check_block(a,b,c,d) +#define xfs_btree_check_key(a,b,c) +#define xfs_btree_check_rec(a,b,c) +#endif /* DEBUG */ + +/* + * Checking routine: check that long form block header is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_lblock_t *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Checking routine: check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_dfsbno_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Checking routine: check that short form block header is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_btree_sblock_t *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block */ + +/* + * Checking routine: check that (short) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agblock_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error); /* del because of error */ + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur);/* output cursor */ + +/* + * Change the cursor to point to the first record in the current block + * at the given level. Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to change */ + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be a bmap btree root or from a buffer. + */ +xfs_btree_block_t * /* generic btree block pointer */ +xfs_btree_get_block( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp); /* buffer containing the block */ + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +struct xfs_buf * /* buffer for fsbno */ +xfs_btree_get_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +struct xfs_buf * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Allocate a new btree cursor. + * The cursor is either for allocation (A) or bmap (B). + */ +xfs_btree_cur_t * /* new btree cursor */ +xfs_btree_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* (A only) buffer for agf structure */ + xfs_agnumber_t agno, /* (A only) allocation group number */ + xfs_btnum_t btnum, /* btree identifier */ + struct xfs_inode *ip, /* (B only) inode owning the btree */ + int whichfork); /* (B only) data/attr fork */ + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to check */ + +/* + * Change the cursor to point to the last record in the current block + * at the given level. Other levels are unaffected. + */ +int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to change */ + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets,/* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last); /* output: last byte offset */ + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval);/* ref count value for buffer */ + +/* + * Get a buffer for the block, return it read in. + * Short-form addressing. + */ +int /* error */ +xfs_btree_read_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for agno/agbno */ + int refval);/* ref count value for buffer */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +void /* error */ +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing. + */ +void /* error */ +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count); /* count of filesystem blocks */ + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + */ +int /* readahead block count */ +xfs_btree_readahead_core( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr); /* left/right bits */ + +static inline int /* readahead block count */ +xfs_btree_readahead( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; + + return xfs_btree_readahead_core(cur, lev, lr); +} + + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. + */ +void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + struct xfs_buf *bp); /* new buffer to set */ + +#endif /* __KERNEL__ */ + + +/* + * Min and max functions for extlen, agblock, fileoff, and filblks types. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MIN) +xfs_extlen_t xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b); +#define XFS_EXTLEN_MIN(a,b) xfs_extlen_min(a,b) +#else +#define XFS_EXTLEN_MIN(a,b) \ + ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \ + (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MAX) +xfs_extlen_t xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b); +#define XFS_EXTLEN_MAX(a,b) xfs_extlen_max(a,b) +#else +#define XFS_EXTLEN_MAX(a,b) \ + ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \ + (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MIN) +xfs_agblock_t xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b); +#define XFS_AGBLOCK_MIN(a,b) xfs_agblock_min(a,b) +#else +#define XFS_AGBLOCK_MIN(a,b) \ + ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \ + (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MAX) +xfs_agblock_t xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b); +#define XFS_AGBLOCK_MAX(a,b) xfs_agblock_max(a,b) +#else +#define XFS_AGBLOCK_MAX(a,b) \ + ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \ + (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MIN) +xfs_fileoff_t xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b); +#define XFS_FILEOFF_MIN(a,b) xfs_fileoff_min(a,b) +#else +#define XFS_FILEOFF_MIN(a,b) \ + ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \ + (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MAX) +xfs_fileoff_t xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b); +#define XFS_FILEOFF_MAX(a,b) xfs_fileoff_max(a,b) +#else +#define XFS_FILEOFF_MAX(a,b) \ + ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \ + (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MIN) +xfs_filblks_t xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b); +#define XFS_FILBLKS_MIN(a,b) xfs_filblks_min(a,b) +#else +#define XFS_FILBLKS_MIN(a,b) \ + ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \ + (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MAX) +xfs_filblks_t xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b); +#define XFS_FILBLKS_MAX(a,b) xfs_filblks_max(a,b) +#else +#define XFS_FILBLKS_MAX(a,b) \ + ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \ + (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_SANITY_CHECK) +int xfs_fsb_sanity_check(struct xfs_mount *mp, xfs_fsblock_t fsb); +#define XFS_FSB_SANITY_CHECK(mp,fsb) xfs_fsb_sanity_check(mp,fsb) +#else +#define XFS_FSB_SANITY_CHECK(mp,fsb) \ + (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) +#endif + +/* + * Macros to set EFSCORRUPTED & return/branch. + */ +#define XFS_WANT_CORRUPTED_GOTO(x,l) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ + XFS_ERRLEVEL_LOW, NULL); \ + error = XFS_ERROR(EFSCORRUPTED); \ + goto l; \ + } \ + } + +#define XFS_WANT_CORRUPTED_RETURN(x) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ + XFS_ERRLEVEL_LOW, NULL); \ + return XFS_ERROR(EFSCORRUPTED); \ + } \ + } + +#endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c new file mode 100644 index 00000000000..9ab0039f07d --- /dev/null +++ b/fs/xfs/xfs_buf_item.c @@ -0,0 +1,1221 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains the implementation of the xfs_buf_log_item. + * It contains the item operations used to manipulate the buf log + * items as well as utility routines used by the buffer specific + * transaction routines. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_rw.h" +#include "xfs_bit.h" +#include "xfs_error.h" + + +kmem_zone_t *xfs_buf_item_zone; + +#ifdef XFS_TRANS_DEBUG +/* + * This function uses an alternate strategy for tracking the bytes + * that the user requests to be logged. This can then be used + * in conjunction with the bli_orig array in the buf log item to + * catch bugs in our callers' code. + * + * We also double check the bits set in xfs_buf_item_log using a + * simple algorithm to check that every byte is accounted for. + */ +STATIC void +xfs_buf_item_log_debug( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint x; + uint byte; + uint nbytes; + uint chunk_num; + uint word_num; + uint bit_num; + uint bit_set; + uint *wordp; + + ASSERT(bip->bli_logged != NULL); + byte = first; + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); + for (x = 0; x < nbytes; x++) { + chunk_num = byte >> XFS_BLI_SHIFT; + word_num = chunk_num >> BIT_TO_WORD_SHIFT; + bit_num = chunk_num & (NBWORD - 1); + wordp = &(bip->bli_format.blf_data_map[word_num]); + bit_set = *wordp & (1 << bit_num); + ASSERT(bit_set); + byte++; + } +} + +/* + * This function is called when we flush something into a buffer without + * logging it. This happens for things like inodes which are logged + * separately from the buffer. + */ +void +xfs_buf_item_flush_log_debug( + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + uint nbytes; + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { + return; + } + + ASSERT(bip->bli_logged != NULL); + nbytes = last - first + 1; + bfset(bip->bli_logged, first, nbytes); +} + +/* + * This function is called to verify that our caller's have logged + * all the bytes that they changed. + * + * It does this by comparing the original copy of the buffer stored in + * the buf log item's bli_orig array to the current copy of the buffer + * and ensuring that all bytes which miscompare are set in the bli_logged + * array of the buf log item. + */ +STATIC void +xfs_buf_item_log_check( + xfs_buf_log_item_t *bip) +{ + char *orig; + char *buffer; + int x; + xfs_buf_t *bp; + + ASSERT(bip->bli_orig != NULL); + ASSERT(bip->bli_logged != NULL); + + bp = bip->bli_buf; + ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(XFS_BUF_PTR(bp) != NULL); + orig = bip->bli_orig; + buffer = XFS_BUF_PTR(bp); + for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) + cmn_err(CE_PANIC, + "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", + bip, bp, orig, x); + } +} +#else +#define xfs_buf_item_log_debug(x,y,z) +#define xfs_buf_item_log_check(x) +#endif + +STATIC void xfs_buf_error_relse(xfs_buf_t *bp); +STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); + +/* + * This returns the number of log iovecs needed to log the + * given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure + * and 1 for each stretch of non-contiguous chunks to be logged. + * Contiguous chunks are logged in a single iovec. + * + * If the XFS_BLI_STALE flag has been set, then log nothing. + */ +uint +xfs_buf_item_size( + xfs_buf_log_item_t *bip) +{ + uint nvecs; + int next_bit; + int last_bit; + xfs_buf_t *bp; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + xfs_buf_item_trace("SIZE STALE", bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + return 1; + } + + bp = bip->bli_buf; + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + nvecs = 1; + last_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(last_bit != -1); + nvecs++; + while (last_bit != -1) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + last_bit + 1); + /* + * If we run out of bits, leave the loop, + * else if we find a new set of bits bump the number of vecs, + * else keep scanning the current set of bits. + */ + if (next_bit == -1) { + last_bit = -1; + } else if (next_bit != last_bit + 1) { + last_bit = next_bit; + nvecs++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + + XFS_BLI_CHUNK)) { + last_bit = next_bit; + nvecs++; + } else { + last_bit++; + } + } + + xfs_buf_item_trace("SIZE NORM", bip); + return nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +void +xfs_buf_item_format( + xfs_buf_log_item_t *bip, + xfs_log_iovec_t *log_vector) +{ + uint base_size; + uint nvecs; + xfs_log_iovec_t *vecp; + xfs_buf_t *bp; + int first_bit; + int last_bit; + int next_bit; + uint nbits; + uint buffer_offset; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + bp = bip->bli_buf; + ASSERT(XFS_BUF_BP_ISMAPPED(bp)); + vecp = log_vector; + + /* + * The size of the base structure is the size of the + * declared structure plus the space for the extra words + * of the bitmap. We subtract one from the map size, because + * the first element of the bitmap is accounted for in the + * size of the base structure. + */ + base_size = + (uint)(sizeof(xfs_buf_log_format_t) + + ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); + vecp->i_addr = (xfs_caddr_t)&bip->bli_format; + vecp->i_len = base_size; + vecp++; + nvecs = 1; + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + xfs_buf_item_trace("FORMAT STALE", bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + bip->bli_format.blf_size = nvecs; + return; + } + + /* + * Fill in an iovec for each set of contiguous chunks. + */ + first_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0); + ASSERT(first_bit != -1); + last_bit = first_bit; + nbits = 1; + for (;;) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, + (uint)last_bit + 1); + /* + * If we run out of bits fill in the last iovec and get + * out of the loop. + * Else if we start a new set of bits then fill in the + * iovec for the series we were looking at and start + * counting the bits in the new one. + * Else we're still in the same set of bits so just + * keep counting and scanning. + */ + if (next_bit == -1) { + buffer_offset = first_bit * XFS_BLI_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLI_CHUNK; + nvecs++; + break; + } else if (next_bit != last_bit + 1) { + buffer_offset = first_bit * XFS_BLI_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLI_CHUNK; + nvecs++; + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + + XFS_BLI_CHUNK)) { + buffer_offset = first_bit * XFS_BLI_CHUNK; + vecp->i_addr = xfs_buf_offset(bp, buffer_offset); + vecp->i_len = nbits * XFS_BLI_CHUNK; +/* You would think we need to bump the nvecs here too, but we do not + * this number is used by recovery, and it gets confused by the boundary + * split here + * nvecs++; + */ + vecp++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else { + last_bit++; + nbits++; + } + } + bip->bli_format.blf_size = nvecs; + + /* + * Check to make sure everything is consistent. + */ + xfs_buf_item_trace("FORMAT NORM", bip); + xfs_buf_item_log_check(bip); +} + +/* + * This is called to pin the buffer associated with the buf log + * item in memory so it cannot be written out. Simply call bpin() + * on the buffer to do this. + */ +void +xfs_buf_item_pin( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + bp = bip->bli_buf; + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_trace("PIN", bip); + xfs_buftrace("XFS_PIN", bp); + xfs_bpin(bp); +} + + +/* + * This is called to unpin the buffer associated with the buf log + * item which was previously pinned with a call to xfs_buf_item_pin(). + * Just call bunpin() on the buffer to do this. + * + * Also drop the reference to the buf item for the current transaction. + * If the XFS_BLI_STALE flag is set and we are the last reference, + * then free up the buf log item and unlock the buffer. + */ +void +xfs_buf_item_unpin( + xfs_buf_log_item_t *bip, + int stale) +{ + xfs_mount_t *mp; + xfs_buf_t *bp; + int freed; + SPLDECL(s); + + bp = bip->bli_buf; + ASSERT(bp != NULL); + ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + xfs_buf_item_trace("UNPIN", bip); + xfs_buftrace("XFS_UNPIN", bp); + + freed = atomic_dec_and_test(&bip->bli_refcount); + mp = bip->bli_item.li_mountp; + xfs_bunpin(bp); + if (freed && stale) { + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + xfs_buf_item_trace("UNPIN STALE", bip); + xfs_buftrace("XFS_UNPIN STALE", bp); + /* + * If we get called here because of an IO error, we may + * or may not have the item on the AIL. xfs_trans_delete_ail() + * will take care of that situation. + * xfs_trans_delete_ail() drops the AIL lock. + */ + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + } else { + AIL_LOCK(mp,s); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); + xfs_buf_item_relse(bp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + } + xfs_buf_relse(bp); + } +} + +/* + * this is called from uncommit in the forced-shutdown path. + * we need to check to see if the reference count on the log item + * is going to drop to zero. If so, unpin will free the log item + * so we need to free the item's descriptor (that points to the item) + * in the transaction. + */ +void +xfs_buf_item_unpin_remove( + xfs_buf_log_item_t *bip, + xfs_trans_t *tp) +{ + xfs_buf_t *bp; + xfs_log_item_desc_t *lidp; + int stale = 0; + + bp = bip->bli_buf; + /* + * will xfs_buf_item_unpin() call xfs_buf_item_relse()? + */ + if ((atomic_read(&bip->bli_refcount) == 1) && + (bip->bli_flags & XFS_BLI_STALE)) { + ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); + xfs_buf_item_trace("UNPIN REMOVE", bip); + xfs_buftrace("XFS_UNPIN_REMOVE", bp); + /* + * yes -- clear the xaction descriptor in-use flag + * and free the chunk if required. We can safely + * do some work here and then call buf_item_unpin + * to do the rest because if the if is true, then + * we are holding the buffer locked so no one else + * will be able to bump up the refcount. + */ + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); + stale = lidp->lid_flags & XFS_LID_BUF_STALE; + xfs_trans_free_item(tp, lidp); + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + } + + xfs_buf_item_unpin(bip, stale); + + return; +} + +/* + * This is called to attempt to lock the buffer associated with this + * buf log item. Don't sleep on the buffer lock. If we can't get + * the lock right away, return 0. If we can get the lock, pull the + * buffer from the free list, mark it busy, and return 1. + */ +uint +xfs_buf_item_trylock( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + bp = bip->bli_buf; + + if (XFS_BUF_ISPINNED(bp)) { + return XFS_ITEM_PINNED; + } + + if (!XFS_BUF_CPSEMA(bp)) { + return XFS_ITEM_LOCKED; + } + + /* + * Remove the buffer from the free list. Only do this + * if it's on the free list. Private buffers like the + * superblock buffer are not. + */ + XFS_BUF_HOLD(bp); + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_trace("TRYLOCK SUCCESS", bip); + return XFS_ITEM_SUCCESS; +} + +/* + * Release the buffer associated with the buf log item. + * If there is no dirty logged data associated with the + * buffer recorded in the buf log item, then free the + * buf log item and remove the reference to it in the + * buffer. + * + * This call ignores the recursion count. It is only called + * when the buffer should REALLY be unlocked, regardless + * of the recursion count. + * + * If the XFS_BLI_HOLD flag is set in the buf log item, then + * free the log item if necessary but do not unlock the buffer. + * This is for support of xfs_trans_bhold(). Make sure the + * XFS_BLI_HOLD field is cleared if we don't free the item. + */ +void +xfs_buf_item_unlock( + xfs_buf_log_item_t *bip) +{ + int aborted; + xfs_buf_t *bp; + uint hold; + + bp = bip->bli_buf; + xfs_buftrace("XFS_UNLOCK", bp); + + /* + * Clear the buffer's association with this transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If this is a transaction abort, don't return early. + * Instead, allow the brelse to happen. + * Normally it would be done for stale (cancelled) buffers + * at unpin time, but we'll never go through the pin/unpin + * cycle if we abort inside commit. + */ + aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; + + /* + * If the buf item is marked stale, then don't do anything. + * We'll unlock the buffer and free the buf item when the + * buffer is unpinned for the last time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + bip->bli_flags &= ~XFS_BLI_LOGGED; + xfs_buf_item_trace("UNLOCK STALE", bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + if (!aborted) + return; + } + + /* + * Drop the transaction's reference to the log item if + * it was not logged as part of the transaction. Otherwise + * we'll drop the reference in xfs_buf_item_unpin() when + * the transaction is really through with the buffer. + */ + if (!(bip->bli_flags & XFS_BLI_LOGGED)) { + atomic_dec(&bip->bli_refcount); + } else { + /* + * Clear the logged flag since this is per + * transaction state. + */ + bip->bli_flags &= ~XFS_BLI_LOGGED; + } + + /* + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. + */ + hold = bip->bli_flags & XFS_BLI_HOLD; + xfs_buf_item_trace("UNLOCK", bip); + + /* + * If the buf item isn't tracking any data, free it. + * Otherwise, if XFS_BLI_HOLD is set clear it. + */ + if (xfs_count_bits(bip->bli_format.blf_data_map, + bip->bli_format.blf_map_size, 0) == 0) { + xfs_buf_item_relse(bp); + } else if (hold) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Release the buffer if XFS_BLI_HOLD was not set. + */ + if (!hold) { + xfs_buf_relse(bp); + } +} + +/* + * This is called to find out where the oldest active copy of the + * buf log item in the on disk log resides now that the last log + * write of it completed at the given lsn. + * We always re-log all the dirty data in a buffer, so usually the + * latest copy in the on disk log is the only one that matters. For + * those cases we simply return the given lsn. + * + * The one exception to this is for buffers full of newly allocated + * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF + * flag set, indicating that only the di_next_unlinked fields from the + * inodes in the buffers will be replayed during recovery. If the + * original newly allocated inode images have not yet been flushed + * when the buffer is so relogged, then we need to make sure that we + * keep the old images in the 'active' portion of the log. We do this + * by returning the original lsn of that transaction here rather than + * the current one. + */ +xfs_lsn_t +xfs_buf_item_committed( + xfs_buf_log_item_t *bip, + xfs_lsn_t lsn) +{ + xfs_buf_item_trace("COMMITTED", bip); + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + (bip->bli_item.li_lsn != 0)) { + return bip->bli_item.li_lsn; + } + return (lsn); +} + +/* + * This is called when the transaction holding the buffer is aborted. + * Just behave as if the transaction had been cancelled. If we're shutting down + * and have aborted this transaction, we'll trap this buffer when it tries to + * get written out. + */ +void +xfs_buf_item_abort( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + bp = bip->bli_buf; + xfs_buftrace("XFS_ABORT", bp); + XFS_BUF_SUPER_STALE(bp); + xfs_buf_item_unlock(bip); + return; +} + +/* + * This is called to asynchronously write the buffer associated with this + * buf log item out to disk. The buffer will already have been locked by + * a successful call to xfs_buf_item_trylock(). If the buffer still has + * B_DELWRI set, then get it going out to disk with a call to bawrite(). + * If not, then just release the buffer. + */ +void +xfs_buf_item_push( + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_trace("PUSH", bip); + + bp = bip->bli_buf; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_bawrite(bip->bli_item.li_mountp, bp); + } else { + xfs_buf_relse(bp); + } +} + +/* ARGSUSED */ +void +xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_buf_item_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, + .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) + xfs_buf_item_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_buf_item_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort, + .iop_pushbuf = NULL, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_buf_item_committing +}; + + +/* + * Allocate a new buf log item to go with the given buffer. + * Set the buffer's b_fsprivate field to point to the new + * buf log item. If there are other item's attached to the + * buffer (see xfs_buf_attach_iodone() below), then put the + * buf log item at the front. + */ +void +xfs_buf_item_init( + xfs_buf_t *bp, + xfs_mount_t *mp) +{ + xfs_log_item_t *lip; + xfs_buf_log_item_t *bip; + int chunks; + int map_size; + + /* + * Check to see if there is already a buf log item for + * this buffer. If there is, it is guaranteed to be + * the first. If we do already have one, there is + * nothing to do here so return. + */ + if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) + XFS_BUF_SET_FSPRIVATE3(bp, mp); + XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + return; + } + } + + /* + * chunks is the number of XFS_BLI_CHUNK size pieces + * the buffer can be divided into. Make sure not to + * truncate any pieces. map_size is the size of the + * bitmap needed to describe the chunks of the buffer. + */ + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); + + bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, + KM_SLEEP); + bip->bli_item.li_type = XFS_LI_BUF; + bip->bli_item.li_ops = &xfs_buf_item_ops; + bip->bli_item.li_mountp = mp; + bip->bli_buf = bp; + bip->bli_format.blf_type = XFS_LI_BUF; + bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); + bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_map_size = map_size; +#ifdef XFS_BLI_TRACE + bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); +#endif + +#ifdef XFS_TRANS_DEBUG + /* + * Allocate the arrays for tracking what needs to be logged + * and what our callers request to be logged. bli_orig + * holds a copy of the original, clean buffer for comparison + * against, and bli_logged keeps a 1 bit flag per byte in + * the buffer to indicate which bytes the callers have asked + * to have logged. + */ + bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); + memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); + bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); +#endif + + /* + * Put the buf item into the list of items attached to the + * buffer at the front. + */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + bip->bli_item.li_bio_list = + XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + } + XFS_BUF_SET_FSPRIVATE(bp, bip); +} + + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + uint first_bit; + uint last_bit; + uint bits_to_set; + uint bits_set; + uint word_num; + uint *wordp; + uint bit; + uint end_bit; + uint mask; + + /* + * Mark the item as having some dirty data for + * quick reference in xfs_buf_item_dirty. + */ + bip->bli_flags |= XFS_BLI_DIRTY; + + /* + * Convert byte offsets to bit numbers. + */ + first_bit = first >> XFS_BLI_SHIFT; + last_bit = last >> XFS_BLI_SHIFT; + + /* + * Calculate the total number of bits to be set. + */ + bits_to_set = last_bit - first_bit + 1; + + /* + * Get a pointer to the first word in the bitmap + * to set a bit in. + */ + word_num = first_bit >> BIT_TO_WORD_SHIFT; + wordp = &(bip->bli_format.blf_data_map[word_num]); + + /* + * Calculate the starting bit in the first word. + */ + bit = first_bit & (uint)(NBWORD - 1); + + /* + * First set any bits in the first word of our range. + * If it starts at bit 0 of the word, it will be + * set below rather than here. That is what the variable + * bit tells us. The variable bits_set tracks the number + * of bits that have been set so far. End_bit is the number + * of the last bit to be set in this word plus one. + */ + if (bit) { + end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + mask = ((1 << (end_bit - bit)) - 1) << bit; + *wordp |= mask; + wordp++; + bits_set = end_bit - bit; + } else { + bits_set = 0; + } + + /* + * Now set bits a whole word at a time that are between + * first_bit and last_bit. + */ + while ((bits_to_set - bits_set) >= NBWORD) { + *wordp |= 0xffffffff; + bits_set += NBWORD; + wordp++; + } + + /* + * Finally, set any bits left to be set in one last partial word. + */ + end_bit = bits_to_set - bits_set; + if (end_bit) { + mask = (1 << end_bit) - 1; + *wordp |= mask; + } + + xfs_buf_item_log_debug(bip, first, last); +} + + +/* + * Return 1 if the buffer has some data that has been logged (at any + * point, not just the current transaction) and 0 if not. + */ +uint +xfs_buf_item_dirty( + xfs_buf_log_item_t *bip) +{ + return (bip->bli_flags & XFS_BLI_DIRTY); +} + +/* + * This is called when the buf log item is no longer needed. It should + * free the buf log item associated with the given buffer and clear + * the buffer's pointer to the buf log item. If there are no more + * items in the list, clear the b_iodone field of the buffer (see + * xfs_buf_attach_iodone() below). + */ +void +xfs_buf_item_relse( + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + xfs_buftrace("XFS_RELSE", bp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); + if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && + (XFS_BUF_IODONE_FUNC(bp) != NULL)) { + ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0); + XFS_BUF_CLR_IODONE_FUNC(bp); + } + +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); + bip->bli_orig = NULL; + kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); + bip->bli_logged = NULL; +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + + +/* + * Add the given log item with its callback to the list of callbacks + * to be called when the buffer's I/O completes. If it is not set + * already, set the buffer's b_iodone() routine to be + * xfs_buf_iodone_callbacks() and link the log item into the list of + * items rooted at b_fsprivate. Items are always added as the second + * entry in the list if there is a first, because the buf item code + * assumes that the buf log item is first. + */ +void +xfs_buf_attach_iodone( + xfs_buf_t *bp, + void (*cb)(xfs_buf_t *, xfs_log_item_t *), + xfs_log_item_t *lip) +{ + xfs_log_item_t *head_lip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + lip->li_cb = cb; + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + lip->li_bio_list = head_lip->li_bio_list; + head_lip->li_bio_list = lip; + } else { + XFS_BUF_SET_FSPRIVATE(bp, lip); + } + + ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || + (XFS_BUF_IODONE_FUNC(bp) == NULL)); + XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); +} + +STATIC void +xfs_buf_do_callbacks( + xfs_buf_t *bp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *nlip; + + while (lip != NULL) { + nlip = lip->li_bio_list; + ASSERT(lip->li_cb != NULL); + /* + * Clear the next pointer so we don't have any + * confusion if the item is added to another buf. + * Don't touch the log item after calling its + * callback, because it could have freed itself. + */ + lip->li_bio_list = NULL; + lip->li_cb(bp, lip); + lip = nlip; + } +} + +/* + * This is the iodone() function for buffers which have had callbacks + * attached to them by xfs_buf_attach_iodone(). It should remove each + * log item from the buffer's list and call the callback of each in turn. + * When done, the buffer's fsprivate field is set to NULL and the buffer + * is unlocked with a call to iodone(). + */ +void +xfs_buf_iodone_callbacks( + xfs_buf_t *bp) +{ + xfs_log_item_t *lip; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; + xfs_mount_t *mp; + + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + + if (XFS_BUF_GETERROR(bp) != 0) { + /* + * If we've already decided to shutdown the filesystem + * because of IO errors, there's no point in giving this + * a retry. + */ + mp = lip->li_mountp; + if (XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); + XFS_BUF_SUPER_STALE(bp); + xfs_buftrace("BUF_IODONE_CB", bp); + xfs_buf_do_callbacks(bp, lip); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + + /* + * XFS_SHUT flag gets set when we go thru the + * entire buffer cache and deliberately start + * throwing away delayed write buffers. + * Since there's no biowait done on those, + * we should just brelse them. + */ + if (XFS_BUF_ISSHUT(bp)) { + XFS_BUF_UNSHUT(bp); + xfs_buf_relse(bp); + } else { + xfs_biodone(bp); + } + + return; + } + + if ((XFS_BUF_TARGET(bp) != lasttarg) || + (time_after(jiffies, (lasttime + 5*HZ)))) { + lasttime = jiffies; + prdev("XFS write error in file system meta-data " + "block 0x%llx in %s", + XFS_BUF_TARGET(bp), + (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); + } + lasttarg = XFS_BUF_TARGET(bp); + + if (XFS_BUF_ISASYNC(bp)) { + /* + * If the write was asynchronous then noone will be + * looking for the error. Clear the error state + * and write the buffer out again delayed write. + * + * XXXsup This is OK, so long as we catch these + * before we start the umount; we don't want these + * DELWRI metadata bufs to be hanging around. + */ + XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ + + if (!(XFS_BUF_ISSTALE(bp))) { + XFS_BUF_DELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_SET_START(bp); + } + ASSERT(XFS_BUF_IODONE_FUNC(bp)); + xfs_buftrace("BUF_IODONE ASYNC", bp); + xfs_buf_relse(bp); + } else { + /* + * If the write of the buffer was not asynchronous, + * then we want to make sure to return the error + * to the caller of bwrite(). Because of this we + * cannot clear the B_ERROR state at this point. + * Instead we install a callback function that + * will be called when the buffer is released, and + * that routine will clear the error state and + * set the buffer to be written out again after + * some delay. + */ + /* We actually overwrite the existing b-relse + function at times, but we're gonna be shutting down + anyway. */ + XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); + XFS_BUF_DONE(bp); + XFS_BUF_V_IODONESEMA(bp); + } + return; + } +#ifdef XFSERRORDEBUG + xfs_buftrace("XFS BUFCB NOERR", bp); +#endif + xfs_buf_do_callbacks(bp, lip); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_biodone(bp); +} + +/* + * This is a callback routine attached to a buffer which gets an error + * when being written out synchronously. + */ +STATIC void +xfs_buf_error_relse( + xfs_buf_t *bp) +{ + xfs_log_item_t *lip; + xfs_mount_t *mp; + + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + mp = (xfs_mount_t *)lip->li_mountp; + ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); + + XFS_BUF_STALE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_ERROR(bp,0); + xfs_buftrace("BUF_ERROR_RELSE", bp); + if (! XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR); + /* + * We have to unpin the pinned buffers so do the + * callbacks. + */ + xfs_buf_do_callbacks(bp, lip); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + XFS_BUF_SET_BRELSE_FUNC(bp,NULL); + xfs_buf_relse(bp); +} + + +/* + * This is the iodone() function for buffers which have been + * logged. It is called when they are eventually flushed out. + * It should remove the buf item from the AIL, and free the buf item. + * It is called by xfs_buf_iodone_callbacks() above which will take + * care of cleaning up the buffer itself. + */ +/* ARGSUSED */ +void +xfs_buf_iodone( + xfs_buf_t *bp, + xfs_buf_log_item_t *bip) +{ + struct xfs_mount *mp; + SPLDECL(s); + + ASSERT(bip->bli_buf == bp); + + mp = bip->bli_item.li_mountp; + + /* + * If we are forcibly shutting down, this may well be + * off the AIL already. That's because we simulate the + * log-committed callbacks to unpin these buffers. Or we may never + * have put this item on AIL because of the transaction was + * aborted forcibly. xfs_trans_delete_ail() takes care of these. + * + * Either way, AIL is useless if we're forcing a shutdown. + */ + AIL_LOCK(mp,s); + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); + +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); + bip->bli_orig = NULL; + kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); + bip->bli_logged = NULL; +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + +#if defined(XFS_BLI_TRACE) +void +xfs_buf_item_trace( + char *id, + xfs_buf_log_item_t *bip) +{ + xfs_buf_t *bp; + ASSERT(bip->bli_trace != NULL); + + bp = bip->bli_buf; + ktrace_enter(bip->bli_trace, + (void *)id, + (void *)bip->bli_buf, + (void *)((unsigned long)bip->bli_flags), + (void *)((unsigned long)bip->bli_recur), + (void *)((unsigned long)atomic_read(&bip->bli_refcount)), + (void *)((unsigned long) + (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)), + (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))), + (void *)((unsigned long)XFS_BUF_COUNT(bp)), + (void *)((unsigned long)XFS_BUF_BFLAGS(bp)), + XFS_BUF_FSPRIVATE(bp, void *), + XFS_BUF_FSPRIVATE2(bp, void *), + (void *)(unsigned long)XFS_BUF_ISPINNED(bp), + (void *)XFS_BUF_IODONE_FUNC(bp), + (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))), + (void *)bip->bli_item.li_desc, + (void *)((unsigned long)bip->bli_item.li_flags)); +} +#endif /* XFS_BLI_TRACE */ diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h new file mode 100644 index 00000000000..5f1b0c9308f --- /dev/null +++ b/fs/xfs/xfs_buf_item.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_BUF_ITEM_H__ +#define __XFS_BUF_ITEM_H__ + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. This structure works only on buffers that + * reside up to the first TB in the filesystem. These buffers are + * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF. + */ +typedef struct xfs_buf_log_format_v1 { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + __int32_t blf_blkno; /* starting blkno of this buf */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + unsigned int blf_map_size; /* size of data bitmap in words */ + unsigned int blf_data_map[1];/* variable size bitmap of */ + /* regions of buffer in this item */ +} xfs_buf_log_format_v1_t; + +/* + * This is a form of the above structure with a 64 bit blkno field. + * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. + */ +typedef struct xfs_buf_log_format_t { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* size of data bitmap in words */ + unsigned int blf_data_map[1];/* variable size bitmap of */ + /* regions of buffer in this item */ +} xfs_buf_log_format_t; + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLI_INODE_BUF 0x1 +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLI_CANCEL 0x2 +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLI_UDQUOT_BUF 0x4 +/* #define XFS_BLI_PDQUOT_BUF 0x8 */ +#define XFS_BLI_GDQUOT_BUF 0x10 + +#define XFS_BLI_CHUNK 128 +#define XFS_BLI_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * buf log item flags + */ +#define XFS_BLI_HOLD 0x01 +#define XFS_BLI_DIRTY 0x02 +#define XFS_BLI_STALE 0x04 +#define XFS_BLI_LOGGED 0x08 +#define XFS_BLI_INODE_ALLOC_BUF 0x10 +#define XFS_BLI_STALE_INODE 0x20 + + +#ifdef __KERNEL__ + +struct xfs_buf; +struct ktrace; +struct xfs_mount; +struct xfs_buf_log_item; + +#if defined(XFS_BLI_TRACE) +#define XFS_BLI_TRACE_SIZE 32 + +void xfs_buf_item_trace(char *, struct xfs_buf_log_item *); +#else +#define xfs_buf_item_trace(id, bip) +#endif + +/* + * This is the in core log item structure used to track information + * needed to log buffers. It tracks how many times the lock has been + * locked, and which 128 byte chunks of the buffer are dirty. + */ +typedef struct xfs_buf_log_item { + xfs_log_item_t bli_item; /* common item structure */ + struct xfs_buf *bli_buf; /* real buffer pointer */ + unsigned int bli_flags; /* misc flags */ + unsigned int bli_recur; /* lock recursion count */ + atomic_t bli_refcount; /* cnt of tp refs */ +#ifdef XFS_BLI_TRACE + struct ktrace *bli_trace; /* event trace buf */ +#endif +#ifdef XFS_TRANS_DEBUG + char *bli_orig; /* original buffer copy */ + char *bli_logged; /* bytes logged (bitmap) */ +#endif + xfs_buf_log_format_t bli_format; /* in-log header */ +} xfs_buf_log_item_t; + +/* + * This structure is used during recovery to record the buf log + * items which have been canceled and should not be replayed. + */ +typedef struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct xfs_buf_cancel *bc_next; +} xfs_buf_cancel_t; + +void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); +void xfs_buf_item_relse(struct xfs_buf *); +void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); +void xfs_buf_iodone_callbacks(struct xfs_buf *); +void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); + +#ifdef XFS_TRANS_DEBUG +void +xfs_buf_item_flush_log_debug( + struct xfs_buf *bp, + uint first, + uint last); +#else +#define xfs_buf_item_flush_log_debug(bp, first, last) +#endif + +#endif /* __KERNEL__ */ + +#endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h new file mode 100644 index 00000000000..2deac730375 --- /dev/null +++ b/fs/xfs/xfs_cap.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_CAP_H__ +#define __XFS_CAP_H__ + +/* + * Capabilities + */ +typedef __uint64_t xfs_cap_value_t; + +typedef struct xfs_cap_set { + xfs_cap_value_t cap_effective; /* use in capability checks */ + xfs_cap_value_t cap_permitted; /* combined with file attrs */ + xfs_cap_value_t cap_inheritable;/* pass through exec */ +} xfs_cap_set_t; + +/* On-disk XFS extended attribute names */ +#define SGI_CAP_FILE "SGI_CAP_FILE" +#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1) +#define SGI_CAP_LINUX "SGI_CAP_LINUX" +#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1) + +/* + * For Linux, we take the bitfields directly from capability.h + * and no longer attempt to keep this attribute ondisk compatible + * with IRIX. Since this attribute is only set on exectuables, + * it just doesn't make much sense to try. We do use a different + * named attribute though, to avoid confusion. + */ + +#ifdef __KERNEL__ + +#ifdef CONFIG_FS_POSIX_CAP + +#include + +struct vnode; + +extern int xfs_cap_vhascap(struct vnode *); +extern int xfs_cap_vset(struct vnode *, void *, size_t); +extern int xfs_cap_vget(struct vnode *, void *, size_t); +extern int xfs_cap_vremove(struct vnode *vp); + +#define _CAP_EXISTS xfs_cap_vhascap + +#else +#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP) +#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP) +#define xfs_cap_vremove(v) (-EOPNOTSUPP) +#define _CAP_EXISTS (NULL) +#endif + +#endif /* __KERNEL__ */ + +#endif /* __XFS_CAP_H__ */ diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h new file mode 100644 index 00000000000..b3215ffe0be --- /dev/null +++ b/fs/xfs/xfs_clnt.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_CLNT_H__ +#define __XFS_CLNT_H__ + +/* + * XFS arguments structure, constructed from the arguments we + * are passed via the mount system call. + * + * NOTE: The mount system call is handled differently between + * Linux and IRIX. In IRIX we worked work with a binary data + * structure coming in across the syscall interface from user + * space (the mount userspace knows about each filesystem type + * and the set of valid options for it, and converts the users + * argument string into a binary structure _before_ making the + * system call), and the ABI issues that this implies. + * + * In Linux, we are passed a comma separated set of options; + * ie. a NULL terminated string of characters. Userspace mount + * code does not have any knowledge of mount options expected by + * each filesystem type and so each filesystem parses its mount + * options in kernel space. + * + * For the Linux port, we kept this structure pretty much intact + * and use it internally (because the existing code groks it). + */ +struct xfs_mount_args { + int flags; /* flags -> see XFSMNT_... macros below */ + int logbufs; /* Number of log buffers, -1 to default */ + int logbufsize; /* Size of log buffers, -1 to default */ + char fsname[MAXNAMELEN+1]; /* data device name */ + char rtname[MAXNAMELEN+1]; /* realtime device filename */ + char logname[MAXNAMELEN+1]; /* journal device filename */ + char mtpt[MAXNAMELEN+1]; /* filesystem mount point */ + int sunit; /* stripe unit (BBs) */ + int swidth; /* stripe width (BBs), multiple of sunit */ + uchar_t iosizelog; /* log2 of the preferred I/O size */ + int ihashsize; /* inode hash table size (buckets) */ +}; + +/* + * XFS mount option flags + */ +#define XFSMNT_CHKLOG 0x00000001 /* check log */ +#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount + * compatible */ +#define XFSMNT_INO64 0x00000004 /* move inode numbers up + * past 2^32 */ +#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */ +#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */ +#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit + * enforcement */ +#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit + * enforcement */ +#define XFSMNT_NOATIME 0x00000100 /* don't modify access + * times on reads */ +#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at + * stripe boundaries*/ +#define XFSMNT_RETERR 0x00000400 /* return error to user */ +#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies + * read-only mount */ +#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */ +#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ +#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ + /* (osyncisdsync is now default) */ +#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 + * bits of address space */ +#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ +#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit + * enforcement */ +#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */ +#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */ +#define XFSMNT_NOLOGFLUSH 0x04000000 /* Don't flush for log blocks */ +#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */ +#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width + * allocation */ +#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */ +#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename + * symlink,mkdir,rmdir,mknod */ + +#endif /* __XFS_CLNT_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c new file mode 100644 index 00000000000..d7fe2886676 --- /dev/null +++ b/fs/xfs/xfs_da_btree.c @@ -0,0 +1,2648 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_dir_leaf.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_error.h" +#include "xfs_bit.h" + +/* + * xfs_da_btree.c + * + * Routines to implement directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_da_root_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_root, + xfs_da_state_blk_t *new_child); +STATIC int xfs_da_node_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_blk, + xfs_da_state_blk_t *split_blk, + xfs_da_state_blk_t *blk_to_add, + int treelevel, + int *result); +STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *node_blk_1, + xfs_da_state_blk_t *node_blk_2); +STATIC void xfs_da_node_add(xfs_da_state_t *state, + xfs_da_state_blk_t *old_node_blk, + xfs_da_state_blk_t *new_node_blk); + +/* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_da_root_join(xfs_da_state_t *state, + xfs_da_state_blk_t *root_blk); +STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da_node_remove(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk); +STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, + xfs_da_state_blk_t *src_node_blk, + xfs_da_state_blk_t *dst_node_blk); + +/* + * Utility routines. + */ +STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); +STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); +STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra); + + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of an intermediate node. + */ +int +xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, + xfs_dabuf_t **bpp, int whichfork) +{ + xfs_da_intnode_t *node; + xfs_dabuf_t *bp; + int error; + xfs_trans_t *tp; + + tp = args->trans; + error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + node->hdr.info.forw = 0; + node->hdr.info.back = 0; + INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC); + node->hdr.info.pad = 0; + node->hdr.count = 0; + INT_SET(node->hdr.level, ARCH_CONVERT, level); + + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + *bpp = bp; + return(0); +} + +/* + * Split a leaf node, rebalance, then possibly split + * intermediate nodes, rebalance, etc. + */ +int /* error */ +xfs_da_split(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *oldblk, *newblk, *addblk; + xfs_da_intnode_t *node; + xfs_dabuf_t *bp; + int max, action, error, i; + + /* + * Walk back up the tree splitting/inserting/adjusting as necessary. + * If we need to insert and there isn't room, split the node, then + * decide which fragment to insert the new block from below into. + * Note that we may split the root this way, but we need more fixup. + */ + max = state->path.active - 1; + ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH)); + ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC || + state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp)); + + addblk = &state->path.blk[max]; /* initial dummy value */ + for (i = max; (i >= 0) && addblk; state->path.active--, i--) { + oldblk = &state->path.blk[i]; + newblk = &state->altpath.blk[i]; + + /* + * If a leaf node then + * Allocate a new leaf node, then rebalance across them. + * else if an intermediate node then + * We split on the last layer, must we split the node? + */ + switch (oldblk->magic) { + case XFS_ATTR_LEAF_MAGIC: +#ifndef __KERNEL__ + return(ENOTTY); +#else + error = xfs_attr_leaf_split(state, oldblk, newblk); + if ((error != 0) && (error != ENOSPC)) { + return(error); /* GROT: attr is inconsistent */ + } + if (!error) { + addblk = newblk; + break; + } + /* + * Entry wouldn't fit, split the leaf again. + */ + state->extravalid = 1; + if (state->inleaf) { + state->extraafter = 0; /* before newblk */ + error = xfs_attr_leaf_split(state, oldblk, + &state->extrablk); + } else { + state->extraafter = 1; /* after newblk */ + error = xfs_attr_leaf_split(state, newblk, + &state->extrablk); + } + if (error) + return(error); /* GROT: attr inconsistent */ + addblk = newblk; + break; +#endif + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + error = xfs_dir_leaf_split(state, oldblk, newblk); + if ((error != 0) && (error != ENOSPC)) { + return(error); /* GROT: dir is inconsistent */ + } + if (!error) { + addblk = newblk; + break; + } + /* + * Entry wouldn't fit, split the leaf again. + */ + state->extravalid = 1; + if (state->inleaf) { + state->extraafter = 0; /* before newblk */ + error = xfs_dir_leaf_split(state, oldblk, + &state->extrablk); + if (error) + return(error); /* GROT: dir incon. */ + addblk = newblk; + } else { + state->extraafter = 1; /* after newblk */ + error = xfs_dir_leaf_split(state, newblk, + &state->extrablk); + if (error) + return(error); /* GROT: dir incon. */ + addblk = newblk; + } + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + error = xfs_dir2_leafn_split(state, oldblk, newblk); + if (error) + return error; + addblk = newblk; + break; + case XFS_DA_NODE_MAGIC: + error = xfs_da_node_split(state, oldblk, newblk, addblk, + max - i, &action); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + if (error) + return(error); /* GROT: dir is inconsistent */ + /* + * Record the newly split block for the next time thru? + */ + if (action) + addblk = newblk; + else + addblk = NULL; + break; + } + + /* + * Update the btree to show the new hashval for this child. + */ + xfs_da_fixhashpath(state, &state->path); + /* + * If we won't need this block again, it's getting dropped + * from the active path by the loop control, so we need + * to mark it done now. + */ + if (i > 0 || !addblk) + xfs_da_buf_done(oldblk->bp); + } + if (!addblk) + return(0); + + /* + * Split the root node. + */ + ASSERT(state->path.active == 0); + oldblk = &state->path.blk[0]; + error = xfs_da_root_split(state, oldblk, addblk); + if (error) { + xfs_da_buf_done(oldblk->bp); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + return(error); /* GROT: dir is inconsistent */ + } + + /* + * Update pointers to the node which used to be block 0 and + * just got bumped because of the addition of a new root node. + * There might be three blocks involved if a double split occurred, + * and the original block 0 could be at any position in the list. + */ + + node = oldblk->bp->data; + if (node->hdr.info.forw) { + if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->data; + INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno); + xfs_da_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + node = oldblk->bp->data; + if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) { + if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->data; + INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno); + xfs_da_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + xfs_da_buf_done(oldblk->bp); + xfs_da_buf_done(addblk->bp); + addblk->bp = NULL; + return(0); +} + +/* + * Split the root. We have to create a new root and point to the two + * parts (the split old root) that we just created. Copy block zero to + * the EOF, extending the inode in process. + */ +STATIC int /* error */ +xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node, *oldroot; + xfs_da_args_t *args; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + int error, size; + xfs_inode_t *dp; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_dir2_leaf_t *leaf; + + /* + * Copy the existing (incorrect) block from the root node position + * to a free space somewhere. + */ + args = state->args; + ASSERT(args != NULL); + error = xfs_da_grow_inode(args, &blkno); + if (error) + return(error); + dp = args->dp; + tp = args->trans; + mp = state->mp; + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + node = bp->data; + oldroot = blk1->bp->data; + if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) { + size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] - + (char *)oldroot); + } else { + ASSERT(XFS_DIR_IS_V2(mp)); + ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)oldroot; + size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] - + (char *)leaf); + } + memcpy(node, oldroot, size); + xfs_da_log_buf(tp, bp, 0, size - 1); + xfs_da_buf_done(blk1->bp); + blk1->bp = bp; + blk1->blkno = blkno; + + /* + * Set up the new root node. + */ + error = xfs_da_node_create(args, + args->whichfork == XFS_DATA_FORK && + XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0, + INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork); + if (error) + return(error); + node = bp->data; + INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval); + INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno); + INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval); + INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno); + INT_SET(node->hdr.count, ARCH_CONVERT, 2); + +#ifdef DEBUG + if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) { + ASSERT(blk1->blkno >= mp->m_dirleafblk && + blk1->blkno < mp->m_dirfreeblk); + ASSERT(blk2->blkno >= mp->m_dirleafblk && + blk2->blkno < mp->m_dirfreeblk); + } +#endif + + /* Header is already logged by xfs_da_node_create */ + xfs_da_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, node->btree, + sizeof(xfs_da_node_entry_t) * 2)); + xfs_da_buf_done(bp); + + return(0); +} + +/* + * Split the node, rebalance, then add the new entry. + */ +STATIC int /* error */ +xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk, + xfs_da_state_blk_t *addblk, + int treelevel, int *result) +{ + xfs_da_intnode_t *node; + xfs_dablk_t blkno; + int newcount, error; + int useextra; + + node = oldblk->bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + + /* + * With V2 the extra block is data or freespace. + */ + useextra = state->extravalid && XFS_DIR_IS_V1(state->mp); + newcount = 1 + useextra; + /* + * Do we have to split the node? + */ + if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > state->node_ents) { + /* + * Allocate a new node, add to the doubly linked chain of + * nodes, then move some of our excess entries into it. + */ + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return(error); /* GROT: dir is inconsistent */ + + error = xfs_da_node_create(state->args, blkno, treelevel, + &newblk->bp, state->args->whichfork); + if (error) + return(error); /* GROT: dir is inconsistent */ + newblk->blkno = blkno; + newblk->magic = XFS_DA_NODE_MAGIC; + xfs_da_node_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + *result = 1; + } else { + *result = 0; + } + + /* + * Insert the new entry(s) into the correct block + * (updating last hashval in the process). + * + * xfs_da_node_add() inserts BEFORE the given index, + * and as a result of using node_lookup_int() we always + * point to a valid entry (not after one), but a split + * operation always results in a new block whose hashvals + * FOLLOW the current block. + * + * If we had double-split op below us, then add the extra block too. + */ + node = oldblk->bp->data; + if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) { + oldblk->index++; + xfs_da_node_add(state, oldblk, addblk); + if (useextra) { + if (state->extraafter) + oldblk->index++; + xfs_da_node_add(state, oldblk, &state->extrablk); + state->extravalid = 0; + } + } else { + newblk->index++; + xfs_da_node_add(state, newblk, addblk); + if (useextra) { + if (state->extraafter) + newblk->index++; + xfs_da_node_add(state, newblk, &state->extrablk); + state->extravalid = 0; + } + } + + return(0); +} + +/* + * Balance the btree elements between two intermediate nodes, + * usually one full and one empty. + * + * NOTE: if blk2 is empty, then it will get the upper half of blk1. + */ +STATIC void +xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_intnode_t *node1, *node2, *tmpnode; + xfs_da_node_entry_t *btree_s, *btree_d; + int count, tmp; + xfs_trans_t *tp; + + node1 = blk1->bp->data; + node2 = blk2->bp->data; + /* + * Figure out how many entries need to move, and in which direction. + * Swap the nodes around if that makes it simpler. + */ + if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) && + ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) { + tmpnode = node1; + node1 = node2; + node2 = tmpnode; + } + ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2; + if (count == 0) + return; + tp = state->args->trans; + /* + * Two cases: high-to-low and low-to-high. + */ + if (count > 0) { + /* + * Move elements in node2 up to make a hole. + */ + if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) { + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node2->btree[count]; + memmove(btree_d, btree_s, tmp); + } + + /* + * Move the req'd B-tree elements from high in node1 to + * low in node2. + */ + INT_MOD(node2->hdr.count, ARCH_CONVERT, count); + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count]; + btree_d = &node2->btree[0]; + memcpy(btree_d, btree_s, tmp); + INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count)); + + } else { + /* + * Move the req'd B-tree elements from low in node2 to + * high in node1. + */ + count = -count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[0]; + btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)]; + memcpy(btree_d, btree_s, tmp); + INT_MOD(node1->hdr.count, ARCH_CONVERT, count); + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, btree_d, tmp)); + + /* + * Move elements in node2 down to fill the hole. + */ + tmp = INT_GET(node2->hdr.count, ARCH_CONVERT) - count; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &node2->btree[count]; + btree_d = &node2->btree[0]; + memmove(btree_d, btree_s, tmp); + INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count)); + } + + /* + * Log header of node 1 and all current bits of node 2. + */ + xfs_da_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + xfs_da_log_buf(tp, blk2->bp, + XFS_DA_LOGRANGE(node2, &node2->hdr, + sizeof(node2->hdr) + + sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT))); + + /* + * Record the last hashval from each block for upward propagation. + * (note: don't use the swapped node pointers) + */ + node1 = blk1->bp->data; + node2 = blk2->bp->data; + blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + + /* + * Adjust the expected index for insertion. + */ + if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) { + blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT); + blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1; /* make it invalid */ + } +} + +/* + * Add a new entry to an intermediate node. + */ +STATIC void +xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + xfs_mount_t *mp; + + node = oldblk->bp->data; + mp = state->mp; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT))); + ASSERT(newblk->blkno != 0); + if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) + ASSERT(newblk->blkno >= mp->m_dirleafblk && + newblk->blkno < mp->m_dirfreeblk); + + /* + * We may need to make some room before we insert the new node. + */ + tmp = 0; + btree = &node->btree[ oldblk->index ]; + if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) { + tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree); + memmove(btree + 1, btree, tmp); + } + INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval); + INT_SET(btree->before, ARCH_CONVERT, newblk->blkno); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); + INT_MOD(node->hdr.count, ARCH_CONVERT, +1); + xfs_da_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the oldblk to propagate upwards. + */ + oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Deallocate an empty leaf node, remove it from its parent, + * possibly deallocating that block, etc... + */ +int +xfs_da_join(xfs_da_state_t *state) +{ + xfs_da_state_blk_t *drop_blk, *save_blk; + int action, error; + + action = 0; + drop_blk = &state->path.blk[ state->path.active-1 ]; + save_blk = &state->altpath.blk[ state->path.active-1 ]; + ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC || + drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp)); + + /* + * Walk back up the tree joining/deallocating as necessary. + * When we stop dropping blocks, break out. + */ + for ( ; state->path.active >= 2; drop_blk--, save_blk--, + state->path.active--) { + /* + * See if we can combine the block with a neighbor. + * (action == 0) => no options, just leave + * (action == 1) => coalesce, then unlink + * (action == 2) => block empty, unlink it + */ + switch (drop_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: +#ifndef __KERNEL__ + error = ENOTTY; +#else + error = xfs_attr_leaf_toosmall(state, &action); +#endif + if (error) + return(error); + if (action == 0) + return(0); +#ifdef __KERNEL__ + xfs_attr_leaf_unbalance(state, drop_blk, save_blk); +#endif + break; + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + error = xfs_dir_leaf_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return(0); + xfs_dir_leaf_unbalance(state, drop_blk, save_blk); + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + error = xfs_dir2_leafn_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_dir2_leafn_unbalance(state, drop_blk, save_blk); + break; + case XFS_DA_NODE_MAGIC: + /* + * Remove the offending node, fixup hashvals, + * check for a toosmall neighbor. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_node_toosmall(state, &action); + if (error) + return(error); + if (action == 0) + return 0; + xfs_da_node_unbalance(state, drop_blk, save_blk); + break; + } + xfs_da_fixhashpath(state, &state->altpath); + error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da_state_kill_altpath(state); + if (error) + return(error); + error = xfs_da_shrink_inode(state->args, drop_blk->blkno, + drop_blk->bp); + drop_blk->bp = NULL; + if (error) + return(error); + } + /* + * We joined all the way to the top. If it turns out that + * we only have one entry in the root, make the child block + * the new root. + */ + xfs_da_node_remove(state, drop_blk); + xfs_da_fixhashpath(state, &state->path); + error = xfs_da_root_join(state, &state->path.blk[0]); + return(error); +} + +/* + * We have only one entry in the root. Copy the only remaining child of + * the old root to block 0 as the new root node. + */ +STATIC int +xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +{ + xfs_da_intnode_t *oldroot; + /* REFERENCED */ + xfs_da_blkinfo_t *blkinfo; + xfs_da_args_t *args; + xfs_dablk_t child; + xfs_dabuf_t *bp; + int error; + + args = state->args; + ASSERT(args != NULL); + ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + oldroot = root_blk->bp->data; + ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT(!oldroot->hdr.info.forw); + ASSERT(!oldroot->hdr.info.back); + + /* + * If the root has more than one child, then don't do anything. + */ + if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1) + return(0); + + /* + * Read in the (only) child block, then copy those bytes into + * the root block's buffer and free the original child block. + */ + child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT); + ASSERT(child != 0); + error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + blkinfo = bp->data; + if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) { + ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) || + INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC); + } else { + ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + } + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); + memcpy(root_blk->bp->data, bp->data, state->blocksize); + xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + error = xfs_da_shrink_inode(args, child, bp); + return(error); +} + +/* + * Check a node block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +STATIC int +xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_da_intnode_t *node; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + node = (xfs_da_intnode_t *)info; + count = INT_GET(node->hdr.count, ARCH_CONVERT); + if (count > (state->node_ents >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); /* blk over 50%, don't try to join */ + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (aribtrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = info->forw; + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + /* start with smaller blk num */ + forward = (INT_GET(info->forw, ARCH_CONVERT) + < INT_GET(info->back, ARCH_CONVERT)); + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = INT_GET(info->forw, ARCH_CONVERT); + else + blkno = INT_GET(info->back, ARCH_CONVERT); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + + node = (xfs_da_intnode_t *)info; + count = state->node_ents; + count -= state->node_ents >> 2; + count -= INT_GET(node->hdr.count, ARCH_CONVERT); + node = bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + count -= INT_GET(node->hdr.count, ARCH_CONVERT); + xfs_da_brelse(state->args->trans, bp); + if (count >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return(0); + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + if (error) { + return(error); + } + if (retval) { + *action = 0; + return(0); + } + } + *action = 1; + return(0); +} + +/* + * Walk back up the tree adjusting hash values as necessary, + * when we stop making changes, return. + */ +void +xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +{ + xfs_da_state_blk_t *blk; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dahash_t lasthash=0; + int level, count; + + level = path->active-1; + blk = &path->blk[ level ]; + switch (blk->magic) { +#ifdef __KERNEL__ + case XFS_ATTR_LEAF_MAGIC: + lasthash = xfs_attr_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; +#endif + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + lasthash = xfs_dir_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DA_NODE_MAGIC: + lasthash = xfs_da_node_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + } + for (blk--, level--; level >= 0; blk--, level--) { + node = blk->bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + btree = &node->btree[ blk->index ]; + if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash) + break; + blk->hashval = lasthash; + INT_SET(btree->hashval, ARCH_CONVERT, lasthash); + xfs_da_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + + lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + } +} + +/* + * Remove an entry from an intermediate node. + */ +STATIC void +xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + int tmp; + + node = drop_blk->bp->data; + ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)); + ASSERT(drop_blk->index >= 0); + + /* + * Copy over the offending entry, or just zero it out. + */ + btree = &node->btree[drop_blk->index]; + if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) { + tmp = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + memmove(btree, btree + 1, tmp); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, tmp)); + btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ]; + } + memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + INT_MOD(node->hdr.count, ARCH_CONVERT, -1); + xfs_da_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + + /* + * Copy the last hash value from the block to propagate upwards. + */ + btree--; + drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT); +} + +/* + * Unbalance the btree elements between two intermediate nodes, + * move all Btree elements from one node into another. + */ +STATIC void +xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_intnode_t *drop_node, *save_node; + xfs_da_node_entry_t *btree; + int tmp; + xfs_trans_t *tp; + + drop_node = drop_blk->bp->data; + save_node = save_blk->bp->data; + ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + tp = state->args->trans; + + /* + * If the dying block has lower hashvals, then move all the + * elements in the remaining block up to make a hole. + */ + if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT))) + { + btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ]; + tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t); + memmove(btree, &save_node->btree[0], tmp); + btree = &save_node->btree[0]; + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + (INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) * + sizeof(xfs_da_node_entry_t))); + } else { + btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ]; + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, btree, + INT_GET(drop_node->hdr.count, ARCH_CONVERT) * + sizeof(xfs_da_node_entry_t))); + } + + /* + * Move all the B-tree elements from drop_blk to save_blk. + */ + tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t); + memcpy(btree, &drop_node->btree[0], tmp); + INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT)); + + xfs_da_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_node->hdr, + sizeof(save_node->hdr))); + + /* + * Save the last hashval in the remaining block for upward propagation. + */ + save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Walk down the Btree looking for a particular filename, filling + * in the state structure as we go. + * + * We will set the state structure to point to each of the elements + * in each of the nodes where either the hashval is or should be. + * + * We support duplicate hashval's so for each entry in the current + * node that could contain the desired hashval, descend. This is a + * pruned depth-first tree search. + */ +int /* error */ +xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *curr; + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dablk_t blkno; + int probe, span, max, error, retval; + xfs_dahash_t hashval; + xfs_da_args_t *args; + + args = state->args; + + /* + * Descend thru the B-tree searching each level for the right + * node to use, until the right hashval is found. + */ + if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp)) + blkno = state->mp->m_dirleafblk; + else + blkno = 0; + for (blk = &state->path.blk[0], state->path.active = 1; + state->path.active <= XFS_DA_NODE_MAXDEPTH; + blk++, state->path.active++) { + /* + * Read the next node down in the tree. + */ + blk->blkno = blkno; + error = xfs_da_read_buf(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); + if (error) { + blk->blkno = 0; + state->path.active--; + return(error); + } + curr = blk->bp->data; + ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC || + INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) || + INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC); + + /* + * Search an intermediate node for a match. + */ + blk->magic = INT_GET(curr->magic, ARCH_CONVERT); + if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) { + node = blk->bp->data; + blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + + /* + * Binary search. (note: small blocks will skip loop) + */ + max = INT_GET(node->hdr.count, ARCH_CONVERT); + probe = span = max / 2; + hashval = args->hashval; + for (btree = &node->btree[probe]; span > 4; + btree = &node->btree[probe]) { + span /= 2; + if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval) + probe += span; + else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) { + btree--; + probe--; + } + while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) { + btree++; + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max-1; + blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT); + } else { + blk->index = probe; + blkno = INT_GET(btree->before, ARCH_CONVERT); + } + } +#ifdef __KERNEL__ + else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) { + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } +#endif + else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) { + blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL); + break; + } + else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) { + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + } + + /* + * A leaf block that ends in the hashval that we are interested in + * (final hashval == search hashval) means that the next block may + * contain more entries with the same hashval, shift upward to the + * next leaf and keep searching. + */ + for (;;) { + if (blk->magic == XFS_DIR_LEAF_MAGIC) { + ASSERT(XFS_DIR_IS_V1(state->mp)); + retval = xfs_dir_leaf_lookup_int(blk->bp, args, + &blk->index); + } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + ASSERT(XFS_DIR_IS_V2(state->mp)); + retval = xfs_dir2_leafn_lookup_int(blk->bp, args, + &blk->index, state); + } +#ifdef __KERNEL__ + else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr_leaf_lookup_int(blk->bp, args); + blk->index = args->index; + args->blkno = blk->blkno; + } +#endif + if (((retval == ENOENT) || (retval == ENOATTR)) && + (blk->hashval == args->hashval)) { + error = xfs_da_path_shift(state, &state->path, 1, 1, + &retval); + if (error) + return(error); + if (retval == 0) { + continue; + } +#ifdef __KERNEL__ + else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + /* path_shift() gives ENOENT */ + retval = XFS_ERROR(ENOATTR); + } +#endif + } + break; + } + *result = retval; + return(0); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Link a new block into a doubly linked list of blocks (of whatever type). + */ +int /* error */ +xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk) +{ + xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; + xfs_da_args_t *args; + int before=0, error; + xfs_dabuf_t *bp; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + old_info = old_blk->bp->data; + new_info = new_blk->bp->data; + ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || + old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) || + old_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT)); + ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT)); + ASSERT(old_blk->magic == new_blk->magic); + + switch (old_blk->magic) { +#ifdef __KERNEL__ + case XFS_ATTR_LEAF_MAGIC: + before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); + break; +#endif + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp); + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + break; + case XFS_DA_NODE_MAGIC: + before = xfs_da_node_order(old_blk->bp, new_blk->bp); + break; + } + + /* + * Link blocks in appropriate order. + */ + if (before) { + /* + * Link new block in before existing block. + */ + INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno); + new_info->back = old_info->back; /* INT_: direct copy */ + if (INT_GET(old_info->back, ARCH_CONVERT)) { + error = xfs_da_read_buf(args->trans, args->dp, + INT_GET(old_info->back, + ARCH_CONVERT), -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT)); + ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno); + INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno); + } else { + /* + * Link new block in after existing block. + */ + new_info->forw = old_info->forw; /* INT_: direct copy */ + INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno); + if (INT_GET(old_info->forw, ARCH_CONVERT)) { + error = xfs_da_read_buf(args->trans, args->dp, + INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) + == INT_GET(old_info->magic, ARCH_CONVERT)); + ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT) + == old_blk->blkno); + INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + xfs_da_buf_done(bp); + } + INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno); + } + + xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + return(0); +} + +/* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) +{ + xfs_da_intnode_t *node1, *node2; + + node1 = node1_bp->data; + node2 = node2_bp->data; + ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) && + (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC)); + if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) && + ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < + INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_da_intnode_t *node; + + node = bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + if (count) + *count = INT_GET(node->hdr.count, ARCH_CONVERT); + if (!node->hdr.count) + return(0); + return(INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)); +} + +/* + * Unlink a block from a doubly linked list of blocks. + */ +int /* error */ +xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; + xfs_da_args_t *args; + xfs_dabuf_t *bp; + int error; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + save_info = save_blk->bp->data; + drop_info = drop_blk->bp->data; + ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || + save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) || + save_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT)); + ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT)); + ASSERT(save_blk->magic == drop_blk->magic); + ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) || + (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno)); + ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) || + (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno)); + + /* + * Unlink the leaf block from the doubly linked chain of leaves. + */ + if (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno) { + save_info->back = drop_info->back; /* INT_: direct copy */ + if (INT_GET(drop_info->back, ARCH_CONVERT)) { + error = xfs_da_read_buf(args->trans, args->dp, + INT_GET(drop_info->back, + ARCH_CONVERT), -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT)); + ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == drop_blk->blkno); + INT_SET(tmp_info->forw, ARCH_CONVERT, save_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + xfs_da_buf_done(bp); + } + } else { + save_info->forw = drop_info->forw; /* INT_: direct copy */ + if (INT_GET(drop_info->forw, ARCH_CONVERT)) { + error = xfs_da_read_buf(args->trans, args->dp, + INT_GET(drop_info->forw, ARCH_CONVERT), -1, &bp, + args->whichfork); + if (error) + return(error); + ASSERT(bp != NULL); + tmp_info = bp->data; + ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) + == INT_GET(save_info->magic, ARCH_CONVERT)); + ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT) + == drop_blk->blkno); + INT_SET(tmp_info->back, ARCH_CONVERT, save_blk->blkno); + xfs_da_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + xfs_da_buf_done(bp); + } + } + + xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + return(0); +} + +/* + * Move a path "forward" or "!forward" one block at the current level. + * + * This routine will adjust a "path" to point to the next block + * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the + * Btree, including updating pointers to the intermediate nodes between + * the new bottom and the root. + */ +int /* error */ +xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result) +{ + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_da_args_t *args; + xfs_dablk_t blkno=0; + int level, error; + + /* + * Roll up the Btree looking for the first block where our + * current index is not at the edge of the block. Note that + * we skip the bottom layer because we want the sibling block. + */ + args = state->args; + ASSERT(args != NULL); + ASSERT(path != NULL); + ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + level = (path->active-1) - 1; /* skip bottom layer in path */ + for (blk = &path->blk[level]; level >= 0; blk--, level--) { + ASSERT(blk->bp != NULL); + node = blk->bp->data; + ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) { + blk->index++; + blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT); + break; + } else if (!forward && (blk->index > 0)) { + blk->index--; + blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT); + break; + } + } + if (level < 0) { + *result = XFS_ERROR(ENOENT); /* we're out of our tree */ + ASSERT(args->oknoent); + return(0); + } + + /* + * Roll down the edge of the subtree until we reach the + * same depth we were at originally. + */ + for (blk++, level++; level < path->active; blk++, level++) { + /* + * Release the old block. + * (if it's dirty, trans won't actually let go) + */ + if (release) + xfs_da_brelse(args->trans, blk->bp); + + /* + * Read the next child block. + */ + blk->blkno = blkno; + error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); + if (error) + return(error); + ASSERT(blk->bp != NULL); + info = blk->bp->data; + ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC || + INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) || + INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC); + blk->magic = INT_GET(info->magic, ARCH_CONVERT); + if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) { + node = (xfs_da_intnode_t *)info; + blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + if (forward) + blk->index = 0; + else + blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1; + blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT); + } else { + ASSERT(level == path->active-1); + blk->index = 0; + switch(blk->magic) { +#ifdef __KERNEL__ + case XFS_ATTR_LEAF_MAGIC: + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, + NULL); + break; +#endif + case XFS_DIR_LEAF_MAGIC: + ASSERT(XFS_DIR_IS_V1(state->mp)); + blk->hashval = xfs_dir_leaf_lasthash(blk->bp, + NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + ASSERT(XFS_DIR_IS_V2(state->mp)); + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, + NULL); + break; + default: + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == + XFS_DIRX_LEAF_MAGIC(state->mp)); + break; + } + } + } + *result = 0; + return(0); +} + + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Implement a simple hash on a character string. + * Rotate the hash value by 7 bits, then XOR each character in. + * This is implemented with some source-level loop unrolling. + */ +xfs_dahash_t +xfs_da_hashname(uchar_t *name, int namelen) +{ + xfs_dahash_t hash; + +#ifdef SLOWVERSION + /* + * This is the old one-byte-at-a-time version. + */ + for (hash = 0; namelen > 0; namelen--) + hash = *name++ ^ rol32(hash, 7); + + return(hash); +#else + /* + * Do four characters at a time as long as we can. + */ + for (hash = 0; namelen >= 4; namelen -= 4, name += 4) + hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^ + (name[3] << 0) ^ rol32(hash, 7 * 4); + + /* + * Now do the rest of the characters. + */ + switch (namelen) { + case 3: + return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^ + rol32(hash, 7 * 3); + case 2: + return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2); + case 1: + return (name[0] << 0) ^ rol32(hash, 7 * 1); + case 0: + return hash; + } + /* NOTREACHED */ +#endif + return 0; /* keep gcc happy */ +} + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno, b; + xfs_bmbt_irec_t map; + xfs_bmbt_irec_t *mapp; + xfs_inode_t *dp; + int nmap, error, w, count, c, got, i, mapi; + xfs_fsize_t size; + xfs_trans_t *tp; + xfs_mount_t *mp; + + dp = args->dp; + mp = dp->i_mount; + w = args->whichfork; + tp = args->trans; + /* + * For new directories adjust the file offset and block count. + */ + if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) { + bno = mp->m_dirleafblk; + count = mp->m_dirblkfsbs; + } else { + bno = 0; + count = 1; + } + /* + * Find a spot in the file space to put the new block. + */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) { + return error; + } + if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) + ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk); + /* + * Try mapping it in one filesystem block. + */ + nmap = 1; + ASSERT(args->firstblock != NULL); + if ((error = xfs_bmapi(tp, dp, bno, count, + XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| + XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) { + return error; + } + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ + else if (nmap == 0 && count > 1) { + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + for (b = bno, mapi = 0; b < bno + count; ) { + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE| + XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp, sizeof(*mapp) * count); + return error; + } + if (nmap < 1) + break; + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } else { + mapi = 0; + mapp = NULL; + } + /* + * Count the blocks we got, make sure it matches the total. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * count); + return XFS_ERROR(ENOSPC); + } + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * count); + *new_blkno = (xfs_dablk_t)bno; + /* + * For version 1 directories, adjust the file size if it changed. + */ + if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) { + ASSERT(mapi == 1); + if ((error = xfs_bmap_last_offset(tp, dp, &bno, w))) + return error; + size = XFS_FSB_TO_B(mp, bno); + if (size != dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * Ick. We need to always be able to remove a btree block, even + * if there's no space reservation because the filesystem is full. + * This is called if xfs_bunmapi on a btree block fails due to ENOSPC. + * It swaps the target block with the last block in the file. The + * last block in the file can always be removed since it can't cause + * a bmap btree split to do that. + */ +STATIC int +xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, + xfs_dabuf_t **dead_bufp) +{ + xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; + xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; + xfs_fileoff_t lastoff; + xfs_inode_t *ip; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error, w, entno, level, dead_level; + xfs_da_blkinfo_t *dead_info, *sib_info; + xfs_da_intnode_t *par_node, *dead_node; + xfs_dir_leafblock_t *dead_leaf; + xfs_dir2_leaf_t *dead_leaf2; + xfs_dahash_t dead_hash; + + dead_buf = *dead_bufp; + dead_blkno = *dead_blknop; + tp = args->trans; + ip = args->dp; + w = args->whichfork; + ASSERT(w == XFS_DATA_FORK); + mp = ip->i_mount; + if (XFS_DIR_IS_V2(mp)) { + lastoff = mp->m_dirfreeblk; + error = xfs_bmap_last_before(tp, ip, &lastoff, w); + } else + error = xfs_bmap_last_offset(tp, ip, &lastoff, w); + if (error) + return error; + if (unlikely(lastoff == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + /* + * Read the last block in the btree space. + */ + last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; + if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + return error; + /* + * Copy the last block into the dead buffer and log it. + */ + memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); + xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + dead_info = dead_buf->data; + /* + * Get values from the moved block. + */ + if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) { + ASSERT(XFS_DIR_IS_V1(mp)); + dead_leaf = (xfs_dir_leafblock_t *)dead_info; + dead_level = 0; + dead_hash = + INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT); + } else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) { + ASSERT(XFS_DIR_IS_V2(mp)); + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + dead_level = 0; + dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT); + } else { + ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC); + dead_node = (xfs_da_intnode_t *)dead_info; + dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT); + dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT); + } + sib_buf = par_buf = NULL; + /* + * If the moved block has a left sibling, fix up the pointers. + */ + if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) { + if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + goto done; + sib_info = sib_buf->data; + if (unlikely( + INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno || + INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno); + xfs_da_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->forw, + sizeof(sib_info->forw))); + xfs_da_buf_done(sib_buf); + sib_buf = NULL; + } + /* + * If the moved block has a right sibling, fix up the pointers. + */ + if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) { + if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + goto done; + sib_info = sib_buf->data; + if (unlikely( + INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno + || INT_GET(sib_info->magic, ARCH_CONVERT) + != INT_GET(dead_info->magic, ARCH_CONVERT))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno); + xfs_da_log_buf(tp, sib_buf, + XFS_DA_LOGRANGE(sib_info, &sib_info->back, + sizeof(sib_info->back))); + xfs_da_buf_done(sib_buf); + sib_buf = NULL; + } + par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk; + level = -1; + /* + * Walk down the tree looking for the parent of the moved block. + */ + for (;;) { + if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + goto done; + par_node = par_buf->data; + if (unlikely( + INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC || + (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + level = INT_GET(par_node->hdr.level, ARCH_CONVERT); + for (entno = 0; + entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) && + INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash; + entno++) + continue; + if (unlikely(entno == INT_GET(par_node->hdr.count, ARCH_CONVERT))) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT); + if (level == dead_level + 1) + break; + xfs_da_brelse(tp, par_buf); + par_buf = NULL; + } + /* + * We're in the right parent block. + * Look for the right entry. + */ + for (;;) { + for (; + entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) && + INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno; + entno++) + continue; + if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT)) + break; + par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT); + xfs_da_brelse(tp, par_buf); + par_buf = NULL; + if (unlikely(par_blkno == 0)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + goto done; + par_node = par_buf->data; + if (unlikely( + INT_GET(par_node->hdr.level, ARCH_CONVERT) != level || + INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)) { + XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", + XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto done; + } + entno = 0; + } + /* + * Update the parent entry pointing to the moved block. + */ + INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno); + xfs_da_log_buf(tp, par_buf, + XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, + sizeof(par_node->btree[entno].before))); + xfs_da_buf_done(par_buf); + xfs_da_buf_done(dead_buf); + *dead_blknop = last_blkno; + *dead_bufp = last_buf; + return 0; +done: + if (par_buf) + xfs_da_brelse(tp, par_buf); + if (sib_buf) + xfs_da_brelse(tp, sib_buf); + xfs_da_brelse(tp, last_buf); + return error; +} + +/* + * Remove a btree block from a directory or attribute. + */ +int +xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + xfs_dabuf_t *dead_buf) +{ + xfs_inode_t *dp; + int done, error, w, count; + xfs_fileoff_t bno; + xfs_fsize_t size; + xfs_trans_t *tp; + xfs_mount_t *mp; + + dp = args->dp; + w = args->whichfork; + tp = args->trans; + mp = dp->i_mount; + if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) + count = mp->m_dirblkfsbs; + else + count = 1; + for (;;) { + /* + * Remove extents. If we get ENOSPC for a dir we have to move + * the last block to the place we want to kill. + */ + if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, + XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, + &done)) == ENOSPC) { + if (w != XFS_DATA_FORK) + goto done; + if ((error = xfs_da_swap_lastblock(args, &dead_blkno, + &dead_buf))) + goto done; + } else if (error) + goto done; + else + break; + } + ASSERT(done); + xfs_da_binval(tp, dead_buf); + /* + * Adjust the directory size for version 1. + */ + if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) { + if ((error = xfs_bmap_last_offset(tp, dp, &bno, w))) + return error; + size = XFS_FSB_TO_B(dp->i_mount, bno); + if (size != dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + } + return 0; +done: + xfs_da_binval(tp, dead_buf); + return error; +} + +/* + * See if the mapping(s) for this btree block are valid, i.e. + * don't contain holes, are logically contiguous, and cover the whole range. + */ +STATIC int +xfs_da_map_covers_blocks( + int nmap, + xfs_bmbt_irec_t *mapp, + xfs_dablk_t bno, + int count) +{ + int i; + xfs_fileoff_t off; + + for (i = 0, off = bno; i < nmap; i++) { + if (mapp[i].br_startblock == HOLESTARTBLOCK || + mapp[i].br_startblock == DELAYSTARTBLOCK) { + return 0; + } + if (off != mapp[i].br_startoff) { + return 0; + } + off += mapp[i].br_blockcount; + } + return off == bno + count; +} + +/* + * Make a dabuf. + * Used for get_buf, read_buf, read_bufr, and reada_buf. + */ +STATIC int +xfs_da_do_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t *mappedbnop, + xfs_dabuf_t **bpp, + int whichfork, + int caller, + inst_t *ra) +{ + xfs_buf_t *bp = NULL; + xfs_buf_t **bplist; + int error=0; + int i; + xfs_bmbt_irec_t map; + xfs_bmbt_irec_t *mapp; + xfs_daddr_t mappedbno; + xfs_mount_t *mp; + int nbplist=0; + int nfsb; + int nmap; + xfs_dabuf_t *rbp; + + mp = dp->i_mount; + if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) + nfsb = mp->m_dirblkfsbs; + else + nfsb = 1; + mappedbno = *mappedbnop; + /* + * Caller doesn't have a mapping. -2 means don't complain + * if we land in a hole. + */ + if (mappedbno == -1 || mappedbno == -2) { + /* + * Optimize the one-block case. + */ + if (nfsb == 1) { + xfs_fsblock_t fsb; + + if ((error = + xfs_bmapi_single(trans, dp, whichfork, &fsb, + (xfs_fileoff_t)bno))) { + return error; + } + mapp = ↦ + if (fsb == NULLFSBLOCK) { + nmap = 0; + } else { + map.br_startblock = fsb; + map.br_startoff = (xfs_fileoff_t)bno; + map.br_blockcount = 1; + nmap = 1; + } + } else { + mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); + nmap = nfsb; + if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, + nfsb, + XFS_BMAPI_METADATA | + XFS_BMAPI_AFLAG(whichfork), + NULL, 0, mapp, &nmap, NULL))) + goto exit0; + } + } else { + map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + map.br_startoff = (xfs_fileoff_t)bno; + map.br_blockcount = nfsb; + mapp = ↦ + nmap = 1; + } + if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { + error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + if (unlikely(error == EFSCORRUPTED)) { + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + int i; + cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", + (long long)bno); + cmn_err(CE_ALERT, "dir: inode %lld\n", + (long long)dp->i_ino); + for (i = 0; i < nmap; i++) { + cmn_err(CE_ALERT, + "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", + i, + (long long)mapp[i].br_startoff, + (long long)mapp[i].br_startblock, + (long long)mapp[i].br_blockcount, + mapp[i].br_state); + } + } + XFS_ERROR_REPORT("xfs_da_do_buf(1)", + XFS_ERRLEVEL_LOW, mp); + } + goto exit0; + } + if (caller != 3 && nmap > 1) { + bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); + nbplist = 0; + } else + bplist = NULL; + /* + * Turn the mapping(s) into buffer(s). + */ + for (i = 0; i < nmap; i++) { + int nmapped; + + mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); + if (i == 0) + *mappedbnop = mappedbno; + nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); + switch (caller) { + case 0: + bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, + mappedbno, nmapped, 0); + error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); + break; + case 1: +#ifndef __KERNEL__ + case 2: +#endif + bp = NULL; + error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, + mappedbno, nmapped, 0, &bp); + break; +#ifdef __KERNEL__ + case 3: + xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); + error = 0; + bp = NULL; + break; +#endif + } + if (error) { + if (bp) + xfs_trans_brelse(trans, bp); + goto exit1; + } + if (!bp) + continue; + if (caller == 1) { + if (whichfork == XFS_ATTR_FORK) { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, + XFS_ATTR_BTREE_REF); + } else { + XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, + XFS_DIR_BTREE_REF); + } + } + if (bplist) { + bplist[nbplist++] = bp; + } + } + /* + * Build a dabuf structure. + */ + if (bplist) { + rbp = xfs_da_buf_make(nbplist, bplist, ra); + } else if (bp) + rbp = xfs_da_buf_make(1, &bp, ra); + else + rbp = NULL; + /* + * For read_buf, check the magic number. + */ + if (caller == 1) { + xfs_dir2_data_t *data; + xfs_dir2_free_t *free; + xfs_da_blkinfo_t *info; + uint magic, magic1; + + info = rbp->data; + data = rbp->data; + free = rbp->data; + magic = INT_GET(info->magic, ARCH_CONVERT); + magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT); + if (unlikely( + XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && + (magic != XFS_DIR_LEAF_MAGIC) && + (magic != XFS_ATTR_LEAF_MAGIC) && + (magic != XFS_DIR2_LEAF1_MAGIC) && + (magic != XFS_DIR2_LEAFN_MAGIC) && + (magic1 != XFS_DIR2_BLOCK_MAGIC) && + (magic1 != XFS_DIR2_DATA_MAGIC) && + (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC), + mp, XFS_ERRTAG_DA_READ_BUF, + XFS_RANDOM_DA_READ_BUF))) { + xfs_buftrace("DA READ ERROR", rbp->bps[0]); + XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", + XFS_ERRLEVEL_LOW, mp, info); + error = XFS_ERROR(EFSCORRUPTED); + xfs_da_brelse(trans, rbp); + nbplist = 0; + goto exit1; + } + } + if (bplist) { + kmem_free(bplist, sizeof(*bplist) * nmap); + } + if (mapp != &map) { + kmem_free(mapp, sizeof(*mapp) * nfsb); + } + if (bpp) + *bpp = rbp; + return 0; +exit1: + if (bplist) { + for (i = 0; i < nbplist; i++) + xfs_trans_brelse(trans, bplist[i]); + kmem_free(bplist, sizeof(*bplist) * nmap); + } +exit0: + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * nfsb); + if (bpp) + *bpp = NULL; + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0, + (inst_t *)__return_address); +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, + int whichfork) +{ + return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1, + (inst_t *)__return_address); +} + +/* + * Readahead the dir/attr block. + */ +xfs_daddr_t +xfs_da_reada_buf( + xfs_trans_t *trans, + xfs_inode_t *dp, + xfs_dablk_t bno, + int whichfork) +{ + xfs_daddr_t rval; + + rval = -1; + if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3, + (inst_t *)__return_address)) + return -1; + else + return rval; +} + +/* + * Calculate the number of bits needed to hold i different values. + */ +uint +xfs_da_log2_roundup(uint i) +{ + uint rval; + + for (rval = 0; rval < NBBY * sizeof(i); rval++) { + if ((1 << rval) >= i) + break; + } + return(rval); +} + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ +kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) { + if (state->altpath.blk[i].bp) { + if (state->altpath.blk[i].bp != state->path.blk[i].bp) + xfs_da_buf_done(state->altpath.blk[i].bp); + state->altpath.blk[i].bp = NULL; + } + } + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + int i; + + xfs_da_state_kill_altpath(state); + for (i = 0; i < state->path.active; i++) { + if (state->path.blk[i].bp) + xfs_da_buf_done(state->path.blk[i].bp); + } + if (state->extravalid && state->extrablk.bp) + xfs_da_buf_done(state->extrablk.bp); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +#ifdef XFS_DABUF_DEBUG +xfs_dabuf_t *xfs_dabuf_global_list; +lock_t xfs_dabuf_global_lock; +#endif + +/* + * Create a dabuf. + */ +/* ARGSUSED */ +STATIC xfs_dabuf_t * +xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) +{ + xfs_buf_t *bp; + xfs_dabuf_t *dabuf; + int i; + int off; + + if (nbuf == 1) + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); + else + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); + dabuf->dirty = 0; +#ifdef XFS_DABUF_DEBUG + dabuf->ra = ra; + dabuf->target = XFS_BUF_TARGET(bps[0]); + dabuf->blkno = XFS_BUF_ADDR(bps[0]); +#endif + if (nbuf == 1) { + dabuf->nbuf = 1; + bp = bps[0]; + dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->data = XFS_BUF_PTR(bp); + dabuf->bps[0] = bp; + } else { + dabuf->nbuf = nbuf; + for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { + dabuf->bps[i] = bp = bps[i]; + dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + } + dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); + for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + bp = bps[i]; + memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), + XFS_BUF_COUNT(bp)); + } + } +#ifdef XFS_DABUF_DEBUG + { + SPLDECL(s); + xfs_dabuf_t *p; + + s = mutex_spinlock(&xfs_dabuf_global_lock); + for (p = xfs_dabuf_global_list; p; p = p->next) { + ASSERT(p->blkno != dabuf->blkno || + p->target != dabuf->target); + } + dabuf->prev = NULL; + if (xfs_dabuf_global_list) + xfs_dabuf_global_list->prev = dabuf; + dabuf->next = xfs_dabuf_global_list; + xfs_dabuf_global_list = dabuf; + mutex_spinunlock(&xfs_dabuf_global_lock, s); + } +#endif + return dabuf; +} + +/* + * Un-dirty a dabuf. + */ +STATIC void +xfs_da_buf_clean(xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + int i; + int off; + + if (dabuf->dirty) { + ASSERT(dabuf->nbuf > 1); + dabuf->dirty = 0; + for (i = off = 0; i < dabuf->nbuf; + i++, off += XFS_BUF_COUNT(bp)) { + bp = dabuf->bps[i]; + memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, + XFS_BUF_COUNT(bp)); + } + } +} + +/* + * Release a dabuf. + */ +void +xfs_da_buf_done(xfs_dabuf_t *dabuf) +{ + ASSERT(dabuf); + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if (dabuf->dirty) + xfs_da_buf_clean(dabuf); + if (dabuf->nbuf > 1) + kmem_free(dabuf->data, BBTOB(dabuf->bbcount)); +#ifdef XFS_DABUF_DEBUG + { + SPLDECL(s); + + s = mutex_spinlock(&xfs_dabuf_global_lock); + if (dabuf->prev) + dabuf->prev->next = dabuf->next; + else + xfs_dabuf_global_list = dabuf->next; + if (dabuf->next) + dabuf->next->prev = dabuf->prev; + mutex_spinunlock(&xfs_dabuf_global_lock, s); + } + memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf)); +#endif + if (dabuf->nbuf == 1) + kmem_zone_free(xfs_dabuf_zone, dabuf); + else + kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf)); +} + +/* + * Log transaction from a dabuf. + */ +void +xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) +{ + xfs_buf_t *bp; + uint f; + int i; + uint l; + int off; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if (dabuf->nbuf == 1) { + ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); + xfs_trans_log_buf(tp, dabuf->bps[0], first, last); + return; + } + dabuf->dirty = 1; + ASSERT(first <= last); + for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + bp = dabuf->bps[i]; + f = off; + l = f + XFS_BUF_COUNT(bp) - 1; + if (f < first) + f = first; + if (l > last) + l = last; + if (f <= l) + xfs_trans_log_buf(tp, bp, f - off, l - off); + /* + * B_DONE is set by xfs_trans_log buf. + * If we don't set it on a new buffer (get not read) + * then if we don't put anything in the buffer it won't + * be set, and at commit it it released into the cache, + * and then a read will fail. + */ + else if (!(XFS_BUF_ISDONE(bp))) + XFS_BUF_DONE(bp); + } + ASSERT(last < off); +} + +/* + * Release dabuf from a transaction. + * Have to free up the dabuf before the buffers are released, + * since the synchronization on the dabuf is really the lock on the buffer. + */ +void +xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + xfs_buf_t **bplist; + int i; + int nbuf; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if ((nbuf = dabuf->nbuf) == 1) { + bplist = &bp; + bp = dabuf->bps[0]; + } else { + bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); + memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + } + xfs_da_buf_done(dabuf); + for (i = 0; i < nbuf; i++) + xfs_trans_brelse(tp, bplist[i]); + if (bplist != &bp) + kmem_free(bplist, nbuf * sizeof(*bplist)); +} + +/* + * Invalidate dabuf from a transaction. + */ +void +xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +{ + xfs_buf_t *bp; + xfs_buf_t **bplist; + int i; + int nbuf; + + ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); + if ((nbuf = dabuf->nbuf) == 1) { + bplist = &bp; + bp = dabuf->bps[0]; + } else { + bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); + memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + } + xfs_da_buf_done(dabuf); + for (i = 0; i < nbuf; i++) + xfs_trans_binval(tp, bplist[i]); + if (bplist != &bp) + kmem_free(bplist, nbuf * sizeof(*bplist)); +} + +/* + * Get the first daddr from a dabuf. + */ +xfs_daddr_t +xfs_da_blkno(xfs_dabuf_t *dabuf) +{ + ASSERT(dabuf->nbuf); + ASSERT(dabuf->data); + return XFS_BUF_ADDR(dabuf->bps[0]); +} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h new file mode 100644 index 00000000000..9fc699d9699 --- /dev/null +++ b/fs/xfs/xfs_da_btree.h @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DA_BTREE_H__ +#define __XFS_DA_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; +struct zone; + +/*======================================================================== + * Directory Structure when greater than XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * Is is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. + */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_DIR_LEAF_MAGIC 0xfeeb /* magic number: directory leaf blks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +#define XFS_DIRX_LEAF_MAGIC(mp) \ + (XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC) + +typedef struct xfs_da_blkinfo { + xfs_dablk_t forw; /* previous block in list */ + xfs_dablk_t back; /* following block in list */ + __uint16_t magic; /* validity check on block */ + __uint16_t pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __uint16_t count; /* count of active entries */ + __uint16_t level; /* level above leaves (leaf == 0) */ + } hdr; + struct xfs_da_node_entry { + xfs_dahash_t hashval; /* hash value for this descendant */ + xfs_dablk_t before; /* Btree block before this key */ + } btree[1]; /* variable sized array of keys */ +} xfs_da_intnode_t; +typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; +typedef struct xfs_da_node_entry xfs_da_node_entry_t; + +#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */ + +/* + * Macros used by directory code to interface to the filesystem. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBSIZE) +int xfs_lbsize(struct xfs_mount *mp); +#define XFS_LBSIZE(mp) xfs_lbsize(mp) +#else +#define XFS_LBSIZE(mp) ((mp)->m_sb.sb_blocksize) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBLOG) +int xfs_lblog(struct xfs_mount *mp); +#define XFS_LBLOG(mp) xfs_lblog(mp) +#else +#define XFS_LBLOG(mp) ((mp)->m_sb.sb_blocklog) +#endif + +/* + * Macros used by directory code to interface to the kernel + */ + +/* + * Macros used to manipulate directory off_t's + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_BNOENTRY) +__uint32_t xfs_da_make_bnoentry(struct xfs_mount *mp, xfs_dablk_t bno, + int entry); +#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \ + xfs_da_make_bnoentry(mp,bno,entry) +#else +#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \ + (((bno) << (mp)->m_dircook_elog) | (entry)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_COOKIE) +xfs_off_t xfs_da_make_cookie(struct xfs_mount *mp, xfs_dablk_t bno, int entry, + xfs_dahash_t hash); +#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \ + xfs_da_make_cookie(mp,bno,entry,hash) +#else +#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \ + (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_HASH) +xfs_dahash_t xfs_da_cookie_hash(struct xfs_mount *mp, xfs_off_t cookie); +#define XFS_DA_COOKIE_HASH(mp,cookie) xfs_da_cookie_hash(mp,cookie) +#else +#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)(cookie)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_BNO) +xfs_dablk_t xfs_da_cookie_bno(struct xfs_mount *mp, xfs_off_t cookie); +#define XFS_DA_COOKIE_BNO(mp,cookie) xfs_da_cookie_bno(mp,cookie) +#else +#define XFS_DA_COOKIE_BNO(mp,cookie) \ + (((xfs_off_t)(cookie) >> 31) == -1LL ? \ + (xfs_dablk_t)0 : \ + (xfs_dablk_t)((xfs_off_t)(cookie) >> ((mp)->m_dircook_elog + 32))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_ENTRY) +int xfs_da_cookie_entry(struct xfs_mount *mp, xfs_off_t cookie); +#define XFS_DA_COOKIE_ENTRY(mp,cookie) xfs_da_cookie_entry(mp,cookie) +#else +#define XFS_DA_COOKIE_ENTRY(mp,cookie) \ + (((xfs_off_t)(cookie) >> 31) == -1LL ? \ + (xfs_dablk_t)0 : \ + (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \ + ((1 << (mp)->m_dircook_elog) - 1))) +#endif + + +/*======================================================================== + * Btree searching and modification structure definitions. + *========================================================================*/ + +/* + * Structure to ease passing around component names. + */ +typedef struct xfs_da_args { + uchar_t *name; /* string (maybe not NULL terminated) */ + int namelen; /* length of string (maybe no NULL) */ + uchar_t *value; /* set of bytes (maybe contain NULLs) */ + int valuelen; /* length of value */ + int flags; /* argument flags (eg: ATTR_NOCREATE) */ + xfs_dahash_t hashval; /* hash value of name */ + xfs_ino_t inumber; /* input/output inode number */ + struct xfs_inode *dp; /* directory inode to manipulate */ + xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */ + struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */ + struct xfs_trans *trans; /* current trans (changes over time) */ + xfs_extlen_t total; /* total blocks needed, for 1st bmap */ + int whichfork; /* data or attribute fork */ + xfs_dablk_t blkno; /* blkno of attr leaf of interest */ + int index; /* index of attr of interest in blk */ + xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ + int rmtblkcnt; /* remote attr value block count */ + xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ + int index2; /* index of 2nd attr in blk */ + xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ + int rmtblkcnt2; /* remote attr value block count */ + unsigned char justcheck; /* T/F: check for ok with no space */ + unsigned char rename; /* T/F: this is an atomic rename op */ + unsigned char addname; /* T/F: this is an add operation */ + unsigned char oknoent; /* T/F: ok to return ENOENT, else die */ +} xfs_da_args_t; + +/* + * Structure to describe buffer(s) for a block. + * This is needed in the directory version 2 format case, when + * multiple non-contiguous fsblocks might be needed to cover one + * logical directory block. + * If the buffer count is 1 then the data pointer points to the + * same place as the b_addr field for the buffer, else to kmem_alloced memory. + */ +typedef struct xfs_dabuf { + int nbuf; /* number of buffer pointers present */ + short dirty; /* data needs to be copied back */ + short bbcount; /* how large is data in bbs */ + void *data; /* pointer for buffers' data */ +#ifdef XFS_DABUF_DEBUG + inst_t *ra; /* return address of caller to make */ + struct xfs_dabuf *next; /* next in global chain */ + struct xfs_dabuf *prev; /* previous in global chain */ + struct xfs_buftarg *target; /* device for buffer */ + xfs_daddr_t blkno; /* daddr first in bps[0] */ +#endif + struct xfs_buf *bps[1]; /* actually nbuf of these */ +} xfs_dabuf_t; +#define XFS_DA_BUF_SIZE(n) \ + (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) + +#ifdef XFS_DABUF_DEBUG +extern xfs_dabuf_t *xfs_dabuf_global_list; +#endif + +/* + * Storage for holding state during Btree searches and split/join ops. + * + * Only need space for 5 intermediate nodes. With a minimum of 62-way + * fanout to the Btree, we can support over 900 million directory blocks, + * which is slightly more than enough. + */ +typedef struct xfs_da_state_blk { + xfs_dabuf_t *bp; /* buffer containing block */ + xfs_dablk_t blkno; /* filesystem blkno of buffer */ + xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ + int index; /* relevant index into block */ + xfs_dahash_t hashval; /* last hash value in block */ + int magic; /* blk's magic number, ie: blk type */ +} xfs_da_state_blk_t; + +typedef struct xfs_da_state_path { + int active; /* number of active levels */ + xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH]; +} xfs_da_state_path_t; + +typedef struct xfs_da_state { + xfs_da_args_t *args; /* filename arguments */ + struct xfs_mount *mp; /* filesystem mount point */ + unsigned int blocksize; /* logical block size */ + unsigned int node_ents; /* how many entries in danode */ + xfs_da_state_path_t path; /* search/split paths */ + xfs_da_state_path_t altpath; /* alternate path for join */ + unsigned char inleaf; /* insert into 1->lf, 0->splf */ + unsigned char extravalid; /* T/F: extrablk is in use */ + unsigned char extraafter; /* T/F: extrablk is after new */ + xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ + /* for dirv2 extrablk is data */ +} xfs_da_state_t; + +/* + * Utility macros to aid in logging changed structure fields. + */ +#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE)) +#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ + (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) + + +#ifdef __KERNEL__ +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, + xfs_dabuf_t **bpp, int whichfork); +int xfs_da_split(xfs_da_state_t *state); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_da_join(xfs_da_state_t *state); +void xfs_da_fixhashpath(xfs_da_state_t *state, + xfs_da_state_path_t *path_to_to_fix); + +/* + * Routines used for finding things in the Btree. + */ +int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, + int forward, int release, int *result); +/* + * Utility routines. + */ +int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk); +int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, + xfs_da_state_blk_t *new_blk); + +/* + * Utility routines. + */ +int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); +int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + xfs_dabuf_t **bp, int whichfork); +int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + xfs_dabuf_t **bpp, int whichfork); +xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, + xfs_dablk_t bno, int whichfork); +int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, + xfs_dabuf_t *dead_buf); + +uint xfs_da_hashname(uchar_t *name_string, int name_length); +uint xfs_da_log2_roundup(uint i); +xfs_da_state_t *xfs_da_state_alloc(void); +void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_kill_altpath(xfs_da_state_t *state); + +void xfs_da_buf_done(xfs_dabuf_t *dabuf); +void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, + uint last); +void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); +void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); +xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); + +extern struct kmem_zone *xfs_da_state_zone; +#endif /* __KERNEL__ */ + +#endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c new file mode 100644 index 00000000000..08d551a1734 --- /dev/null +++ b/fs/xfs/xfs_dfrag.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_dfrag.h" +#include "xfs_error.h" +#include "xfs_mac.h" +#include "xfs_rw.h" + +/* + * Syssgi interface for swapext + */ +int +xfs_swapext( + xfs_swapext_t __user *sxp) +{ + xfs_swapext_t sx; + xfs_inode_t *ip=NULL, *tip=NULL, *ips[2]; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_bstat_t *sbp; + struct file *fp = NULL, *tfp = NULL; + vnode_t *vp, *tvp; + bhv_desc_t *bdp, *tbdp; + vn_bhv_head_t *bhp, *tbhp; + uint lock_flags=0; + int ilf_fields, tilf_fields; + int error = 0; + xfs_ifork_t tempif, *ifp, *tifp; + __uint64_t tmp; + int aforkblks = 0; + int taforkblks = 0; + int locked = 0; + + if (copy_from_user(&sx, sxp, sizeof(sx))) + return XFS_ERROR(EFAULT); + + /* Pull information for the target fd */ + if (((fp = fget((int)sx.sx_fdtarget)) == NULL) || + ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + bhp = VN_BHV_HEAD(vp); + bdp = vn_bhv_lookup(bhp, &xfs_vnodeops); + if (bdp == NULL) { + error = XFS_ERROR(EBADF); + goto error0; + } else { + ip = XFS_BHVTOI(bdp); + } + + if (((tfp = fget((int)sx.sx_fdtmp)) == NULL) || + ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + tbhp = VN_BHV_HEAD(tvp); + tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops); + if (tbdp == NULL) { + error = XFS_ERROR(EBADF); + goto error0; + } else { + tip = XFS_BHVTOI(tbdp); + } + + if (ip->i_mount != tip->i_mount) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + mp = ip->i_mount; + + sbp = &sx.sx_stat; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto error0; + } + + locked = 1; + + /* Lock in i_ino order */ + if (ip->i_ino < tip->i_ino) { + ips[0] = ip; + ips[1] = tip; + } else { + ips[0] = tip; + ips[1] = ip; + } + lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; + xfs_lock_inodes(ips, 2, 0, lock_flags); + + /* Check permissions */ + error = xfs_iaccess(ip, S_IWUSR, NULL); + if (error) + goto error0; + + error = xfs_iaccess(tip, S_IWUSR, NULL); + if (error) + goto error0; + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* Verify both files are either real-time or non-realtime */ + if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != + (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + if (VN_CACHED(tvp) != 0) + xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore), + (loff_t)0, 0, 0); + + /* Verify O_DIRECT for ftmp */ + if (VN_CACHED(tvp) != 0) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* Verify all data are being swapped */ + if (sx.sx_offset != 0 || + sx.sx_length != ip->i_d.di_size || + sx.sx_length != tip->i_d.di_size) { + error = XFS_ERROR(EFAULT); + goto error0; + } + + /* + * If the target has extended attributes, the tmp file + * must also in order to ensure the correct data fork + * format. + */ + if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { + error = XFS_ERROR(EINVAL); + goto error0; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || + (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || + (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || + (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + error = XFS_ERROR(EBUSY); + goto error0; + } + + /* We need to fail if the file is memory mapped. Once we have tossed + * all existing pages, the page fault will have no option + * but to go to the filesystem for pages. By making the page fault call + * VOP_READ (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(vp)) { + error = XFS_ERROR(EBUSY); + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. + */ + + VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + if ((error = xfs_trans_reserve(tp, 0, + XFS_ICHANGE_LOG_RES(mp), 0, + 0, 0))) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + return error; + } + xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) { + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + xfs_trans_cancel(tp, 0); + return error; + } + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) { + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + xfs_trans_cancel(tp, 0); + return error; + } + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + tempif = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = tempif; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + ilf_fields = XFS_ILOG_CORE; + + switch(ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + ilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ilf_fields |= XFS_ILOG_DBROOT; + break; + } + + tilf_fields = XFS_ILOG_CORE; + + switch(tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + tilf_fields |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + tilf_fields |= XFS_ILOG_DBROOT; + break; + } + + /* + * Increment vnode ref counts since xfs_trans_commit & + * xfs_trans_cancel will both unlock the inodes and + * decrement the associated ref counts. + */ + VN_HOLD(vp); + VN_HOLD(tvp); + + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + + xfs_trans_log_inode(tp, ip, ilf_fields); + xfs_trans_log_inode(tp, tip, tilf_fields); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) { + xfs_trans_set_sync(tp); + } + + error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL); + + fput(fp); + fput(tfp); + + return error; + + error0: + if (locked) { + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + } + + if (fp != NULL) fput(fp); + if (tfp != NULL) fput(tfp); + + return error; +} diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h new file mode 100644 index 00000000000..904860594b8 --- /dev/null +++ b/fs/xfs/xfs_dfrag.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DFRAG_H__ +#define __XFS_DFRAG_H__ + +/* + * Structure passed to xfs_swapext + */ + +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; + +/* + * Version flag + */ +#define XFS_SX_VERSION 0 + +#ifdef __KERNEL__ +/* + * Prototypes for visible xfs_dfrag.c routines. + */ + +/* + * Syscall interface for xfs_swapext + */ +int xfs_swapext(struct xfs_swapext __user *sx); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h new file mode 100644 index 00000000000..f5c932b064e --- /dev/null +++ b/fs/xfs/xfs_dinode.h @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DINODE_H__ +#define __XFS_DINODE_H__ + +struct xfs_buf; +struct xfs_mount; + +#define XFS_DINODE_VERSION_1 1 +#define XFS_DINODE_VERSION_2 2 +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DINODE_GOOD_VERSION) +int xfs_dinode_good_version(int v); +#define XFS_DINODE_GOOD_VERSION(v) xfs_dinode_good_version(v) +#else +#define XFS_DINODE_GOOD_VERSION(v) (((v) == XFS_DINODE_VERSION_1) || \ + ((v) == XFS_DINODE_VERSION_2)) +#endif +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ + +/* + * Disk inode structure. + * This is just the header; the inode is expanded to fill a variable size + * with the last field expanding. It is split into the core and "other" + * because we only need the core part in the in-core inode. + */ +typedef struct xfs_timestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * Note: Coordinate changes to this structure with the XFS_DI_* #defines + * below and the offsets table in xfs_ialloc_log_di(). + */ +typedef struct xfs_dinode_core +{ + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid; /* owner's project id */ + __uint8_t di_pad[8]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ +} xfs_dinode_core_t; + +#define DI_MAX_FLUSH 0xffff + +typedef struct xfs_dinode +{ + xfs_dinode_core_t di_core; + /* + * In adding anything between the core and the union, be + * sure to update the macros like XFS_LITINO below and + * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h. + */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + union { + xfs_bmdr_block_t di_bmbt; /* btree root block */ + xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */ + xfs_dir_shortform_t di_dirsf; /* shortform directory */ + xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */ + char di_c[1]; /* local contents */ + xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */ + uuid_t di_muuid; /* mount point value */ + char di_symlink[1]; /* local symbolic link */ + } di_u; + union { + xfs_bmdr_block_t di_abmbt; /* btree root block */ + xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */ + xfs_attr_shortform_t di_attrsf; /* shortform attribute list */ + } di_a; +} xfs_dinode_t; + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Bit names for logging disk inodes only + */ +#define XFS_DI_MAGIC 0x0000001 +#define XFS_DI_MODE 0x0000002 +#define XFS_DI_VERSION 0x0000004 +#define XFS_DI_FORMAT 0x0000008 +#define XFS_DI_ONLINK 0x0000010 +#define XFS_DI_UID 0x0000020 +#define XFS_DI_GID 0x0000040 +#define XFS_DI_NLINK 0x0000080 +#define XFS_DI_PROJID 0x0000100 +#define XFS_DI_PAD 0x0000200 +#define XFS_DI_ATIME 0x0000400 +#define XFS_DI_MTIME 0x0000800 +#define XFS_DI_CTIME 0x0001000 +#define XFS_DI_SIZE 0x0002000 +#define XFS_DI_NBLOCKS 0x0004000 +#define XFS_DI_EXTSIZE 0x0008000 +#define XFS_DI_NEXTENTS 0x0010000 +#define XFS_DI_NAEXTENTS 0x0020000 +#define XFS_DI_FORKOFF 0x0040000 +#define XFS_DI_AFORMAT 0x0080000 +#define XFS_DI_DMEVMASK 0x0100000 +#define XFS_DI_DMSTATE 0x0200000 +#define XFS_DI_FLAGS 0x0400000 +#define XFS_DI_GEN 0x0800000 +#define XFS_DI_NEXT_UNLINKED 0x1000000 +#define XFS_DI_U 0x2000000 +#define XFS_DI_A 0x4000000 +#define XFS_DI_NUM_BITS 27 +#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1) +#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A)) + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt +{ + XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ + XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ + /* LNK: di_symlink */ + XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ + XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */ + XFS_DINODE_FMT_UUID /* MNT: di_uuid */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. + */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LITINO) +int xfs_litino(struct xfs_mount *mp); +#define XFS_LITINO(mp) xfs_litino(mp) +#else +#define XFS_LITINO(mp) ((mp)->m_litino) +#endif +#define XFS_BROOT_SIZE_ADJ \ + (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) + +/* + * Fork identifiers. Here so utilities can use them without including + * xfs_inode.h. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* + * Inode data & attribute fork sizes, per inode. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_Q) +int xfs_cfork_q_disk(xfs_dinode_core_t *dcp); +int xfs_cfork_q(xfs_dinode_core_t *dcp); +#define XFS_CFORK_Q_DISK(dcp) xfs_cfork_q_disk(dcp) +#define XFS_CFORK_Q(dcp) xfs_cfork_q(dcp) +#else +#define XFS_CFORK_Q_DISK(dcp) ((dcp)->di_forkoff != 0) +#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_BOFF) +int xfs_cfork_boff_disk(xfs_dinode_core_t *dcp); +int xfs_cfork_boff(xfs_dinode_core_t *dcp); +#define XFS_CFORK_BOFF_DISK(dcp) xfs_cfork_boff_disk(dcp) +#define XFS_CFORK_BOFF(dcp) xfs_cfork_boff(dcp) +#else +#define XFS_CFORK_BOFF_DISK(dcp) ((int)(INT_GET((dcp)->di_forkoff, ARCH_CONVERT) << 3)) +#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3)) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_DSIZE) +int xfs_cfork_dsize_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp); +int xfs_cfork_dsize(xfs_dinode_core_t *dcp, struct xfs_mount *mp); +#define XFS_CFORK_DSIZE_DISK(dcp,mp) xfs_cfork_dsize_disk(dcp,mp) +#define XFS_CFORK_DSIZE(dcp,mp) xfs_cfork_dsize(dcp,mp) +#else +#define XFS_CFORK_DSIZE_DISK(dcp,mp) \ + (XFS_CFORK_Q_DISK(dcp) ? XFS_CFORK_BOFF_DISK(dcp) : XFS_LITINO(mp)) +#define XFS_CFORK_DSIZE(dcp,mp) \ + (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp)) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_ASIZE) +int xfs_cfork_asize_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp); +int xfs_cfork_asize(xfs_dinode_core_t *dcp, struct xfs_mount *mp); +#define XFS_CFORK_ASIZE_DISK(dcp,mp) xfs_cfork_asize_disk(dcp,mp) +#define XFS_CFORK_ASIZE(dcp,mp) xfs_cfork_asize(dcp,mp) +#else +#define XFS_CFORK_ASIZE_DISK(dcp,mp) \ + (XFS_CFORK_Q_DISK(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_DISK(dcp) : 0) +#define XFS_CFORK_ASIZE(dcp,mp) \ + (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_SIZE) +int xfs_cfork_size_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w); +int xfs_cfork_size(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w); +#define XFS_CFORK_SIZE_DISK(dcp,mp,w) xfs_cfork_size_disk(dcp,mp,w) +#define XFS_CFORK_SIZE(dcp,mp,w) xfs_cfork_size(dcp,mp,w) +#else +#define XFS_CFORK_SIZE_DISK(dcp,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_CFORK_DSIZE_DISK(dcp, mp) : \ + XFS_CFORK_ASIZE_DISK(dcp, mp)) +#define XFS_CFORK_SIZE(dcp,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp)) + +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DSIZE) +int xfs_dfork_dsize(xfs_dinode_t *dip, struct xfs_mount *mp); +#define XFS_DFORK_DSIZE(dip,mp) xfs_dfork_dsize(dip,mp) +#else +#define XFS_DFORK_DSIZE(dip,mp) XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_ASIZE) +int xfs_dfork_asize(xfs_dinode_t *dip, struct xfs_mount *mp); +#define XFS_DFORK_ASIZE(dip,mp) xfs_dfork_asize(dip,mp) +#else +#define XFS_DFORK_ASIZE(dip,mp) XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_SIZE) +int xfs_dfork_size(xfs_dinode_t *dip, struct xfs_mount *mp, int w); +#define XFS_DFORK_SIZE(dip,mp,w) xfs_dfork_size(dip,mp,w) +#else +#define XFS_DFORK_SIZE(dip,mp,w) XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w) + +#endif + +/* + * Macros for accessing per-fork disk inode information. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_Q) +int xfs_dfork_q(xfs_dinode_t *dip); +#define XFS_DFORK_Q(dip) xfs_dfork_q(dip) +#else +#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_BOFF) +int xfs_dfork_boff(xfs_dinode_t *dip); +#define XFS_DFORK_BOFF(dip) xfs_dfork_boff(dip) +#else +#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DPTR) +char *xfs_dfork_dptr(xfs_dinode_t *dip); +#define XFS_DFORK_DPTR(dip) xfs_dfork_dptr(dip) +#else +#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_APTR) +char *xfs_dfork_aptr(xfs_dinode_t *dip); +#define XFS_DFORK_APTR(dip) xfs_dfork_aptr(dip) +#else +#define XFS_DFORK_APTR(dip) ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_PTR) +char *xfs_dfork_ptr(xfs_dinode_t *dip, int w); +#define XFS_DFORK_PTR(dip,w) xfs_dfork_ptr(dip,w) +#else +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FORMAT) +int xfs_cfork_format(xfs_dinode_core_t *dcp, int w); +#define XFS_CFORK_FORMAT(dcp,w) xfs_cfork_format(dcp,w) +#else +#define XFS_CFORK_FORMAT(dcp,w) \ + ((w) == XFS_DATA_FORK ? (dcp)->di_format : (dcp)->di_aformat) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FMT_SET) +void xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n); +#define XFS_CFORK_FMT_SET(dcp,w,n) xfs_cfork_fmt_set(dcp,w,n) +#else +#define XFS_CFORK_FMT_SET(dcp,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((dcp)->di_format = (n)) : \ + ((dcp)->di_aformat = (n))) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXTENTS) +int xfs_cfork_nextents_disk(xfs_dinode_core_t *dcp, int w); +int xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w); +#define XFS_CFORK_NEXTENTS_DISK(dcp,w) xfs_cfork_nextents_disk(dcp,w) +#define XFS_CFORK_NEXTENTS(dcp,w) xfs_cfork_nextents(dcp,w) +#else +#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \ + ((w) == XFS_DATA_FORK ? \ + INT_GET((dcp)->di_nextents, ARCH_CONVERT) : \ + INT_GET((dcp)->di_anextents, ARCH_CONVERT)) +#define XFS_CFORK_NEXTENTS(dcp,w) \ + ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents) + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXT_SET) +void xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n); +#define XFS_CFORK_NEXT_SET(dcp,w,n) xfs_cfork_next_set(dcp,w,n) +#else +#define XFS_CFORK_NEXT_SET(dcp,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((dcp)->di_nextents = (n)) : \ + ((dcp)->di_anextents = (n))) + +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXTENTS) +int xfs_dfork_nextents(xfs_dinode_t *dip, int w); +#define XFS_DFORK_NEXTENTS(dip,w) xfs_dfork_nextents(dip,w) +#else +#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE) +xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp); +#define XFS_BUF_TO_DINODE(bp) xfs_buf_to_dinode(bp) +#else +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)(XFS_BUF_PTR(bp))) +#endif + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. + */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS) + +#endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c new file mode 100644 index 00000000000..ba30bc7682f --- /dev/null +++ b/fs/xfs/xfs_dir.c @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_error.h" + +/* + * xfs_dir.c + * + * Provide the external interfaces to manage directories. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Functions for the dirops interfaces. + */ +static void xfs_dir_mount(struct xfs_mount *mp); + +static int xfs_dir_isempty(struct xfs_inode *dp); + +static int xfs_dir_init(struct xfs_trans *trans, + struct xfs_inode *dir, + struct xfs_inode *parent_dir); + +static int xfs_dir_createname(struct xfs_trans *trans, + struct xfs_inode *dp, + char *name_string, + int name_len, + xfs_ino_t inode_number, + xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, + xfs_extlen_t total); + +static int xfs_dir_lookup(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name_string, + int name_length, + xfs_ino_t *inode_number); + +static int xfs_dir_removename(struct xfs_trans *trans, + struct xfs_inode *dp, + char *name_string, + int name_length, + xfs_ino_t ino, + xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, + xfs_extlen_t total); + +static int xfs_dir_getdents(struct xfs_trans *tp, + struct xfs_inode *dp, + struct uio *uiop, + int *eofp); + +static int xfs_dir_replace(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name_string, + int name_length, + xfs_ino_t inode_number, + xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, + xfs_extlen_t total); + +static int xfs_dir_canenter(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name_string, + int name_length); + +static int xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, + xfs_dinode_t *dip); + +xfs_dirops_t xfsv1_dirops = { + .xd_mount = xfs_dir_mount, + .xd_isempty = xfs_dir_isempty, + .xd_init = xfs_dir_init, + .xd_createname = xfs_dir_createname, + .xd_lookup = xfs_dir_lookup, + .xd_removename = xfs_dir_removename, + .xd_getdents = xfs_dir_getdents, + .xd_replace = xfs_dir_replace, + .xd_canenter = xfs_dir_canenter, + .xd_shortform_validate_ondisk = xfs_dir_shortform_validate_ondisk, + .xd_shortform_to_single = xfs_dir_shortform_to_leaf, +}; + +/* + * Internal routines when dirsize == XFS_LBSIZE(mp). + */ +STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args); +STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries, + int *total_namebytes); +STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, + uio_t *uio, int *eofp, + xfs_dirent_t *dbp, + xfs_dir_put_t put); +STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args); + +/* + * Internal routines when dirsize > XFS_LBSIZE(mp). + */ +STATIC int xfs_dir_node_addname(xfs_da_args_t *args); +STATIC int xfs_dir_node_lookup(xfs_da_args_t *args); +STATIC int xfs_dir_node_removename(xfs_da_args_t *args); +STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, + uio_t *uio, int *eofp, + xfs_dirent_t *dbp, + xfs_dir_put_t put); +STATIC int xfs_dir_node_replace(xfs_da_args_t *args); + +#if defined(XFS_DIR_TRACE) +ktrace_t *xfs_dir_trace_buf; +#endif + + +/*======================================================================== + * Overall external interface routines. + *========================================================================*/ + +xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +/* + * One-time startup routine called from xfs_init(). + */ +void +xfs_dir_startup(void) +{ + xfs_dir_hash_dot = xfs_da_hashname(".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); +} + +/* + * Initialize directory-related fields in the mount structure. + */ +static void +xfs_dir_mount(xfs_mount_t *mp) +{ + uint shortcount, leafcount, count; + + mp->m_dirversion = 1; + shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) / + (uint)sizeof(xfs_dir_sf_entry_t); + leafcount = (XFS_LBSIZE(mp) - (uint)sizeof(xfs_dir_leaf_hdr_t)) / + ((uint)sizeof(xfs_dir_leaf_entry_t) + + (uint)sizeof(xfs_dir_leaf_name_t)); + count = shortcount > leafcount ? shortcount : leafcount; + mp->m_dircook_elog = xfs_da_log2_roundup(count + 1); + ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog); + mp->m_dir_node_ents = mp->m_attr_node_ents = + (XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100; + mp->m_dirblksize = mp->m_sb.sb_blocksize; + mp->m_dirblkfsbs = 1; +} + +/* + * Return 1 if directory contains only "." and "..". + */ +static int +xfs_dir_isempty(xfs_inode_t *dp) +{ + xfs_dir_sf_hdr_t *hdr; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if (dp->i_d.di_size == 0) + return(1); + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return(0); + hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data; + return(hdr->count == 0); +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +static int +xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir) +{ + xfs_da_args_t args; + int error; + + memset((char *)&args, 0, sizeof(args)); + args.dp = dir; + args.trans = trans; + + ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino))) + return error; + + return(xfs_dir_shortform_create(&args, parent_dir->i_ino)); +} + +/* + * Generic handler routine to add a name to a directory. + * Transitions directory from shortform to Btree as necessary. + */ +static int /* error */ +xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name, + int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, xfs_extlen_t total) +{ + xfs_da_args_t args; + int retval, newsize, done; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum))) + return (retval); + + XFS_STATS_INC(xs_dir_create); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = inum; + args.dp = dp; + args.firstblock = firstblock; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = 0; + args.addname = args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + */ + done = 0; + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen); + if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) { + retval = xfs_dir_shortform_addname(&args); + done = 1; + } else { + if (total == 0) + return XFS_ERROR(ENOSPC); + retval = xfs_dir_shortform_to_leaf(&args); + done = retval != 0; + } + } + if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_addname(&args); + done = retval != ENOSPC; + if (!done) { + if (total == 0) + return XFS_ERROR(ENOSPC); + retval = xfs_dir_leaf_to_node(&args); + done = retval != 0; + } + } + if (!done) { + retval = xfs_dir_node_addname(&args); + } + return(retval); +} + +/* + * Generic handler routine to check if a name can be added to a directory, + * without adding any blocks to the directory. + */ +static int /* error */ +xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen) +{ + xfs_da_args_t args; + int retval, newsize; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = 0; + args.dp = dp; + args.firstblock = NULL; + args.flist = NULL; + args.total = 0; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen); + if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) + retval = 0; + else + retval = XFS_ERROR(ENOSPC); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_addname(&args); + } else { + retval = xfs_dir_node_addname(&args); + } + return(retval); +} + +/* + * Generic handler routine to remove a name from a directory. + * Transitions directory from Btree to shortform as necessary. + */ +static int /* error */ +xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name, + int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, xfs_extlen_t total) +{ + xfs_da_args_t args; + int count, totallen, newsize, retval; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_remove); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = ino; + args.dp = dp; + args.firstblock = firstblock; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = args.oknoent = 0; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_removename(&args); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_removename(&args, &count, &totallen); + if (retval == 0) { + newsize = XFS_DIR_SF_ALLFIT(count, totallen); + if (newsize <= XFS_IFORK_DSIZE(dp)) { + retval = xfs_dir_leaf_to_shortform(&args); + } + } + } else { + retval = xfs_dir_node_removename(&args); + } + return(retval); +} + +static int /* error */ +xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen, + xfs_ino_t *inum) +{ + xfs_da_args_t args; + int retval; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + XFS_STATS_INC(xs_dir_lookup); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = 0; + args.dp = dp; + args.firstblock = NULL; + args.flist = NULL; + args.total = 0; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = 0; + args.oknoent = 1; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_lookup(&args); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_lookup(&args); + } else { + retval = xfs_dir_node_lookup(&args); + } + if (retval == EEXIST) + retval = 0; + *inum = args.inumber; + return(retval); +} + +/* + * Implement readdir. + */ +static int /* error */ +xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp) +{ + xfs_dirent_t *dbp; + int alignment, retval; + xfs_dir_put_t put; + + XFS_STATS_INC(xs_dir_getdents); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + /* + * If our caller has given us a single contiguous memory buffer, + * just work directly within that buffer. If it's in user memory, + * lock it down first. + */ + alignment = sizeof(xfs_off_t) - 1; + if ((uio->uio_iovcnt == 1) && + (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) && + ((uio->uio_iov[0].iov_len & alignment) == 0)) { + dbp = NULL; + put = xfs_dir_put_dirent64_direct; + } else { + dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP); + put = xfs_dir_put_dirent64_uio; + } + + /* + * Decide on what work routines to call based on the inode size. + */ + *eofp = 0; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put); + } else { + retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put); + } + if (dbp != NULL) + kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN); + + return(retval); +} + +static int /* error */ +xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen, + xfs_ino_t inum, xfs_fsblock_t *firstblock, + xfs_bmap_free_t *flist, xfs_extlen_t total) +{ + xfs_da_args_t args; + int retval; + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum))) + return retval; + + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = inum; + args.dp = dp; + args.firstblock = firstblock; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = trans; + args.justcheck = args.addname = args.oknoent = 0; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + retval = xfs_dir_shortform_replace(&args); + } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) { + retval = xfs_dir_leaf_replace(&args); + } else { + retval = xfs_dir_node_replace(&args); + } + + return(retval); +} + +static int +xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp) +{ + xfs_ino_t ino; + int namelen_sum; + int count; + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + int i; + + + + if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) { + return 0; + } + if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) { + return 0; + } + if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) { + xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p", + dp); + return 1; + } + sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf); + ino = XFS_GET_DIR_INO8(sf->hdr.parent); + if (xfs_dir_ino_validate(mp, ino)) + return 1; + + count = sf->hdr.count; + if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) { + xfs_fs_cmn_err(CE_WARN, mp, + "Invalid shortform count: dp 0x%p", dp); + return(1); + } + + if (count == 0) { + return 0; + } + + namelen_sum = 0; + sfe = &sf->list[0]; + for (i = sf->hdr.count - 1; i >= 0; i--) { + ino = XFS_GET_DIR_INO8(sfe->inumber); + xfs_dir_ino_validate(mp, ino); + if (sfe->namelen >= XFS_LITINO(mp)) { + xfs_fs_cmn_err(CE_WARN, mp, + "Invalid shortform namelen: dp 0x%p", dp); + return 1; + } + namelen_sum += sfe->namelen; + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + if (namelen_sum >= XFS_LITINO(mp)) { + xfs_fs_cmn_err(CE_WARN, mp, + "Invalid shortform namelen: dp 0x%p", dp); + return 1; + } + + return 0; +} + +/*======================================================================== + * External routines when dirsize == XFS_LBSIZE(dp->i_mount). + *========================================================================*/ + +/* + * Add a name to the leaf directory structure + * This is the external routine. + */ +int +xfs_dir_leaf_addname(xfs_da_args_t *args) +{ + int index, retval; + xfs_dabuf_t *bp; + + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + if (retval == ENOENT) + retval = xfs_dir_leaf_add(bp, args, index); + xfs_da_buf_done(bp); + return(retval); +} + +/* + * Remove a name from the leaf directory structure + * This is the external routine. + */ +STATIC int +xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen) +{ + xfs_dir_leafblock_t *leaf; + int index, retval; + xfs_dabuf_t *bp; + + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + if (retval == EEXIST) { + (void)xfs_dir_leaf_remove(args->trans, bp, index); + *count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + *totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); + retval = 0; + } + xfs_da_buf_done(bp); + return(retval); +} + +/* + * Look up a name in a leaf directory structure. + * This is the external routine. + */ +STATIC int +xfs_dir_leaf_lookup(xfs_da_args_t *args) +{ + int index, retval; + xfs_dabuf_t *bp; + + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + xfs_da_brelse(args->trans, bp); + return(retval); +} + +/* + * Copy out directory entries for getdents(), for leaf directories. + */ +STATIC int +xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, + int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put) +{ + xfs_dabuf_t *bp; + int retval, eob; + + retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1); + xfs_da_brelse(trans, bp); + *eofp = (eob == 0); + return(retval); +} + +/* + * Look up a name in a leaf directory structure, replace the inode number. + * This is the external routine. + */ +STATIC int +xfs_dir_leaf_replace(xfs_da_args_t *args) +{ + int index, retval; + xfs_dabuf_t *bp; + xfs_ino_t inum; + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + + inum = args->inumber; + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + retval = xfs_dir_leaf_lookup_int(bp, args, &index); + if (retval == EEXIST) { + leaf = bp->data; + entry = &leaf->entries[index]; + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + /* XXX - replace assert? */ + XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber))); + xfs_da_buf_done(bp); + retval = 0; + } else + xfs_da_brelse(args->trans, bp); + return(retval); +} + + +/*======================================================================== + * External routines when dirsize > XFS_LBSIZE(mp). + *========================================================================*/ + +/* + * Add a name to a Btree-format directory. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_dir_node_addname(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int retval, error; + + /* + * Fill in bucket of arguments/results/context to carry around. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_dir_node_ents; + + /* + * Search to see if name already exists, and get back a pointer + * to where it should go. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) + retval = error; + if (retval != ENOENT) + goto error; + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC); + retval = xfs_dir_leaf_add(blk->bp, args, blk->index); + if (retval == 0) { + /* + * Addition succeeded, update Btree hashvals. + */ + if (!args->justcheck) + xfs_da_fixhashpath(state, &state->path); + } else { + /* + * Addition failed, split as many Btree elements as required. + */ + if (args->total == 0) { + ASSERT(retval == ENOSPC); + goto error; + } + retval = xfs_da_split(state); + } +error: + xfs_da_state_free(state); + + return(retval); +} + +/* + * Remove a name from a B-tree directory. + * + * This will involve walking down the Btree, and may involve joining + * leaf nodes and even joining intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_dir_node_removename(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int retval, error; + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_dir_node_ents; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) + retval = error; + if (retval != EEXIST) { + xfs_da_state_free(state); + return(retval); + } + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC); + retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index); + xfs_da_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + error = 0; + if (retval) { + error = xfs_da_join(state); + } + + xfs_da_state_free(state); + if (error) + return(error); + return(0); +} + +/* + * Look up a filename in a int directory. + * Use an internal routine to actually do all the work. + */ +STATIC int +xfs_dir_node_lookup(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + int retval, error, i; + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_dir_node_ents; + + /* + * Search to see if name exists, + * and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) { + retval = error; + } + + /* + * If not in a transaction, we have to release all the buffers. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return(retval); +} + +STATIC int +xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, + int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put) +{ + xfs_da_intnode_t *node; + xfs_da_node_entry_t *btree; + xfs_dir_leafblock_t *leaf = NULL; + xfs_dablk_t bno, nextbno; + xfs_dahash_t cookhash; + xfs_mount_t *mp; + int error, eob, i; + xfs_dabuf_t *bp; + xfs_daddr_t nextda; + + /* + * Pick up our context. + */ + mp = dp->i_mount; + bp = NULL; + bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset); + cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset); + + xfs_dir_trace_g_du("node: start", dp, uio); + + /* + * Re-find our place, even if we're confused about what our place is. + * + * First we check the block number from the magic cookie, it is a + * cache of where we ended last time. If we find a leaf block, and + * the starting hashval in that block is less than our desired + * hashval, then we run with it. + */ + if (bno > 0) { + error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) + leaf = bp->data; + if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) { + xfs_dir_trace_g_dub("node: block not a leaf", + dp, uio, bno); + xfs_da_brelse(trans, bp); + bp = NULL; + } + if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) { + xfs_dir_trace_g_dub("node: leaf hash too large", + dp, uio, bno); + xfs_da_brelse(trans, bp); + bp = NULL; + } + if (bp && + cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) { + xfs_dir_trace_g_dub("node: leaf hash too small", + dp, uio, bno); + xfs_da_brelse(trans, bp); + bp = NULL; + } + } + + /* + * If we did not find a leaf block from the blockno in the cookie, + * or we there was no blockno in the cookie (eg: first time thru), + * the we start at the top of the Btree and re-find our hashval. + */ + if (bp == NULL) { + xfs_dir_trace_g_du("node: start at root" , dp, uio); + bno = 0; + for (;;) { + error = xfs_da_read_buf(trans, dp, bno, -1, &bp, + XFS_DATA_FORK); + if (error) + return(error); + if (bp == NULL) + return(XFS_ERROR(EFSCORRUPTED)); + node = bp->data; + if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC) + break; + btree = &node->btree[0]; + xfs_dir_trace_g_dun("node: node detail", dp, uio, node); + for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) { + if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) { + bno = INT_GET(btree->before, ARCH_CONVERT); + break; + } + } + if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) { + xfs_da_brelse(trans, bp); + xfs_dir_trace_g_du("node: hash beyond EOF", + dp, uio); + uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, + XFS_DA_MAXHASH); + *eofp = 1; + return(0); + } + xfs_dir_trace_g_dub("node: going to block", + dp, uio, bno); + xfs_da_brelse(trans, bp); + } + } + ASSERT(cookhash != XFS_DA_MAXHASH); + + /* + * We've dropped down to the (first) leaf block that contains the + * hashval we are interested in. Continue rolling upward thru the + * leaf blocks until we fill up our buffer. + */ + for (;;) { + leaf = bp->data; + if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC)) { + xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf); + xfs_da_brelse(trans, bp); + XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)", + XFS_ERRLEVEL_LOW, mp, leaf); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf); + if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) { + nextda = xfs_da_reada_buf(trans, dp, nextbno, + XFS_DATA_FORK); + } else + nextda = -1; + error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp, + put, nextda); + xfs_da_brelse(trans, bp); + bno = nextbno; + if (eob) { + xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno); + *eofp = 0; + return(error); + } + if (bno == 0) + break; + error = xfs_da_read_buf(trans, dp, bno, nextda, &bp, + XFS_DATA_FORK); + if (error) + return(error); + if (unlikely(bp == NULL)) { + XFS_ERROR_REPORT("xfs_dir_node_getdents(2)", + XFS_ERRLEVEL_LOW, mp); + return(XFS_ERROR(EFSCORRUPTED)); + } + } + *eofp = 1; + xfs_dir_trace_g_du("node: E-O-F", dp, uio); + return(0); +} + +/* + * Look up a filename in an int directory, replace the inode number. + * Use an internal routine to actually do the lookup. + */ +STATIC int +xfs_dir_node_replace(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + xfs_ino_t inum; + int retval, error, i; + xfs_dabuf_t *bp; + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_sb.sb_blocksize; + state->node_ents = state->mp->m_dir_node_ents; + inum = args->inumber; + + /* + * Search to see if name exists, + * and get back a pointer to it. + */ + error = xfs_da_node_lookup_int(state, &retval); + if (error) { + retval = error; + } + + if (retval == EEXIST) { + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC); + bp = blk->bp; + leaf = bp->data; + entry = &leaf->entries[blk->index]; + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + /* XXX - replace assert ? */ + XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber))); + xfs_da_buf_done(bp); + blk->bp = NULL; + retval = 0; + } else { + i = state->path.active - 1; + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + for (i = 0; i < state->path.active - 1; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return(retval); +} + +#if defined(XFS_DIR_TRACE) +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where, + (void *)dp, (void *)dp->i_mount, + (void *)((unsigned long)(uio->uio_offset >> 32)), + (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)), + (void *)(unsigned long)uio->uio_resid, + NULL, NULL, NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where, + (void *)dp, (void *)dp->i_mount, + (void *)((unsigned long)(uio->uio_offset >> 32)), + (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)), + (void *)(unsigned long)uio->uio_resid, + (void *)(unsigned long)bno, + NULL, NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio, + xfs_da_intnode_t *node) +{ + int last = INT_GET(node->hdr.count, ARCH_CONVERT) - 1; + + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where, + (void *)dp, (void *)dp->i_mount, + (void *)((unsigned long)(uio->uio_offset >> 32)), + (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)), + (void *)(unsigned long)uio->uio_resid, + (void *)(unsigned long) + INT_GET(node->hdr.info.forw, ARCH_CONVERT), + (void *)(unsigned long) + INT_GET(node->hdr.count, ARCH_CONVERT), + (void *)(unsigned long) + INT_GET(node->btree[0].hashval, ARCH_CONVERT), + (void *)(unsigned long) + INT_GET(node->btree[last].hashval, ARCH_CONVERT), + NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio, + xfs_dir_leafblock_t *leaf) +{ + int last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1; + + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where, + (void *)dp, (void *)dp->i_mount, + (void *)((unsigned long)(uio->uio_offset >> 32)), + (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)), + (void *)(unsigned long)uio->uio_resid, + (void *)(unsigned long) + INT_GET(leaf->hdr.info.forw, ARCH_CONVERT), + (void *)(unsigned long) + INT_GET(leaf->hdr.count, ARCH_CONVERT), + (void *)(unsigned long) + INT_GET(leaf->entries[0].hashval, ARCH_CONVERT), + (void *)(unsigned long) + INT_GET(leaf->entries[last].hashval, ARCH_CONVERT), + NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio, + xfs_dir_leaf_entry_t *entry) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where, + (void *)dp, (void *)dp->i_mount, + (void *)((unsigned long)(uio->uio_offset >> 32)), + (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)), + (void *)(unsigned long)uio->uio_resid, + (void *)(unsigned long) + INT_GET(entry->hashval, ARCH_CONVERT), + NULL, NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for an inode and a uio. + */ +void +xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie) +{ + xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where, + (void *)dp, (void *)dp->i_mount, + (void *)((unsigned long)(uio->uio_offset >> 32)), + (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)), + (void *)(unsigned long)uio->uio_resid, + (void *)((unsigned long)(cookie >> 32)), + (void *)((unsigned long)(cookie & 0xFFFFFFFF)), + NULL, NULL, NULL, NULL, NULL); +} + +/* + * Add a trace buffer entry for the arguments given to the routine, + * generic form. + */ +void +xfs_dir_trace_enter(int type, char *where, + void * a0, void * a1, + void * a2, void * a3, + void * a4, void * a5, + void * a6, void * a7, + void * a8, void * a9, + void * a10, void * a11) +{ + ASSERT(xfs_dir_trace_buf); + ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type, + (void *)where, + (void *)a0, (void *)a1, (void *)a2, + (void *)a3, (void *)a4, (void *)a5, + (void *)a6, (void *)a7, (void *)a8, + (void *)a9, (void *)a10, (void *)a11, + NULL, NULL); +} +#endif /* XFS_DIR_TRACE */ diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h new file mode 100644 index 00000000000..4dbc9f54cca --- /dev/null +++ b/fs/xfs/xfs_dir.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR_H__ +#define __XFS_DIR_H__ + +/* + * Large directories are structured around Btrees where all the data + * elements are in the leaf nodes. Filenames are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of a filename may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + * + * Small directories use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +struct uio; +struct xfs_bmap_free; +struct xfs_da_args; +struct xfs_dinode; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Directory function types. + * Put in structures (xfs_dirops_t) for v1 and v2 directories. + */ +typedef void (*xfs_dir_mount_t)(struct xfs_mount *mp); +typedef int (*xfs_dir_isempty_t)(struct xfs_inode *dp); +typedef int (*xfs_dir_init_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_inode *pdp); +typedef int (*xfs_dir_createname_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name, + int namelen, + xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, + xfs_extlen_t total); +typedef int (*xfs_dir_lookup_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name, + int namelen, + xfs_ino_t *inum); +typedef int (*xfs_dir_removename_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name, + int namelen, + xfs_ino_t ino, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, + xfs_extlen_t total); +typedef int (*xfs_dir_getdents_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + struct uio *uio, + int *eofp); +typedef int (*xfs_dir_replace_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name, + int namelen, + xfs_ino_t inum, + xfs_fsblock_t *first, + struct xfs_bmap_free *flist, + xfs_extlen_t total); +typedef int (*xfs_dir_canenter_t)(struct xfs_trans *tp, + struct xfs_inode *dp, + char *name, + int namelen); +typedef int (*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp, + struct xfs_dinode *dip); +typedef int (*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args); + +typedef struct xfs_dirops { + xfs_dir_mount_t xd_mount; + xfs_dir_isempty_t xd_isempty; + xfs_dir_init_t xd_init; + xfs_dir_createname_t xd_createname; + xfs_dir_lookup_t xd_lookup; + xfs_dir_removename_t xd_removename; + xfs_dir_getdents_t xd_getdents; + xfs_dir_replace_t xd_replace; + xfs_dir_canenter_t xd_canenter; + xfs_dir_shortform_validate_ondisk_t xd_shortform_validate_ondisk; + xfs_dir_shortform_to_single_t xd_shortform_to_single; +} xfs_dirops_t; + +/* + * Overall external interface routines. + */ +void xfs_dir_startup(void); /* called exactly once */ + +#define XFS_DIR_MOUNT(mp) \ + ((mp)->m_dirops.xd_mount(mp)) +#define XFS_DIR_ISEMPTY(mp,dp) \ + ((mp)->m_dirops.xd_isempty(dp)) +#define XFS_DIR_INIT(mp,tp,dp,pdp) \ + ((mp)->m_dirops.xd_init(tp,dp,pdp)) +#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \ + ((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\ + total)) +#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum) \ + ((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum)) +#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \ + ((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total)) +#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp) \ + ((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp)) +#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total) \ + ((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total)) +#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \ + ((mp)->m_dirops.xd_canenter(tp,dp,name,namelen)) +#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip) \ + ((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip)) +#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args) \ + ((mp)->m_dirops.xd_shortform_to_single(args)) + +#define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1) +extern xfs_dirops_t xfsv1_dirops; + +#endif /* __XFS_DIR_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c new file mode 100644 index 00000000000..49fc0a3695a --- /dev/null +++ b/fs/xfs/xfs_dir2.c @@ -0,0 +1,859 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * XFS v2 directory implmentation. + * Top-level and utility routines. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_dir2_trace.h" +#include "xfs_error.h" +#include "xfs_bit.h" + +/* + * Declarations for interface routines. + */ +static void xfs_dir2_mount(xfs_mount_t *mp); +static int xfs_dir2_isempty(xfs_inode_t *dp); +static int xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp, + xfs_inode_t *pdp); +static int xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp, + char *name, int namelen, xfs_ino_t inum, + xfs_fsblock_t *first, + xfs_bmap_free_t *flist, xfs_extlen_t total); +static int xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name, + int namelen, xfs_ino_t *inum); +static int xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp, + char *name, int namelen, xfs_ino_t ino, + xfs_fsblock_t *first, + xfs_bmap_free_t *flist, xfs_extlen_t total); +static int xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio, + int *eofp); +static int xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name, + int namelen, xfs_ino_t inum, + xfs_fsblock_t *first, xfs_bmap_free_t *flist, + xfs_extlen_t total); +static int xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name, + int namelen); +static int xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp, + xfs_dinode_t *dip); + +/* + * Utility routine declarations. + */ +static int xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa); +static int xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa); + +/* + * Directory operations vector. + */ +xfs_dirops_t xfsv2_dirops = { + .xd_mount = xfs_dir2_mount, + .xd_isempty = xfs_dir2_isempty, + .xd_init = xfs_dir2_init, + .xd_createname = xfs_dir2_createname, + .xd_lookup = xfs_dir2_lookup, + .xd_removename = xfs_dir2_removename, + .xd_getdents = xfs_dir2_getdents, + .xd_replace = xfs_dir2_replace, + .xd_canenter = xfs_dir2_canenter, + .xd_shortform_validate_ondisk = xfs_dir2_shortform_validate_ondisk, + .xd_shortform_to_single = xfs_dir2_sf_to_block, +}; + +/* + * Interface routines. + */ + +/* + * Initialize directory-related fields in the mount structure. + */ +static void +xfs_dir2_mount( + xfs_mount_t *mp) /* filesystem mount point */ +{ + mp->m_dirversion = 2; + ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= + XFS_MAX_BLOCKSIZE); + mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); + mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; + mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp)); + mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); + mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp)); + mp->m_attr_node_ents = + (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_node_ents = + (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; +} + +/* + * Return 1 if directory contains only "." and "..". + */ +static int /* return code */ +xfs_dir2_isempty( + xfs_inode_t *dp) /* incore inode structure */ +{ + xfs_dir2_sf_t *sfp; /* shortform directory structure */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + /* + * Might happen during shutdown. + */ + if (dp->i_d.di_size == 0) { + return 1; + } + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return 0; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + return !sfp->hdr.count; +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +static int /* error */ +xfs_dir2_init( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + xfs_inode_t *pdp) /* incore parent directory inode */ +{ + xfs_da_args_t args; /* operation arguments */ + int error; /* error return value */ + + memset((char *)&args, 0, sizeof(args)); + args.dp = dp; + args.trans = tp; + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) { + return error; + } + return xfs_dir2_sf_create(&args, pdp->i_ino); +} + +/* + Enter a name in a directory. + */ +static int /* error */ +xfs_dir2_createname( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + char *name, /* new entry name */ + int namelen, /* new entry name length */ + xfs_ino_t inum, /* new entry inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; /* operation arguments */ + int rval; /* return value */ + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) { + return rval; + } + XFS_STATS_INC(xs_dir_create); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = inum; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.justcheck = 0; + args.addname = args.oknoent = 1; + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_addname(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_block_addname(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_leaf_addname(&args); + else + rval = xfs_dir2_node_addname(&args); + return rval; +} + +/* + * Lookup a name in a directory, give back the inode number. + */ +static int /* error */ +xfs_dir2_lookup( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + char *name, /* lookup name */ + int namelen, /* lookup name length */ + xfs_ino_t *inum) /* out: inode number */ +{ + xfs_da_args_t args; /* operation arguments */ + int rval; /* return value */ + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_lookup); + + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = 0; + args.dp = dp; + args.firstblock = NULL; + args.flist = NULL; + args.total = 0; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.justcheck = args.addname = 0; + args.oknoent = 1; + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_lookup(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_block_lookup(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_leaf_lookup(&args); + else + rval = xfs_dir2_node_lookup(&args); + if (rval == EEXIST) + rval = 0; + if (rval == 0) + *inum = args.inumber; + return rval; +} + +/* + * Remove an entry from a directory. + */ +static int /* error */ +xfs_dir2_removename( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + char *name, /* name of entry to remove */ + int namelen, /* name length of entry to remove */ + xfs_ino_t ino, /* inode number of entry to remove */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; /* operation arguments */ + int rval; /* return value */ + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_remove); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = ino; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.justcheck = args.addname = args.oknoent = 0; + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_removename(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_block_removename(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_leaf_removename(&args); + else + rval = xfs_dir2_node_removename(&args); + return rval; +} + +/* + * Read a directory. + */ +static int /* error */ +xfs_dir2_getdents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + uio_t *uio, /* caller's buffer control */ + int *eofp) /* out: eof reached */ +{ + int alignment; /* alignment required for ABI */ + xfs_dirent_t *dbp; /* malloc'ed buffer */ + xfs_dir2_put_t put; /* entry formatting routine */ + int rval; /* return value */ + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + XFS_STATS_INC(xs_dir_getdents); + /* + * If our caller has given us a single contiguous aligned memory buffer, + * just work directly within that buffer. If it's in user memory, + * lock it down first. + */ + alignment = sizeof(xfs_off_t) - 1; + if ((uio->uio_iovcnt == 1) && + (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) && + ((uio->uio_iov[0].iov_len & alignment) == 0)) { + dbp = NULL; + put = xfs_dir2_put_dirent64_direct; + } else { + dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP); + put = xfs_dir2_put_dirent64_uio; + } + + *eofp = 0; + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) { + ; + } else if (v) + rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put); + else + rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put); + if (dbp != NULL) + kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN); + return rval; +} + +/* + * Replace the inode number of a directory entry. + */ +static int /* error */ +xfs_dir2_replace( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + char *name, /* name of entry to replace */ + int namelen, /* name length of entry to replace */ + xfs_ino_t inum, /* new inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + xfs_da_args_t args; /* operation arguments */ + int rval; /* return value */ + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + + if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) { + return rval; + } + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = inum; + args.dp = dp; + args.firstblock = first; + args.flist = flist; + args.total = total; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.justcheck = args.addname = args.oknoent = 0; + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_replace(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_block_replace(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_leaf_replace(&args); + else + rval = xfs_dir2_node_replace(&args); + return rval; +} + +/* + * See if this entry can be added to the directory without allocating space. + */ +static int /* error */ +xfs_dir2_canenter( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + char *name, /* name of entry to add */ + int namelen) /* name length of entry to add */ +{ + xfs_da_args_t args; /* operation arguments */ + int rval; /* return value */ + int v; /* type-checking value */ + + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + /* + * Fill in the arg structure for this request. + */ + args.name = name; + args.namelen = namelen; + args.hashval = xfs_da_hashname(name, namelen); + args.inumber = 0; + args.dp = dp; + args.firstblock = NULL; + args.flist = NULL; + args.total = 0; + args.whichfork = XFS_DATA_FORK; + args.trans = tp; + args.justcheck = args.addname = args.oknoent = 1; + /* + * Decide on what work routines to call based on the inode size. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_addname(&args); + else if ((rval = xfs_dir2_isblock(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_block_addname(&args); + else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) { + return rval; + } else if (v) + rval = xfs_dir2_leaf_addname(&args); + else + rval = xfs_dir2_node_addname(&args); + return rval; +} + +/* + * Dummy routine for shortform inode validation. + * Can't really do this. + */ +/* ARGSUSED */ +static int /* error */ +xfs_dir2_shortform_validate_ondisk( + xfs_mount_t *mp, /* filesystem mount point */ + xfs_dinode_t *dip) /* ondisk inode */ +{ + return 0; +} + +/* + * Utility routines. + */ + +/* + * Add a block to the directory. + * This routine is for data and free blocks, not leaf/node blocks + * which are handled by xfs_da_grow_inode. + */ +int /* error */ +xfs_dir2_grow_inode( + xfs_da_args_t *args, /* operation arguments */ + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ +{ + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int got; /* blocks actually mapped */ + int i; /* temp mapping index */ + xfs_bmbt_irec_t map; /* single structure for bmap */ + int mapi; /* mapping index */ + xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */ + xfs_mount_t *mp; /* filesystem mount point */ + int nmap; /* number of bmap entries */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_s("grow_inode", args, space); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Set lowest possible block in the space requested. + */ + bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); + count = mp->m_dirblkfsbs; + /* + * Find the first hole for our block. + */ + if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) { + return error; + } + nmap = 1; + ASSERT(args->firstblock != NULL); + /* + * Try mapping the new block contiguously (one extent). + */ + if ((error = xfs_bmapi(tp, dp, bno, count, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist))) { + return error; + } + ASSERT(nmap <= 1); + /* + * Got it in 1. + */ + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } + /* + * Didn't work and this is a multiple-fsb directory block. + * Try again with contiguous flag turned on. + */ + else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; /* current file offset */ + + /* + * Space for maximum number of mappings. + */ + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + /* + * Iterate until we get to the end of our block. + */ + for (b = bno, mapi = 0; b < bno + count; ) { + int c; /* current fsb count */ + + /* + * Can't map more than MAX_NMAP at once. + */ + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(bno + count - b); + if ((error = xfs_bmapi(tp, dp, b, c, + XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist))) { + kmem_free(mapp, sizeof(*mapp) * count); + return error; + } + if (nmap < 1) + break; + /* + * Add this bunch into our table, go to the next offset. + */ + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } + /* + * Didn't work. + */ + else { + mapi = 0; + mapp = NULL; + } + /* + * See how many fsb's we got. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + /* + * Didn't get enough fsb's, or the first/last block's are wrong. + */ + if (got != count || mapp[0].br_startoff != bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + bno + count) { + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * count); + return XFS_ERROR(ENOSPC); + } + /* + * Done with the temporary mapping table. + */ + if (mapp != &map) + kmem_free(mapp, sizeof(*mapp) * count); + *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno); + /* + * Update file's size if this is the data space and it grew. + */ + if (space == XFS_DIR2_DATA_SPACE) { + xfs_fsize_t size; /* directory file (data) size */ + + size = XFS_FSB_TO_B(mp, bno + count); + if (size > dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * See if the directory is a single-block form directory. + */ +int /* error */ +xfs_dir2_isblock( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + xfs_mount_t *mp; /* filesystem mount point */ + int rval; /* return value */ + + mp = dp->i_mount; + if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) { + return rval; + } + rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; + ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + *vp = rval; + return 0; +} + +/* + * See if the directory is a single-leaf form directory. + */ +int /* error */ +xfs_dir2_isleaf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + int *vp) /* out: 1 is leaf, 0 is not leaf */ +{ + xfs_fileoff_t last; /* last file offset */ + xfs_mount_t *mp; /* filesystem mount point */ + int rval; /* return value */ + + mp = dp->i_mount; + if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) { + return rval; + } + *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + return 0; +} + +/* + * Getdents put routine for 64-bit ABI, direct form. + */ +static int /* error */ +xfs_dir2_put_dirent64_direct( + xfs_dir2_put_args_t *pa) /* argument bundle */ +{ + xfs_dirent_t *idbp; /* dirent pointer */ + iovec_t *iovp; /* io vector */ + int namelen; /* entry name length */ + int reclen; /* entry total length */ + uio_t *uio; /* I/O control */ + + namelen = pa->namelen; + reclen = DIRENTSIZE(namelen); + uio = pa->uio; + /* + * Won't fit in the remaining space. + */ + if (reclen > uio->uio_resid) { + pa->done = 0; + return 0; + } + iovp = uio->uio_iov; + idbp = (xfs_dirent_t *)iovp->iov_base; + iovp->iov_base = (char *)idbp + reclen; + iovp->iov_len -= reclen; + uio->uio_resid -= reclen; + idbp->d_reclen = reclen; + idbp->d_ino = pa->ino; + idbp->d_off = pa->cook; + idbp->d_name[namelen] = '\0'; + pa->done = 1; + memcpy(idbp->d_name, pa->name, namelen); + return 0; +} + +/* + * Getdents put routine for 64-bit ABI, uio form. + */ +static int /* error */ +xfs_dir2_put_dirent64_uio( + xfs_dir2_put_args_t *pa) /* argument bundle */ +{ + xfs_dirent_t *idbp; /* dirent pointer */ + int namelen; /* entry name length */ + int reclen; /* entry total length */ + int rval; /* return value */ + uio_t *uio; /* I/O control */ + + namelen = pa->namelen; + reclen = DIRENTSIZE(namelen); + uio = pa->uio; + /* + * Won't fit in the remaining space. + */ + if (reclen > uio->uio_resid) { + pa->done = 0; + return 0; + } + idbp = pa->dbp; + idbp->d_reclen = reclen; + idbp->d_ino = pa->ino; + idbp->d_off = pa->cook; + idbp->d_name[namelen] = '\0'; + memcpy(idbp->d_name, pa->name, namelen); + rval = uio_read((caddr_t)idbp, reclen, uio); + pa->done = (rval == 0); + return rval; +} + +/* + * Remove the given block from the directory. + * This routine is used for data and free blocks, leaf/node are done + * by xfs_da_shrink_inode. + */ +int +xfs_dir2_shrink_inode( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_db_t db, /* directory block number */ + xfs_dabuf_t *bp) /* block's buffer */ +{ + xfs_fileoff_t bno; /* directory file offset */ + xfs_dablk_t da; /* directory file offset */ + int done; /* bunmap is finished */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_db("shrink_inode", args, db, bp); + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + da = XFS_DIR2_DB_TO_DA(mp, db); + /* + * Unmap the fsblock(s). + */ + if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, + &done))) { + /* + * ENOSPC actually can happen if we're in a removename with + * no space reservation, and the resulting block removal + * would cause a bmap btree split or conversion from extents + * to btree. This can only happen for un-fragmented + * directory blocks, since you need to be punching out + * the middle of an extent. + * In this case we need to leave the block in the file, + * and not binval it. + * So the block has to be in a consistent empty state + * and appropriately logged. + * We don't free up the buffer, the caller can tell it + * hasn't happened since it got an error back. + */ + return error; + } + ASSERT(done); + /* + * Invalidate the buffer from the transaction. + */ + xfs_da_binval(tp, bp); + /* + * If it's not a data block, we're done. + */ + if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + return 0; + /* + * If the block isn't the last one in the directory, we're done. + */ + if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0)) + return 0; + bno = da; + if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { + /* + * This can't really happen unless there's kernel corruption. + */ + return error; + } + if (db == mp->m_dirdatablk) + ASSERT(bno == 0); + else + ASSERT(bno > 0); + /* + * Set the size to the new last block. + */ + dp->i_d.di_size = XFS_FSB_TO_B(mp, bno); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h new file mode 100644 index 00000000000..8f4fc7f23bc --- /dev/null +++ b/fs/xfs/xfs_dir2.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_H__ +#define __XFS_DIR2_H__ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_put_args; +struct xfs_inode; +struct xfs_trans; + +/* + * Directory version 2. + * There are 4 possible formats: + * shortform + * single block - data with embedded leaf at the end + * multiple data blocks, single leaf+freeindex block + * data blocks, node&leaf blocks (btree), freeindex blocks + * + * The shortform format is in xfs_dir2_sf.h. + * The single block format is in xfs_dir2_block.h. + * The data block format is in xfs_dir2_data.h. + * The leaf and freeindex block formats are in xfs_dir2_leaf.h. + * Node blocks are the same as the other version, in xfs_da_btree.h. + */ + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * For getdents, argument struct for put routines. + */ +typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa); +typedef struct xfs_dir2_put_args { + xfs_off_t cook; /* cookie of (next) entry */ + xfs_intino_t ino; /* inode number */ + struct xfs_dirent *dbp; /* buffer pointer */ + char *name; /* directory entry name */ + int namelen; /* length of name */ + int done; /* output: set if value was stored */ + xfs_dir2_put_t put; /* put function ptr (i/o) */ + struct uio *uio; /* uio control structure */ +} xfs_dir2_put_args_t; + +#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2) +extern xfs_dirops_t xfsv2_dirops; + +/* + * Other interfaces used by the rest of the dir v2 code. + */ +extern int + xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); + +extern int + xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp); + +extern int + xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp); + +extern int + xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_dabuf *bp); + +#endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c new file mode 100644 index 00000000000..bc4c40fcd47 --- /dev/null +++ b/fs/xfs/xfs_dir2_block.c @@ -0,0 +1,1248 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_block.c + * XFS V2 directory implementation, single-block form. + * See xfs_dir2_block.h for the format. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_trace.h" +#include "xfs_error.h" + +/* + * Local function prototypes. + */ +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, + int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, + int *entno); +static int xfs_dir2_block_sort(const void *a, const void *b); + +/* + * Add an entry to a block directory. + */ +int /* error */ +xfs_dir2_block_addname( + xfs_da_args_t *args) /* directory op arguments */ +{ + xfs_dir2_data_free_t *bf; /* bestfree table in block */ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + int compact; /* need to compact leaf ents */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* directory inode */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + int error; /* error return value */ + xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */ + xfs_dahash_t hash; /* hash value of found entry */ + int high; /* high index for binary srch */ + int highstale; /* high stale index */ + int lfloghigh=0; /* last final leaf to log */ + int lfloglow=0; /* first final leaf to log */ + int len; /* length of the new entry */ + int low; /* low index for binary srch */ + int lowstale; /* low stale index */ + int mid=0; /* midpoint for binary srch */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log header */ + int needscan; /* need to rescan freespace */ + xfs_dir2_data_off_t *tagp; /* pointer to tag value */ + xfs_trans_t *tp; /* transaction structure */ + + xfs_dir2_trace_args("block_addname", args); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the (one and only) directory block into dabuf bp. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + block = bp->data; + /* + * Check the magic number, corrupted if wrong. + */ + if (unlikely(INT_GET(block->hdr.magic, ARCH_CONVERT) + != XFS_DIR2_BLOCK_MAGIC)) { + XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", + XFS_ERRLEVEL_LOW, mp, block); + xfs_da_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + len = XFS_DIR2_DATA_ENTSIZE(args->namelen); + /* + * Set up pointers to parts of the block. + */ + bf = block->hdr.bestfree; + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + /* + * No stale entries? Need space for entry and new leaf. + */ + if (!btp->stale) { + /* + * Tag just before the first leaf entry. + */ + tagp = (xfs_dir2_data_off_t *)blp - 1; + /* + * Data object just before the first leaf entry. + */ + enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT)); + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG) + dup = enddup = NULL; + /* + * Check out the biggest freespace and see if it's the same one. + */ + else { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT)); + if (dup == enddup) { + /* + * It is the biggest freespace, is it too small + * to hold the new leaf too? + */ + if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) { + /* + * Yes, we use the second-largest + * entry instead if it works. + */ + if (INT_GET(bf[1].length, ARCH_CONVERT) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)block + + INT_GET(bf[1].offset, ARCH_CONVERT)); + else + dup = NULL; + } + } else { + /* + * Not the same free entry, + * just check its length. + */ + if (INT_GET(dup->length, ARCH_CONVERT) < len) { + dup = NULL; + } + } + } + compact = 0; + } + /* + * If there are stale entries we'll use one for the leaf. + * Is the biggest entry enough to avoid compaction? + */ + else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT)); + compact = 0; + } + /* + * Will need to compact to make this work. + */ + else { + /* + * Tag just before the first leaf entry. + */ + tagp = (xfs_dir2_data_off_t *)blp - 1; + /* + * Data object just before the first leaf entry. + */ + dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT)); + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) { + if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + compact = 1; + } + /* + * If this isn't a real add, we're done with the buffer. + */ + if (args->justcheck) + xfs_da_brelse(tp, bp); + /* + * If we don't have space for the new entry & leaf ... + */ + if (!dup) { + /* + * Not trying to actually do anything, or don't have + * a space reservation: return no-space. + */ + if (args->justcheck || args->total == 0) + return XFS_ERROR(ENOSPC); + /* + * Convert to the next larger format. + * Then add the new entry in that format. + */ + error = xfs_dir2_block_to_leaf(args, bp); + xfs_da_buf_done(bp); + if (error) + return error; + return xfs_dir2_leaf_addname(args); + } + /* + * Just checking, and it would work, so say so. + */ + if (args->justcheck) + return 0; + needlog = needscan = 0; + /* + * If need to compact the leaf entries, do it now. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ + if (compact) { + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + + for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1, + highstale = lfloghigh = -1; + fromidx >= 0; + fromidx--) { + if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) { + if (highstale == -1) + highstale = toidx; + else { + if (lfloghigh == -1) + lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1); + lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1; + INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)), + &needlog, &needscan); + blp += INT_GET(btp->stale, ARCH_CONVERT) - 1; + INT_SET(btp->stale, ARCH_CONVERT, 1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, + &needlog, NULL); + needscan = 0; + } + } + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ + else if (INT_GET(btp->stale, ARCH_CONVERT)) { + lfloglow = INT_GET(btp->count, ARCH_CONVERT); + lfloghigh = -1; + } + /* + * Find the slot that's first lower than our hash value, -1 if none. + */ + for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) { + mid = (low + high) >> 1; + if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + } + while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) { + mid--; + } + /* + * No stale entries, will use enddup space to hold new leaf. + */ + if (!btp->stale) { + /* + * Mark the space needed for the new leaf entry, now in use. + */ + xfs_dir2_data_use_free(tp, bp, enddup, + (xfs_dir2_data_aoff_t) + ((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) - + sizeof(*blp)), + (xfs_dir2_data_aoff_t)sizeof(*blp), + &needlog, &needscan); + /* + * Update the tail (entry count). + */ + INT_MOD(btp->count, ARCH_CONVERT, +1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) { + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, + &needlog, NULL); + needscan = 0; + } + /* + * Adjust pointer to the first leaf entry, we're about to move + * the table up one to open up space for the new leaf entry. + * Then adjust our index to match. + */ + blp--; + mid++; + if (mid) + memmove(blp, &blp[1], mid * sizeof(*blp)); + lfloglow = 0; + lfloghigh = mid; + } + /* + * Use a stale leaf for our new entry. + */ + else { + for (lowstale = mid; + lowstale >= 0 && + INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + for (highstale = mid + 1; + highstale < INT_GET(btp->count, ARCH_CONVERT) && + INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || mid - lowstale > highstale - mid); + highstale++) + continue; + /* + * Move entries toward the low-numbered stale entry. + */ + if (lowstale >= 0 && + (highstale == INT_GET(btp->count, ARCH_CONVERT) || + mid - lowstale <= highstale - mid)) { + if (mid - lowstale) + memmove(&blp[lowstale], &blp[lowstale + 1], + (mid - lowstale) * sizeof(*blp)); + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(mid, lfloghigh); + } + /* + * Move entries toward the high-numbered stale entry. + */ + else { + ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT)); + mid++; + if (highstale - mid) + memmove(&blp[mid + 1], &blp[mid], + (highstale - mid) * sizeof(*blp)); + lfloglow = MIN(mid, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + INT_MOD(btp->stale, ARCH_CONVERT, -1); + } + /* + * Point to the new data entry. + */ + dep = (xfs_dir2_data_entry_t *)dup; + /* + * Fill in the leaf entry. + */ + INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval); + INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block)); + xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); + /* + * Mark space for the data entry used. + */ + xfs_dir2_data_use_free(tp, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + /* + * Create the new data entry. + */ + INT_SET(dep->inumber, ARCH_CONVERT, args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, args->namelen); + tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block)); + /* + * Clean up the bestfree array and log the header, tail, and entry. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, + NULL); + if (needlog) + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_log_entry(tp, bp, dep); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} + +/* + * Readdir for block directories. + */ +int /* error */ +xfs_dir2_block_getdents( + xfs_trans_t *tp, /* transaction (NULL) */ + xfs_inode_t *dp, /* incore inode */ + uio_t *uio, /* caller's buffer control */ + int *eofp, /* eof reached? (out) */ + xfs_dirent_t *dbp, /* caller's buffer */ + xfs_dir2_put_t put) /* abi's formatting function */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dabuf_t *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_put_args_t p; /* arg package for put rtn */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + + mp = dp->i_mount; + /* + * If the block number in the offset is out of range, we're done. + */ + if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) { + *eofp = 1; + return 0; + } + /* + * Can't read the block, give up, else get dabuf in bp. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset); + block = bp->data; + xfs_dir2_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + ptr = (char *)block->u; + endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + p.dbp = dbp; + p.put = put; + p.uio = uio; + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) { + ptr += INT_GET(dup->length, ARCH_CONVERT); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + /* + * The entry is before the desired starting point, skip it. + */ + if ((char *)dep - (char *)block < wantoff) + continue; + /* + * Set up argument structure for put routine. + */ + p.namelen = dep->namelen; + + p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + ptr - (char *)block); + p.ino = INT_GET(dep->inumber, ARCH_CONVERT); +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = (char *)dep->name; + + /* + * Put the entry in the caller's buffer. + */ + error = p.put(&p); + + /* + * If it didn't fit, set the final offset to here & return. + */ + if (!p.done) { + uio->uio_offset = + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + (char *)dep - (char *)block); + xfs_da_brelse(tp, bp); + return error; + } + } + + /* + * Reached the end of the block. + * Set the offset to a nonexistent block 1 and return. + */ + *eofp = 1; + + uio->uio_offset = + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); + + xfs_da_brelse(tp, bp); + + return 0; +} + +/* + * Log leaf entries from the block. + */ +static void +xfs_dir2_block_log_leaf( + xfs_trans_t *tp, /* transaction structure */ + xfs_dabuf_t *bp, /* block buffer */ + int first, /* index of first logged leaf */ + int last) /* index of last logged leaf */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + block = bp->data; + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), + (uint)((char *)&blp[last + 1] - (char *)block - 1)); +} + +/* + * Log the block tail. + */ +static void +xfs_dir2_block_log_tail( + xfs_trans_t *tp, /* transaction structure */ + xfs_dabuf_t *bp) /* block buffer */ +{ + xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + block = bp->data; + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), + (uint)((char *)(btp + 1) - (char *)block - 1)); +} + +/* + * Look up an entry in the block. This is the external routine, + * xfs_dir2_block_lookup_int does the real work. + */ +int /* error */ +xfs_dir2_block_lookup( + xfs_da_args_t *args) /* dir lookup arguments */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + xfs_dir2_trace_args("block_lookup", args); + /* + * Get the buffer, look up the entry. + * If not found (ENOENT) then return, have no buffer. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) + return error; + dp = args->dp; + mp = dp->i_mount; + block = bp->data; + xfs_dir2_data_check(dp, bp); + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + /* + * Get the offset from the leaf entry, to point to the data. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT))); + /* + * Fill in inode number, release the block. + */ + args->inumber = INT_GET(dep->inumber, ARCH_CONVERT); + xfs_da_brelse(args->trans, bp); + return XFS_ERROR(EEXIST); +} + +/* + * Internal block lookup routine. + */ +static int /* error */ +xfs_dir2_block_lookup_int( + xfs_da_args_t *args, /* dir lookup arguments */ + xfs_dabuf_t **bpp, /* returned block buffer */ + int *entno) /* returned entry number */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int error; /* error return value */ + xfs_dahash_t hash; /* found hash value */ + int high; /* binary search high index */ + int low; /* binary search low index */ + int mid; /* binary search current idx */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the buffer, return error if we can't get it. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + block = bp->data; + xfs_dir2_data_check(dp, bp); + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + /* + * Loop doing a binary search for our hash value. + * Find our entry, ENOENT if it's not there. + */ + for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) { + ASSERT(low <= high); + mid = (low + high) >> 1; + if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval) + break; + if (hash < args->hashval) + low = mid + 1; + else + high = mid - 1; + if (low > high) { + ASSERT(args->oknoent); + xfs_da_brelse(tp, bp); + return XFS_ERROR(ENOENT); + } + } + /* + * Back up to the first one with the right hash value. + */ + while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) { + mid--; + } + /* + * Now loop forward through all the entries with the + * right hash value looking for our name. + */ + do { + if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get pointer to the entry from the leaf. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + /* + * Compare, if it's right give back buffer & entry number. + */ + if (dep->namelen == args->namelen && + dep->name[0] == args->name[0] && + memcmp(dep->name, args->name, args->namelen) == 0) { + *bpp = bp; + *entno = mid; + return 0; + } + } while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash); + /* + * No match, release the buffer and return ENOENT. + */ + ASSERT(args->oknoent); + xfs_da_brelse(tp, bp); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a block format directory. + * If that makes the block small enough to fit in shortform, transform it. + */ +int /* error */ +xfs_dir2_block_removename( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* block leaf entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to fixup bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* shortform size */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args("block_removename", args); + /* + * Look up the entry in the block. Gets the buffer and entry index. + * It will always be there, the vnodeops level does a lookup first. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + block = bp->data; + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + /* + * Point to the data entry using the leaf entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT))); + /* + * Mark the data entry's space free. + */ + needlog = needscan = 0; + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), + XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + /* + * Fix up the block tail. + */ + INT_MOD(btp->stale, ARCH_CONVERT, +1); + xfs_dir2_block_log_tail(tp, bp); + /* + * Remove the leaf entry by marking it stale. + */ + INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR); + xfs_dir2_block_log_leaf(tp, bp, ent, ent); + /* + * Fix up bestfree, log the header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, + NULL); + if (needlog) + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_check(dp, bp); + /* + * See if the size as a shortform is good enough. + */ + if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > + XFS_IFORK_DSIZE(dp)) { + xfs_da_buf_done(bp); + return 0; + } + /* + * If it works, do the conversion. + */ + return xfs_dir2_block_to_sf(args, bp, size, &sfh); +} + +/* + * Replace an entry in a V2 block directory. + * Change the inode number to the new value. + */ +int /* error */ +xfs_dir2_block_replace( + xfs_da_args_t *args) /* directory operation args */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_inode_t *dp; /* incore inode */ + int ent; /* leaf entry index */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + xfs_dir2_trace_args("block_replace", args); + /* + * Lookup the entry in the directory. Get buffer and entry index. + * This will always succeed since the caller has already done a lookup. + */ + if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) { + return error; + } + dp = args->dp; + mp = dp->i_mount; + block = bp->data; + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + /* + * Point to the data entry we need to change. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT))); + ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber); + /* + * Change the inode number to the new value. + */ + INT_SET(dep->inumber, ARCH_CONVERT, args->inumber); + xfs_dir2_data_log_entry(args->trans, bp, dep); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} + +/* + * Qsort comparison routine for the block leaf entries. + */ +static int /* sort order */ +xfs_dir2_block_sort( + const void *a, /* first leaf entry */ + const void *b) /* second leaf entry */ +{ + const xfs_dir2_leaf_entry_t *la; /* first leaf entry */ + const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */ + + la = a; + lb = b; + return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 : + (INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0); +} + +/* + * Convert a V2 leaf directory to a V2 block directory if possible. + */ +int /* error */ +xfs_dir2_leaf_to_block( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp, /* leaf buffer */ + xfs_dabuf_t *dbp) /* data buffer */ +{ + xfs_dir2_data_off_t *bestsp; /* leaf bests table */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + int error; /* error return value */ + int from; /* leaf from index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* file system mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to scan for bestfree */ + xfs_dir2_sf_hdr_t sfh; /* shortform header */ + int size; /* bytes used */ + xfs_dir2_data_off_t *tagp; /* end of entry (tag) */ + int to; /* block/leaf to index */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC); + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + /* + * If there are data blocks other than the first one, take this + * opportunity to remove trailing empty data blocks that may have + * been left behind during no-space-reservation operations. + * These will show up in the leaf bests table. + */ + while (dp->i_d.di_size > mp->m_dirblksize) { + bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) == + mp->m_dirblksize - (uint)sizeof(block->hdr)) { + if ((error = + xfs_dir2_leaf_trim_data(args, lbp, + (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1)))) + goto out; + } else { + error = 0; + goto out; + } + } + /* + * Read the data block if we don't already have it, give up if it fails. + */ + if (dbp == NULL && + (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, + XFS_DATA_FORK))) { + goto out; + } + block = dbp->data; + ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC); + /* + * Size of the "leaf" area in the block. + */ + size = (uint)sizeof(block->tail) + + (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)); + /* + * Look at the last data entry. + */ + tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1; + dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT)); + /* + * If it's not free or is too short we can't do it. + */ + if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) { + error = 0; + goto out; + } + /* + * Start converting it to block form. + */ + INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC); + needlog = 1; + needscan = 0; + /* + * Use up the space at the end of the block (blp/btp). + */ + xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size, + &needlog, &needscan); + /* + * Initialize the block tail. + */ + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)); + btp->stale = 0; + xfs_dir2_block_log_tail(tp, dbp); + /* + * Initialize the block leaf area. We compact out stale entries. + */ + lep = XFS_DIR2_BLOCK_LEAF_P(btp); + for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) { + if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + continue; + lep[to++] = leaf->ents[from]; + } + ASSERT(to == INT_GET(btp->count, ARCH_CONVERT)); + xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1); + /* + * Scan the bestfree if we need it and log the data block header. + */ + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, + NULL); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * Pitch the old leaf block. + */ + error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); + lbp = NULL; + if (error) { + goto out; + } + /* + * Now see if the resulting block can be shrunken to shortform. + */ + if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > + XFS_IFORK_DSIZE(dp)) { + error = 0; + goto out; + } + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); +out: + if (lbp) + xfs_da_buf_done(lbp); + if (dbp) + xfs_da_buf_done(dbp); + return error; +} + +/* + * Convert the shortform directory to block form. + */ +int /* error */ +xfs_dir2_sf_to_block( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_db_t blkno; /* dir-relative block # (0) */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + char *buf; /* sf buffer */ + int buf_len; + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + int dummy; /* trash */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int endoffset; /* end of data objects */ + int error; /* error return value */ + int i; /* index */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to scan block freespc */ + int newoffset; /* offset from current entry */ + int offset; /* target block offset */ + xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_data_off_t *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args("sf_to_block", args); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bomb out if the shortform directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + /* + * Copy the directory into the stack buffer. + * Then pitch the incore inode data so we can make extents. + */ + + buf_len = dp->i_df.if_bytes; + buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); + + memcpy(buf, sfp, dp->i_df.if_bytes); + xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); + dp->i_d.di_size = 0; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + /* + * Reset pointer - old sfp is gone. + */ + sfp = (xfs_dir2_sf_t *)buf; + /* + * Add block 0 to the inode. + */ + error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); + if (error) { + kmem_free(buf, buf_len); + return error; + } + /* + * Initialize the data block. + */ + error = xfs_dir2_data_init(args, blkno, &bp); + if (error) { + kmem_free(buf, buf_len); + return error; + } + block = bp->data; + INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC); + /* + * Compute size of block "tail" area. + */ + i = (uint)sizeof(*btp) + + (INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); + /* + * The whole thing is initialized to free by the init routine. + * Say we're using the leaf and tail area. + */ + dup = (xfs_dir2_data_unused_t *)block->u; + needlog = needscan = 0; + xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, + &needscan); + ASSERT(needscan == 0); + /* + * Fill in the tail. + */ + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + INT_SET(btp->count, ARCH_CONVERT, INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2); /* ., .. */ + btp->stale = 0; + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + endoffset = (uint)((char *)blp - (char *)block); + /* + * Remove the freespace, we'll manage it. + */ + xfs_dir2_data_use_free(tp, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + INT_GET(dup->length, ARCH_CONVERT), &needlog, &needscan); + /* + * Create entry for . + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATA_DOT_OFFSET); + INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino); + dep->namelen = 1; + dep->name[0] = '.'; + tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block)); + xfs_dir2_data_log_entry(tp, bp, dep); + INT_SET(blp[0].hashval, ARCH_CONVERT, xfs_dir_hash_dot); + INT_SET(blp[0].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block)); + /* + * Create entry for .. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); + INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + dep->namelen = 2; + dep->name[0] = dep->name[1] = '.'; + tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block)); + xfs_dir2_data_log_entry(tp, bp, dep); + INT_SET(blp[1].hashval, ARCH_CONVERT, xfs_dir_hash_dotdot); + INT_SET(blp[1].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block)); + offset = XFS_DIR2_DATA_FIRST_OFFSET; + /* + * Loop over existing entries, stuff them in. + */ + if ((i = 0) == INT_GET(sfp->hdr.count, ARCH_CONVERT)) + sfep = NULL; + else + sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + /* + * Need to preserve the existing offset values in the sf directory. + * Insert holes (unused entries) where necessary. + */ + while (offset < endoffset) { + /* + * sfep is null when we reach the end of the list. + */ + if (sfep == NULL) + newoffset = endoffset; + else + newoffset = XFS_DIR2_SF_GET_OFFSET(sfep); + /* + * There should be a hole here, make one. + */ + if (offset < newoffset) { + dup = (xfs_dir2_data_unused_t *) + ((char *)block + offset); + INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG); + INT_SET(dup->length, ARCH_CONVERT, newoffset - offset); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT, + (xfs_dir2_data_off_t) + ((char *)dup - (char *)block)); + xfs_dir2_data_log_unused(tp, bp, dup); + (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, + dup, &dummy); + offset += INT_GET(dup->length, ARCH_CONVERT); + continue; + } + /* + * Copy a real entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); + INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, + XFS_DIR2_SF_INUMBERP(sfep))); + dep->namelen = sfep->namelen; + memcpy(dep->name, sfep->name, dep->namelen); + tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block)); + xfs_dir2_data_log_entry(tp, bp, dep); + INT_SET(blp[2 + i].hashval, ARCH_CONVERT, xfs_da_hashname((char *)sfep->name, sfep->namelen)); + INT_SET(blp[2 + i].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, + (char *)dep - (char *)block)); + offset = (int)((char *)(tagp + 1) - (char *)block); + if (++i == INT_GET(sfp->hdr.count, ARCH_CONVERT)) + sfep = NULL; + else + sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + } + /* Done with the temporary buffer */ + kmem_free(buf, buf_len); + /* + * Sort the leaf entries by hash value. + */ + qsort(blp, INT_GET(btp->count, ARCH_CONVERT), sizeof(*blp), xfs_dir2_block_sort); + /* + * Log the leaf entry area and tail. + * Already logged the header in data_init, ignore needlog. + */ + ASSERT(needscan == 0); + xfs_dir2_block_log_leaf(tp, bp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1); + xfs_dir2_block_log_tail(tp, bp); + xfs_dir2_data_check(dp, bp); + xfs_da_buf_done(bp); + return 0; +} diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h new file mode 100644 index 00000000000..5a578b84e24 --- /dev/null +++ b/fs/xfs/xfs_dir2_block.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_BLOCK_H__ +#define __XFS_DIR2_BLOCK_H__ + +/* + * xfs_dir2_block.h + * Directory version 2, single block format structures + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_data_hdr; +struct xfs_dir2_leaf_entry; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * The single block format is as follows: + * xfs_dir2_data_hdr_t structure + * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures + * xfs_dir2_leaf_entry_t structures + * xfs_dir2_block_tail_t structure + */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */ + +typedef struct xfs_dir2_block_tail { + __uint32_t count; /* count of leaf entries */ + __uint32_t stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Generic single-block structure, for xfs_db. + */ +typedef struct xfs_dir2_block { + xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_union_t u[1]; + xfs_dir2_leaf_entry_t leaf[1]; + xfs_dir2_block_tail_t tail; +} xfs_dir2_block_t; + +/* + * Pointer to the leaf header embedded in a data block (1-block format) + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_TAIL_P) +xfs_dir2_block_tail_t * +xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block); +#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block) +#else +#define XFS_DIR2_BLOCK_TAIL_P(mp,block) \ + (((xfs_dir2_block_tail_t *)((char *)(block) + (mp)->m_dirblksize)) - 1) +#endif + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_LEAF_P) +struct xfs_dir2_leaf_entry *xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp); +#define XFS_DIR2_BLOCK_LEAF_P(btp) \ + xfs_dir2_block_leaf_p(btp) +#else +#define XFS_DIR2_BLOCK_LEAF_P(btp) \ + (((struct xfs_dir2_leaf_entry *)(btp)) - INT_GET((btp)->count, ARCH_CONVERT)) +#endif + +/* + * Function declarations. + */ + +extern int + xfs_dir2_block_addname(struct xfs_da_args *args); + +extern int + xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp, + struct uio *uio, int *eofp, struct xfs_dirent *dbp, + xfs_dir2_put_t put); + +extern int + xfs_dir2_block_lookup(struct xfs_da_args *args); + +extern int + xfs_dir2_block_removename(struct xfs_da_args *args); + +extern int + xfs_dir2_block_replace(struct xfs_da_args *args); + +extern int + xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_dabuf *lbp, + struct xfs_dabuf *dbp); + +extern int + xfs_dir2_sf_to_block(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_BLOCK_H__ */ diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c new file mode 100644 index 00000000000..db9887a107d --- /dev/null +++ b/fs/xfs/xfs_dir2_data.c @@ -0,0 +1,855 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_data.c + * Core data block handling routines for XFS V2 directories. + * See xfs_dir2_data.h for data structures. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_error.h" + +#ifdef DEBUG +/* + * Check the consistency of the data block. + * The input can also be a block-format directory. + * Pop an assert if we find anything bad. + */ +void +xfs_dir2_data_check( + xfs_inode_t *dp, /* incore inode pointer */ + xfs_dabuf_t *bp) /* data block's buffer */ +{ + xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ + xfs_dir2_data_free_t *bf; /* bestfree table */ + xfs_dir2_block_tail_t *btp=NULL; /* block tail */ + int count; /* count of entries found */ + xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + char *endp; /* end of useful data */ + int freeseen; /* mask of bestfrees seen */ + xfs_dahash_t hash; /* hash of current name */ + int i; /* leaf index */ + int lastfree; /* last entry was unused */ + xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ + xfs_mount_t *mp; /* filesystem mount point */ + char *p; /* current data position */ + int stale; /* count of stale leaves */ + + mp = dp->i_mount; + d = bp->data; + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + bf = d->hdr.bestfree; + p = (char *)d->u; + if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) { + btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); + lep = XFS_DIR2_BLOCK_LEAF_P(btp); + endp = (char *)lep; + } else + endp = (char *)d + mp->m_dirblksize; + count = lastfree = freeseen = 0; + /* + * Account for zero bestfree entries. + */ + if (!bf[0].length) { + ASSERT(!bf[0].offset); + freeseen |= 1 << 0; + } + if (!bf[1].length) { + ASSERT(!bf[1].offset); + freeseen |= 1 << 1; + } + if (!bf[2].length) { + ASSERT(!bf[2].offset); + freeseen |= 1 << 2; + } + ASSERT(INT_GET(bf[0].length, ARCH_CONVERT) >= INT_GET(bf[1].length, ARCH_CONVERT)); + ASSERT(INT_GET(bf[1].length, ARCH_CONVERT) >= INT_GET(bf[2].length, ARCH_CONVERT)); + /* + * Loop over the data/unused entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's unused, look for the space in the bestfree table. + * If we find it, account for that, else make sure it + * doesn't need to be there. + */ + if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT(lastfree == 0); + ASSERT(INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT) == + (char *)dup - (char *)d); + dfp = xfs_dir2_data_freefind(d, dup); + if (dfp) { + i = (int)(dfp - bf); + ASSERT((freeseen & (1 << i)) == 0); + freeseen |= 1 << i; + } else + ASSERT(INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(bf[2].length, ARCH_CONVERT)); + p += INT_GET(dup->length, ARCH_CONVERT); + lastfree = 1; + continue; + } + /* + * It's a real entry. Validate the fields. + * If this is a block directory then make sure it's + * in the leaf section of the block. + * The linear search is crude but this is DEBUG code. + */ + dep = (xfs_dir2_data_entry_t *)p; + ASSERT(dep->namelen != 0); + ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0); + ASSERT(INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT) == + (char *)dep - (char *)d); + count++; + lastfree = 0; + if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) { + addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)d)); + hash = xfs_da_hashname((char *)dep->name, dep->namelen); + for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) { + if (INT_GET(lep[i].address, ARCH_CONVERT) == addr && + INT_GET(lep[i].hashval, ARCH_CONVERT) == hash) + break; + } + ASSERT(i < INT_GET(btp->count, ARCH_CONVERT)); + } + p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + } + /* + * Need to have seen all the entries and all the bestfree slots. + */ + ASSERT(freeseen == 7); + if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) { + for (i = stale = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) { + if (INT_GET(lep[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + stale++; + if (i > 0) + ASSERT(INT_GET(lep[i].hashval, ARCH_CONVERT) >= INT_GET(lep[i - 1].hashval, ARCH_CONVERT)); + } + ASSERT(count == INT_GET(btp->count, ARCH_CONVERT) - INT_GET(btp->stale, ARCH_CONVERT)); + ASSERT(stale == INT_GET(btp->stale, ARCH_CONVERT)); + } +} +#endif + +/* + * Given a data block and an unused entry from that block, + * return the bestfree entry if any that corresponds to it. + */ +xfs_dir2_data_free_t * +xfs_dir2_data_freefind( + xfs_dir2_data_t *d, /* data block */ + xfs_dir2_data_unused_t *dup) /* data unused entry */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_aoff_t off; /* offset value needed */ +#if defined(DEBUG) && defined(__KERNEL__) + int matched; /* matched the value */ + int seenzero; /* saw a 0 bestfree entry */ +#endif + + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d); +#if defined(DEBUG) && defined(__KERNEL__) + /* + * Validate some consistency in the bestfree table. + * Check order, non-overlapping entries, and if we find the + * one we're looking for it has to be exact. + */ + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0; + dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) { + ASSERT(!dfp->length); + seenzero = 1; + continue; + } + ASSERT(seenzero == 0); + if (INT_GET(dfp->offset, ARCH_CONVERT) == off) { + matched = 1; + ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(dup->length, ARCH_CONVERT)); + } else if (off < INT_GET(dfp->offset, ARCH_CONVERT)) + ASSERT(off + INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(dfp->offset, ARCH_CONVERT)); + else + ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) + INT_GET(dfp->length, ARCH_CONVERT) <= off); + ASSERT(matched || INT_GET(dfp->length, ARCH_CONVERT) >= INT_GET(dup->length, ARCH_CONVERT)); + if (dfp > &d->hdr.bestfree[0]) + ASSERT(INT_GET(dfp[-1].length, ARCH_CONVERT) >= INT_GET(dfp[0].length, ARCH_CONVERT)); + } +#endif + /* + * If this is smaller than the smallest bestfree entry, + * it can't be there since they're sorted. + */ + if (INT_GET(dup->length, ARCH_CONVERT) < INT_GET(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length, ARCH_CONVERT)) + return NULL; + /* + * Look at the three bestfree entries for our guy. + */ + for (dfp = &d->hdr.bestfree[0]; + dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) + return NULL; + if (INT_GET(dfp->offset, ARCH_CONVERT) == off) + return dfp; + } + /* + * Didn't find it. This only happens if there are duplicate lengths. + */ + return NULL; +} + +/* + * Insert an unused-space entry into the bestfree table. + */ +xfs_dir2_data_free_t * /* entry inserted */ +xfs_dir2_data_freeinsert( + xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_unused_t *dup, /* unused space */ + int *loghead) /* log the data header (out) */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ + xfs_dir2_data_free_t new; /* new bestfree entry */ + +#ifdef __KERNEL__ + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); +#endif + dfp = d->hdr.bestfree; + INT_COPY(new.length, dup->length, ARCH_CONVERT); + INT_SET(new.offset, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dup - (char *)d)); + /* + * Insert at position 0, 1, or 2; or not at all. + */ + if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[0].length, ARCH_CONVERT)) { + dfp[2] = dfp[1]; + dfp[1] = dfp[0]; + dfp[0] = new; + *loghead = 1; + return &dfp[0]; + } + if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[1].length, ARCH_CONVERT)) { + dfp[2] = dfp[1]; + dfp[1] = new; + *loghead = 1; + return &dfp[1]; + } + if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[2].length, ARCH_CONVERT)) { + dfp[2] = new; + *loghead = 1; + return &dfp[2]; + } + return NULL; +} + +/* + * Remove a bestfree entry from the table. + */ +void +xfs_dir2_data_freeremove( + xfs_dir2_data_t *d, /* data block pointer */ + xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ + int *loghead) /* out: log data header */ +{ +#ifdef __KERNEL__ + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); +#endif + /* + * It's the first entry, slide the next 2 up. + */ + if (dfp == &d->hdr.bestfree[0]) { + d->hdr.bestfree[0] = d->hdr.bestfree[1]; + d->hdr.bestfree[1] = d->hdr.bestfree[2]; + } + /* + * It's the second entry, slide the 3rd entry up. + */ + else if (dfp == &d->hdr.bestfree[1]) + d->hdr.bestfree[1] = d->hdr.bestfree[2]; + /* + * Must be the last entry. + */ + else + ASSERT(dfp == &d->hdr.bestfree[2]); + /* + * Clear the 3rd entry, must be zero now. + */ + d->hdr.bestfree[2].length = 0; + d->hdr.bestfree[2].offset = 0; + *loghead = 1; +} + +/* + * Given a data block, reconstruct its bestfree map. + */ +void +xfs_dir2_data_freescan( + xfs_mount_t *mp, /* filesystem mount point */ + xfs_dir2_data_t *d, /* data block pointer */ + int *loghead, /* out: log data header */ + char *aendp) /* in: caller's endp */ +{ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* active data entry */ + xfs_dir2_data_unused_t *dup; /* unused data entry */ + char *endp; /* end of block's data */ + char *p; /* current entry pointer */ + +#ifdef __KERNEL__ + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); +#endif + /* + * Start by clearing the table. + */ + memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree)); + *loghead = 1; + /* + * Set up pointers. + */ + p = (char *)d->u; + if (aendp) + endp = aendp; + else if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) { + btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); + endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + } else + endp = (char *)d + mp->m_dirblksize; + /* + * Loop over the block's entries. + */ + while (p < endp) { + dup = (xfs_dir2_data_unused_t *)p; + /* + * If it's a free entry, insert it. + */ + if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) { + ASSERT((char *)dup - (char *)d == + INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT)); + xfs_dir2_data_freeinsert(d, dup, loghead); + p += INT_GET(dup->length, ARCH_CONVERT); + } + /* + * For active entries, check their tags and skip them. + */ + else { + dep = (xfs_dir2_data_entry_t *)p; + ASSERT((char *)dep - (char *)d == + INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT)); + p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + } + } +} + +/* + * Initialize a data block at the given block number in the directory. + * Give back the buffer for the created block. + */ +int /* error */ +xfs_dir2_data_init( + xfs_da_args_t *args, /* directory operation args */ + xfs_dir2_db_t blkno, /* logical dir block number */ + xfs_dabuf_t **bpp) /* output block buffer */ +{ + xfs_dabuf_t *bp; /* block buffer */ + xfs_dir2_data_t *d; /* pointer to block */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + int error; /* error return value */ + int i; /* bestfree index */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + int t; /* temp */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Get the buffer set up for the block. + */ + error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(bp != NULL); + /* + * Initialize the header. + */ + d = bp->data; + INT_SET(d->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC); + INT_SET(d->hdr.bestfree[0].offset, ARCH_CONVERT, (xfs_dir2_data_off_t)sizeof(d->hdr)); + for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { + d->hdr.bestfree[i].length = 0; + d->hdr.bestfree[i].offset = 0; + } + /* + * Set up an unused entry for the block's body. + */ + dup = &d->u[0].unused; + INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG); + + t=mp->m_dirblksize - (uint)sizeof(d->hdr); + INT_SET(d->hdr.bestfree[0].length, ARCH_CONVERT, t); + INT_SET(dup->length, ARCH_CONVERT, t); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)dup - (char *)d)); + /* + * Log it and return it. + */ + xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_unused(tp, bp, dup); + *bpp = bp; + return 0; +} + +/* + * Log an active data entry from the block. + */ +void +xfs_dir2_data_log_entry( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_entry_t *dep) /* data entry pointer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), + (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) - + (char *)d - 1)); +} + +/* + * Log a data block header. + */ +void +xfs_dir2_data_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* block buffer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d), + (uint)(sizeof(d->hdr) - 1)); +} + +/* + * Log a data unused entry. + */ +void +xfs_dir2_data_log_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_unused_t *dup) /* data unused pointer */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + + d = bp->data; + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + /* + * Log the first part of the unused entry. + */ + xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d), + (uint)((char *)&dup->length + sizeof(dup->length) - + 1 - (char *)d)); + /* + * Log the end (tag) of the unused entry. + */ + xfs_da_log_buf(tp, bp, + (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d), + (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d + + sizeof(xfs_dir2_data_off_t) - 1)); +} + +/* + * Make a byte range in the data block unused. + * Its current contents are unimportant. + */ +void +xfs_dir2_data_make_free( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* block buffer */ + xfs_dir2_data_aoff_t offset, /* starting byte offset */ + xfs_dir2_data_aoff_t len, /* length in bytes */ + int *needlogp, /* out: log header */ + int *needscanp) /* out: regen bestfree */ +{ + xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + char *endptr; /* end of data area */ + xfs_mount_t *mp; /* filesystem mount point */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *postdup; /* unused entry after us */ + xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + + mp = tp->t_mountp; + d = bp->data; + /* + * Figure out where the end of the data area is. + */ + if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC) + endptr = (char *)d + mp->m_dirblksize; + else { + xfs_dir2_block_tail_t *btp; /* block tail */ + + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); + endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + } + /* + * If this isn't the start of the block, then back up to + * the previous entry and see if it's free. + */ + if (offset > sizeof(d->hdr)) { + xfs_dir2_data_off_t *tagp; /* tag just before us */ + + tagp = (xfs_dir2_data_off_t *)((char *)d + offset) - 1; + prevdup = (xfs_dir2_data_unused_t *)((char *)d + INT_GET(*tagp, ARCH_CONVERT)); + if (INT_GET(prevdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG) + prevdup = NULL; + } else + prevdup = NULL; + /* + * If this isn't the end of the block, see if the entry after + * us is free. + */ + if ((char *)d + offset + len < endptr) { + postdup = + (xfs_dir2_data_unused_t *)((char *)d + offset + len); + if (INT_GET(postdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG) + postdup = NULL; + } else + postdup = NULL; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * Previous and following entries are both free, + * merge everything into a single free entry. + */ + if (prevdup && postdup) { + xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ + + /* + * See if prevdup and/or postdup are in bestfree table. + */ + dfp = xfs_dir2_data_freefind(d, prevdup); + dfp2 = xfs_dir2_data_freefind(d, postdup); + /* + * We need a rescan unless there are exactly 2 free entries + * namely our two. Then we know what's happening, otherwise + * since the third bestfree is there, there might be more + * entries. + */ + needscan = d->hdr.bestfree[2].length; + /* + * Fix up the new big freespace. + */ + INT_MOD(prevdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT)); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)prevdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, prevdup); + if (!needscan) { + /* + * Has to be the case that entries 0 and 1 are + * dfp and dfp2 (don't know which is which), and + * entry 2 is empty. + * Remove entry 1 first then entry 0. + */ + ASSERT(dfp && dfp2); + if (dfp == &d->hdr.bestfree[1]) { + dfp = &d->hdr.bestfree[0]; + ASSERT(dfp2 == dfp); + dfp2 = &d->hdr.bestfree[1]; + } + xfs_dir2_data_freeremove(d, dfp2, needlogp); + xfs_dir2_data_freeremove(d, dfp, needlogp); + /* + * Now insert the new entry. + */ + dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp); + ASSERT(dfp == &d->hdr.bestfree[0]); + ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(prevdup->length, ARCH_CONVERT)); + ASSERT(!dfp[1].length); + ASSERT(!dfp[2].length); + } + } + /* + * The entry before us is free, merge with it. + */ + else if (prevdup) { + dfp = xfs_dir2_data_freefind(d, prevdup); + INT_MOD(prevdup->length, ARCH_CONVERT, len); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)prevdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, prevdup); + /* + * If the previous entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else + needscan = INT_GET(prevdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT); + } + /* + * The following entry is free, merge with it. + */ + else if (postdup) { + dfp = xfs_dir2_data_freefind(d, postdup); + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG); + INT_SET(newdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT)); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)newdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If the following entry was in the table, the new entry + * is longer, so it will be in the table too. Remove + * the old one and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + } + /* + * Otherwise we need a scan if the new entry is big enough. + */ + else + needscan = INT_GET(newdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT); + } + /* + * Neither neighbor is free. Make a new entry. + */ + else { + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG); + INT_SET(newdup->length, ARCH_CONVERT, len); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)newdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, newdup); + (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + } + *needscanp = needscan; +} + +/* + * Take a byte range out of an existing unused space and make it un-free. + */ +void +xfs_dir2_data_use_free( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* data block buffer */ + xfs_dir2_data_unused_t *dup, /* unused entry */ + xfs_dir2_data_aoff_t offset, /* starting offset to use */ + xfs_dir2_data_aoff_t len, /* length to use */ + int *needlogp, /* out: need to log header */ + int *needscanp) /* out: need regen bestfree */ +{ + xfs_dir2_data_t *d; /* data block */ + xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + int matchback; /* matches end of freespace */ + int matchfront; /* matches start of freespace */ + int needscan; /* need to regen bestfree */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ + int oldlen; /* old unused entry's length */ + + d = bp->data; + ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC || + INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC); + ASSERT(INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG); + ASSERT(offset >= (char *)dup - (char *)d); + ASSERT(offset + len <= (char *)dup + INT_GET(dup->length, ARCH_CONVERT) - (char *)d); + ASSERT((char *)dup - (char *)d == INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT)); + /* + * Look up the entry in the bestfree table. + */ + dfp = xfs_dir2_data_freefind(d, dup); + oldlen = INT_GET(dup->length, ARCH_CONVERT); + ASSERT(dfp || oldlen <= INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT)); + /* + * Check for alignment with front and back of the entry. + */ + matchfront = (char *)dup - (char *)d == offset; + matchback = (char *)dup + oldlen - (char *)d == offset + len; + ASSERT(*needscanp == 0); + needscan = 0; + /* + * If we matched it exactly we just need to get rid of it from + * the bestfree table. + */ + if (matchfront && matchback) { + if (dfp) { + needscan = d->hdr.bestfree[2].offset; + if (!needscan) + xfs_dir2_data_freeremove(d, dfp, needlogp); + } + } + /* + * We match the first part of the entry. + * Make a new entry with the remaining freespace. + */ + else if (matchfront) { + newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG); + INT_SET(newdup->length, ARCH_CONVERT, oldlen - len); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)newdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + ASSERT(dfp != NULL); + ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT)); + ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &d->hdr.bestfree[2]; + } + } + /* + * We match the last part of the entry. + * Trim the allocated space off the tail of the entry. + */ + else if (matchback) { + newdup = dup; + INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t) + (((char *)d + offset) - (char *)newdup)); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)newdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, newdup); + /* + * If it was in the table, remove it and add the new one. + */ + if (dfp) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + ASSERT(dfp != NULL); + ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT)); + ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d); + /* + * If we got inserted at the last slot, + * that means we don't know if there was a better + * choice for the last slot, or not. Rescan. + */ + needscan = dfp == &d->hdr.bestfree[2]; + } + } + /* + * Poking out the middle of an entry. + * Make two new entries. + */ + else { + newdup = dup; + INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t) + (((char *)d + offset) - (char *)newdup)); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)newdup - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, newdup); + newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + INT_SET(newdup2->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG); + INT_SET(newdup2->length, ARCH_CONVERT, oldlen - len - INT_GET(newdup->length, ARCH_CONVERT)); + INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup2), ARCH_CONVERT, + (xfs_dir2_data_off_t)((char *)newdup2 - (char *)d)); + xfs_dir2_data_log_unused(tp, bp, newdup2); + /* + * If the old entry was in the table, we need to scan + * if the 3rd entry was valid, since these entries + * are smaller than the old one. + * If we don't need to scan that means there were 1 or 2 + * entries in the table, and removing the old and adding + * the 2 new will work. + */ + if (dfp) { + needscan = d->hdr.bestfree[2].length; + if (!needscan) { + xfs_dir2_data_freeremove(d, dfp, needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup, + needlogp); + (void)xfs_dir2_data_freeinsert(d, newdup2, + needlogp); + } + } + } + *needscanp = needscan; +} diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h new file mode 100644 index 00000000000..3f02294ccff --- /dev/null +++ b/fs/xfs/xfs_dir2_data.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_DATA_H__ +#define __XFS_DIR2_DATA_H__ + +/* + * Directory format 2, data block structures. + */ + +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_inode; +struct xfs_trans; + +/* + * Constants. + */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */ +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32gb. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_DATA_FIRSTDB(mp) \ + XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET) + +/* + * Offsets of . and .. in data space (always block 0) + */ +#define XFS_DIR2_DATA_DOT_OFFSET \ + ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) +#define XFS_DIR2_DATA_DOTDOT_OFFSET \ + (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1)) +#define XFS_DIR2_DATA_FIRST_OFFSET \ + (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2)) + +/* + * Structures. + */ + +/* + * Describe a free area in the data block. + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + xfs_dir2_data_off_t offset; /* start of freespace */ + xfs_dir2_data_off_t length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * Always at the beginning of a directory-sized block. + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __uint32_t magic; /* XFS_DIR2_DATA_MAGIC */ + /* or XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * Active entry in a data block. Aligned to 8 bytes. + * Tag appears as the last 2 bytes. + */ +typedef struct xfs_dir2_data_entry { + xfs_ino_t inumber; /* inode number */ + __uint8_t namelen; /* name length */ + __uint8_t name[1]; /* name bytes, no null */ + /* variable offset */ + xfs_dir2_data_off_t tag; /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. Aligned to 8 bytes. + * Tag appears as the last 2 bytes. + */ +typedef struct xfs_dir2_data_unused { + __uint16_t freetag; /* XFS_DIR2_DATA_FREE_TAG */ + xfs_dir2_data_off_t length; /* total free length */ + /* variable offset */ + xfs_dir2_data_off_t tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +typedef union { + xfs_dir2_data_entry_t entry; + xfs_dir2_data_unused_t unused; +} xfs_dir2_data_union_t; + +/* + * Generic data block structure, for xfs_db. + */ +typedef struct xfs_dir2_data { + xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */ + xfs_dir2_data_union_t u[1]; +} xfs_dir2_data_t; + +/* + * Macros. + */ + +/* + * Size of a data entry. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTSIZE) +int xfs_dir2_data_entsize(int n); +#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n) +#else +#define XFS_DIR2_DATA_ENTSIZE(n) \ + ((int)(roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ + (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN))) +#endif + +/* + * Pointer to an entry's tag word. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P) +xfs_dir2_data_off_t *xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep); +#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep) +#else +#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) \ + ((xfs_dir2_data_off_t *)\ + ((char *)(dep) + XFS_DIR2_DATA_ENTSIZE((dep)->namelen) - \ + (uint)sizeof(xfs_dir2_data_off_t))) +#endif + +/* + * Pointer to a freespace's tag word. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P) +xfs_dir2_data_off_t *xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup); +#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \ + xfs_dir2_data_unused_tag_p(dup) +#else +#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \ + ((xfs_dir2_data_off_t *)\ + ((char *)(dup) + INT_GET((dup)->length, ARCH_CONVERT) \ + - (uint)sizeof(xfs_dir2_data_off_t))) +#endif + +/* + * Function declarations. + */ + +#ifdef DEBUG +extern void + xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); +#else +#define xfs_dir2_data_check(dp,bp) +#endif + +extern xfs_dir2_data_free_t * + xfs_dir2_data_freefind(xfs_dir2_data_t *d, + xfs_dir2_data_unused_t *dup); + +extern xfs_dir2_data_free_t * + xfs_dir2_data_freeinsert(xfs_dir2_data_t *d, + xfs_dir2_data_unused_t *dup, int *loghead); + +extern void + xfs_dir2_data_freeremove(xfs_dir2_data_t *d, + xfs_dir2_data_free_t *dfp, int *loghead); + +extern void + xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, + int *loghead, char *aendp); + +extern int + xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_dabuf **bpp); + +extern void + xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_entry_t *dep); + +extern void + xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp); + +extern void + xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_unused_t *dup); + +extern void + xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, + int *needscanp); + +extern void + xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, + xfs_dir2_data_unused_t *dup, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, + int *needscanp); + +#endif /* __XFS_DIR2_DATA_H__ */ diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c new file mode 100644 index 00000000000..262d1e86df3 --- /dev/null +++ b/fs/xfs/xfs_dir2_leaf.c @@ -0,0 +1,1896 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_leaf.c + * XFS directory version 2 implementation - single leaf form + * see xfs_dir2_leaf.h for data structures. + * These directories have multiple XFS_DIR2_DATA blocks and one + * XFS_DIR2_LEAF1 block containing the hash table and freespace map. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_dir2_trace.h" +#include "xfs_error.h" +#include "xfs_bit.h" + +/* + * Local function declarations. + */ +#ifdef DEBUG +static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#else +#define xfs_dir2_leaf_check(dp, bp) +#endif +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, + int *indexp, xfs_dabuf_t **dbpp); + +/* + * Convert a block form directory to a leaf form directory. + */ +int /* error */ +xfs_dir2_block_to_leaf( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *dbp) /* input block's buffer */ +{ + xfs_dir2_data_off_t *bestsp; /* leaf's bestsp entries */ + xfs_dablk_t blkno; /* leaf block's bno */ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */ + xfs_dir2_block_tail_t *btp; /* block's tail */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dabuf_t *lbp; /* leaf block's buffer */ + xfs_dir2_db_t ldb; /* leaf block's bno */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log block header */ + int needscan; /* need to rescan bestfree */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_b("block_to_leaf", args, dbp); + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Add the leaf block to the inode. + * This interface will only put blocks in the leaf/node range. + * Since that's empty now, we'll get the root (block 0 in range). + */ + if ((error = xfs_da_grow_inode(args, &blkno))) { + return error; + } + ldb = XFS_DIR2_DA_TO_DB(mp, blkno); + ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + /* + * Initialize the leaf block, get a buffer for it. + */ + if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + return error; + } + ASSERT(lbp != NULL); + leaf = lbp->data; + block = dbp->data; + xfs_dir2_data_check(dp, dbp); + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + /* + * Set the counts in the leaf header. + */ + INT_COPY(leaf->hdr.count, btp->count, ARCH_CONVERT); /* INT_: type change */ + INT_COPY(leaf->hdr.stale, btp->stale, ARCH_CONVERT); /* INT_: type change */ + /* + * Could compact these but I think we always do the conversion + * after squeezing out stale entries. + */ + memcpy(leaf->ents, blp, INT_GET(btp->count, ARCH_CONVERT) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, lbp, 0, INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1); + needscan = 0; + needlog = 1; + /* + * Make the space formerly occupied by the leaf entries and block + * tail be free. + */ + xfs_dir2_data_make_free(tp, dbp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), + (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize - + (char *)blp), + &needlog, &needscan); + /* + * Fix up the block header, make it a data block. + */ + INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC); + if (needscan) + xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, + NULL); + /* + * Set up leaf tail and bests table. + */ + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + INT_SET(ltp->bestcount, ARCH_CONVERT, 1); + bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + INT_COPY(bestsp[0], block->hdr.bestfree[0].length, ARCH_CONVERT); + /* + * Log the data header and leaf bests table. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_leaf_check(dp, lbp); + xfs_dir2_data_check(dp, dbp); + xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_da_buf_done(lbp); + return 0; +} + +/* + * Add an entry to a leaf form directory. + */ +int /* error */ +xfs_dir2_leaf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_data_off_t *bestsp; /* freespace table in leaf */ + int compact; /* need to compact leaves */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry */ + int error; /* error return value */ + int grown; /* allocated new data block */ + int highstale; /* index of next stale leaf */ + int i; /* temporary, index */ + int index; /* leaf table position */ + xfs_dabuf_t *lbp; /* leaf's buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + int lfloglow; /* low leaf logging index */ + int lfloghigh; /* high leaf logging index */ + int lowstale; /* index of prev stale leaf */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + xfs_mount_t *mp; /* filesystem mount point */ + int needbytes; /* leaf block bytes needed */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data free */ + xfs_dir2_data_off_t *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t use_block; /* data block number */ + + xfs_dir2_trace_args("leaf_addname", args); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the leaf block. + */ + error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(lbp != NULL); + /* + * Look up the entry by hash value and name. + * We know it's not there, our caller has already done a lookup. + * So the index is of the entry to insert in front of. + * But if there are dup hash values the index is of the first of those. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + leaf = lbp->data; + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + /* + * See if there are any entries with the same hash value + * and space in their block for the new entry. + * This is good because it puts multiple same-hash value entries + * in a data block, improving the lookup of those entries. + */ + for (use_block = -1, lep = &leaf->ents[index]; + index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval; + index++, lep++) { + if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + continue; + i = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT)); + ASSERT(i < INT_GET(ltp->bestcount, ARCH_CONVERT)); + ASSERT(INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF); + if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) { + use_block = i; + break; + } + } + /* + * Didn't find a block yet, linear search all the data blocks. + */ + if (use_block == -1) { + for (i = 0; i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++) { + /* + * Remember a block we see that's missing. + */ + if (INT_GET(bestsp[i], ARCH_CONVERT) == NULLDATAOFF && use_block == -1) + use_block = i; + else if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) { + use_block = i; + break; + } + } + } + /* + * How many bytes do we need in the leaf block? + */ + needbytes = + (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) + + (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0])); + /* + * Now kill use_block if it refers to a missing block, so we + * can use it as an indication of allocation needed. + */ + if (use_block != -1 && INT_GET(bestsp[use_block], ARCH_CONVERT) == NULLDATAOFF) + use_block = -1; + /* + * If we don't have enough free bytes but we can make enough + * by compacting out stale entries, we'll do that. + */ + if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < needbytes && + INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1) { + compact = 1; + } + /* + * Otherwise if we don't have enough free bytes we need to + * convert to node form. + */ + else if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < + needbytes) { + /* + * Just checking or no space reservation, give up. + */ + if (args->justcheck || args->total == 0) { + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOSPC); + } + /* + * Convert to node form. + */ + error = xfs_dir2_leaf_to_node(args, lbp); + xfs_da_buf_done(lbp); + if (error) + return error; + /* + * Then add the new entry. + */ + return xfs_dir2_node_addname(args); + } + /* + * Otherwise it will fit without compaction. + */ + else + compact = 0; + /* + * If just checking, then it will fit unless we needed to allocate + * a new data block. + */ + if (args->justcheck) { + xfs_da_brelse(tp, lbp); + return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; + } + /* + * If no allocations are allowed, return now before we've + * changed anything. + */ + if (args->total == 0 && use_block == -1) { + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOSPC); + } + /* + * Need to compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and we'll shift that one to our insertion + * point later. + */ + if (compact) { + xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, + &lfloglow, &lfloghigh); + } + /* + * There are stale entries, so we'll need log-low and log-high + * impossibly bad values later. + */ + else if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) { + lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT); + lfloghigh = -1; + } + /* + * If there was no data block space found, we need to allocate + * a new one. + */ + if (use_block == -1) { + /* + * Add the new data block. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, + &use_block))) { + xfs_da_brelse(tp, lbp); + return error; + } + /* + * Initialize the block. + */ + if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { + xfs_da_brelse(tp, lbp); + return error; + } + /* + * If we're adding a new data block on the end we need to + * extend the bests table. Copy it up one entry. + */ + if (use_block >= INT_GET(ltp->bestcount, ARCH_CONVERT)) { + bestsp--; + memmove(&bestsp[0], &bestsp[1], + INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(bestsp[0])); + INT_MOD(ltp->bestcount, ARCH_CONVERT, +1); + xfs_dir2_leaf_log_tail(tp, lbp); + xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1); + } + /* + * If we're filling in a previously empty block just log it. + */ + else + xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + data = dbp->data; + INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT); + grown = 1; + } + /* + * Already had space in some data block. + * Just read that one in. + */ + else { + if ((error = + xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block), + -1, &dbp, XFS_DATA_FORK))) { + xfs_da_brelse(tp, lbp); + return error; + } + data = dbp->data; + grown = 0; + } + xfs_dir2_data_check(dp, dbp); + /* + * Point to the biggest freespace in our data block. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT)); + ASSERT(INT_GET(dup->length, ARCH_CONVERT) >= length); + needscan = needlog = 0; + /* + * Mark the initial part of our freespace in use for the new entry. + */ + xfs_dir2_data_use_free(tp, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + &needlog, &needscan); + /* + * Initialize our new entry (at last). + */ + dep = (xfs_dir2_data_entry_t *)dup; + INT_SET(dep->inumber, ARCH_CONVERT, args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data)); + /* + * Need to scan fix up the bestfree table. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog, NULL); + /* + * Need to log the data block's header. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_entry(tp, dbp, dep); + /* + * If the bests table needs to be changed, do it. + * Log the change unless we've already done that. + */ + if (INT_GET(bestsp[use_block], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) { + INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT); + if (!grown) + xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + } + /* + * Now we need to make room to insert the leaf entry. + * If there are no stale entries, we just insert a hole at index. + */ + if (!leaf->hdr.stale) { + /* + * lep is still good as the index leaf entry. + */ + if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + memmove(lep + 1, lep, + (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep)); + /* + * Record low and high logging indices for the leaf. + */ + lfloglow = index; + lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT); + INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1); + } + /* + * There are stale entries. + * We will use one of them for the new entry. + * It's probably not at the right location, so we'll have to + * shift some up or down first. + */ + else { + /* + * If we didn't compact before, we need to find the nearest + * stale entries before and after our insertion point. + */ + if (compact == 0) { + /* + * Find the first stale entry before the insertion + * point, if any. + */ + for (lowstale = index - 1; + lowstale >= 0 && + INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != + XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find the next stale entry at or after the insertion + * point, if any. Stop if we go so far that the + * lowstale entry would be better. + */ + for (highstale = index; + highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) && + INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != + XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || + index - lowstale - 1 >= highstale - index); + highstale++) + continue; + } + /* + * If the low one is better, use it. + */ + if (lowstale >= 0 && + (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) || + index - lowstale - 1 < highstale - index)) { + ASSERT(index - lowstale - 1 >= 0); + ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) == + XFS_DIR2_NULL_DATAPTR); + /* + * Copy entries up to cover the stale entry + * and make room for the new entry. + */ + if (index - lowstale - 1 > 0) + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * sizeof(*lep)); + lep = &leaf->ents[index - 1]; + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(index - 1, lfloghigh); + } + /* + * The high one is better, so use that one. + */ + else { + ASSERT(highstale - index >= 0); + ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) == + XFS_DIR2_NULL_DATAPTR); + /* + * Copy entries down to copver the stale entry + * and make room for the new entry. + */ + if (highstale - index > 0) + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(*lep)); + lep = &leaf->ents[index]; + lfloglow = MIN(index, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1); + } + /* + * Fill in the new leaf entry. + */ + INT_SET(lep->hashval, ARCH_CONVERT, args->hashval); + INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, INT_GET(*tagp, ARCH_CONVERT))); + /* + * Log the leaf fields and give up the buffers. + */ + xfs_dir2_leaf_log_header(tp, lbp); + xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); + xfs_dir2_leaf_check(dp, lbp); + xfs_da_buf_done(lbp); + xfs_dir2_data_check(dp, dbp); + xfs_da_buf_done(dbp); + return 0; +} + +#ifdef DEBUG +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ +void +xfs_dir2_leaf_check( + xfs_inode_t *dp, /* incore directory inode */ + xfs_dabuf_t *bp) /* leaf's buffer */ +{ + int i; /* leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + xfs_mount_t *mp; /* filesystem mount point */ + int stale; /* count of stale leaves */ + + leaf = bp->data; + mp = dp->i_mount; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC); + /* + * This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + /* + * Leaves and bests don't overlap. + */ + ASSERT((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <= + (char *)XFS_DIR2_LEAF_BESTS_P(ltp)); + /* + * Check hash value order, count stale entries. + */ + for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) { + if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <= + INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT)); + if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale); +} +#endif /* DEBUG */ + +/* + * Compact out any stale entries in the leaf. + * Log the header and changed leaf entries, if any. + */ +void +xfs_dir2_leaf_compact( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + int from; /* source leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int loglow; /* first leaf entry to log */ + int to; /* target leaf index */ + + leaf = bp->data; + if (!leaf->hdr.stale) { + return; + } + /* + * Compress out the stale entries in place. + */ + for (from = to = 0, loglow = -1; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) { + if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Only actually copy the entries that are different. + */ + if (from > to) { + if (loglow == -1) + loglow = to; + leaf->ents[to] = leaf->ents[from]; + } + to++; + } + /* + * Update and log the header, log the leaf entries. + */ + ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == from - to); + INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(INT_GET(leaf->hdr.stale, ARCH_CONVERT))); + leaf->hdr.stale = 0; + xfs_dir2_leaf_log_header(args->trans, bp); + if (loglow != -1) + xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); +} + +/* + * Compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and the caller will shift that one to our insertion + * point later. + * Return new insertion index, where the remaining stale entry is, + * and leaf logging indices. + */ +void +xfs_dir2_leaf_compact_x1( + xfs_dabuf_t *bp, /* leaf buffer */ + int *indexp, /* insertion index */ + int *lowstalep, /* out: stale entry before us */ + int *highstalep, /* out: stale entry after us */ + int *lowlogp, /* out: low log index */ + int *highlogp) /* out: high log index */ +{ + int from; /* source copy index */ + int highstale; /* stale entry at/after index */ + int index; /* insertion index */ + int keepstale; /* source index of kept stale */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int lowstale; /* stale entry before index */ + int newindex=0; /* new insertion index */ + int to; /* destination copy index */ + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1); + index = *indexp; + /* + * Find the first stale entry before our index, if any. + */ + for (lowstale = index - 1; + lowstale >= 0 && + INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find the first stale entry at or after our index, if any. + * Stop if the answer would be worse than lowstale. + */ + for (highstale = index; + highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) && + INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || index - lowstale > highstale - index); + highstale++) + continue; + /* + * Pick the better of lowstale and highstale. + */ + if (lowstale >= 0 && + (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) || + index - lowstale <= highstale - index)) + keepstale = lowstale; + else + keepstale = highstale; + /* + * Copy the entries in place, removing all the stale entries + * except keepstale. + */ + for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) { + /* + * Notice the new value of index. + */ + if (index == from) + newindex = to; + if (from != keepstale && + INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) { + if (from == to) + *lowlogp = to; + continue; + } + /* + * Record the new keepstale value for the insertion. + */ + if (from == keepstale) + lowstale = highstale = to; + /* + * Copy only the entries that have moved. + */ + if (from > to) + leaf->ents[to] = leaf->ents[from]; + to++; + } + ASSERT(from > to); + /* + * If the insertion point was past the last entry, + * set the new insertion point accordingly. + */ + if (index == from) + newindex = to; + *indexp = newindex; + /* + * Adjust the leaf header values. + */ + INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(from - to)); + INT_SET(leaf->hdr.stale, ARCH_CONVERT, 1); + /* + * Remember the low/high stale value only in the "right" + * direction. + */ + if (lowstale >= newindex) + lowstale = -1; + else + highstale = INT_GET(leaf->hdr.count, ARCH_CONVERT); + *highlogp = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1; + *lowstalep = lowstale; + *highstalep = highstale; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +int /* error */ +xfs_dir2_leaf_getdents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *dp, /* incore directory inode */ + uio_t *uio, /* I/O control & vectors */ + int *eofp, /* out: reached end of dir */ + xfs_dirent_t *dbp, /* caller's buffer */ + xfs_dir2_put_t put) /* ABI formatting routine */ +{ + xfs_dabuf_t *bp; /* data block buffer */ + int byteoff; /* offset in current block */ + xfs_dir2_db_t curdb; /* db for current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int eof; /* reached end of directory */ + int error=0; /* error return value */ + int i; /* temporary loop index */ + int j; /* temporary loop index */ + int length; /* temporary length value */ + xfs_bmbt_irec_t *map; /* map vector for blocks */ + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_put_args_t p; /* formatting arg bundle */ + char *ptr=NULL; /* pointer to current data */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + + /* + * If the offset is at or past the largest allowed value, + * give up right away, return eof. + */ + if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) { + *eofp = 1; + return 0; + } + mp = dp->i_mount; + /* + * Setup formatting arguments. + */ + p.dbp = dbp; + p.put = put; + p.uio = uio; + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + map_size = + howmany(uio->uio_resid + mp->m_dirblksize, + mp->m_sb.sb_blocksize); + map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); + map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; + bp = NULL; + eof = 1; + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset); + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff)); + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + if (bp) { + xfs_da_brelse(tp, bp); + bp = NULL; + map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = MIN((int)map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * + map_valid); + i -= j; + } + } + /* + * Recalculate the readahead blocks wanted. + */ + ra_want = howmany(uio->uio_resid + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + ra_want > map_blocks && + map_off < + XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + nmap = map_size - map_valid; + error = xfs_bmapi(tp, dp, + map_off, + XFS_DIR2_BYTE_TO_DA(mp, + XFS_DIR2_LEAF_OFFSET) - map_off, + XFS_BMAPI_METADATA, NULL, 0, + &map[map_valid], &nmap, NULL); + /* + * Don't know if we should ignore this or + * try to return an error. + * The trouble with returning errors + * is that readdir will just stop without + * actually passing the error through. + */ + if (error) + break; /* XXX */ + /* + * If we got all the mappings we asked for, + * set the final map offset based on the + * last bmap value received. + * Otherwise, we've reached the end. + */ + if (nmap == map_size - map_valid) + map_off = + map[map_valid + nmap - 1].br_startoff + + map[map_valid + nmap - 1].br_blockcount; + else + map_off = + XFS_DIR2_BYTE_TO_DA(mp, + XFS_DIR2_LEAF_OFFSET); + /* + * Look for holes in the mapping, and + * eliminate them. Count up the valid blocks. + */ + for (i = map_valid; i < map_valid + nmap; ) { + if (map[i].br_startblock == + HOLESTARTBLOCK) { + nmap--; + length = map_valid + nmap - i; + if (length) + memmove(&map[i], + &map[i + 1], + sizeof(map[i]) * + length); + } else { + map_blocks += + map[i].br_blockcount; + i++; + } + } + map_valid += nmap; + } + /* + * No valid mappings, so no more data blocks. + */ + if (!map_valid) { + curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off); + break; + } + /* + * Read the directory block starting at the first + * mapping. + */ + curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff); + error = xfs_da_read_buf(tp, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : + -1, + &bp, XFS_DATA_FORK); + /* + * Should just skip over the data block instead + * of giving up. + */ + if (error) + break; /* XXX */ + /* + * Adjust the current amount of read-ahead: we just + * read a block that was previously ra. + */ + if (ra_current) + ra_current -= mp->m_dirblkfsbs; + /* + * Do we need more readahead? + */ + for (ra_index = ra_offset = i = 0; + ra_want > ra_current && i < map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(ra_index < map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > ra_current && + map[ra_index].br_blockcount >= + mp->m_dirblkfsbs) { + xfs_baread(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, + map[ra_index].br_startblock + + ra_offset), + (int)BTOBB(mp->m_dirblksize)); + ra_current = i; + } + /* + * Read-ahead a non-contiguous directory block. + * This doesn't use our mapping, but this + * is a very rare case. + */ + else if (i > ra_current) { + (void)xfs_da_reada_buf(tp, dp, + map[ra_index].br_startoff + + ra_offset, XFS_DATA_FORK); + ra_current = i; + } + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not + * more than a dir block. + */ + length = MIN(mp->m_dirblkfsbs, + (int)(map[ra_index].br_blockcount - + ra_offset)); + j += length; + ra_offset += length; + /* + * Advance to the next mapping if + * this one is used up. + */ + if (ra_offset == + map[ra_index].br_blockcount) { + ra_offset = 0; + ra_index++; + } + } + } + /* + * Having done a read, we need to set a new offset. + */ + newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) == + curdb); + data = bp->data; + xfs_dir2_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)&data->u; + byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += (uint)sizeof(data->hdr); + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)data < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (INT_GET(dup->freetag, ARCH_CONVERT) + == XFS_DIR2_DATA_FREE_TAG) { + + length = INT_GET(dup->length, + ARCH_CONVERT); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + XFS_DIR2_DB_OFF_TO_BYTE(mp, + XFS_DIR2_BYTE_TO_DB(mp, curoff), + (char *)ptr - (char *)data); + if (ptr >= (char *)data + mp->m_dirblksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (INT_GET(dup->freetag, ARCH_CONVERT) + == XFS_DIR2_DATA_FREE_TAG) { + length = INT_GET(dup->length, ARCH_CONVERT); + ptr += length; + curoff += length; + continue; + } + + /* + * Copy the entry into the putargs, and try formatting it. + */ + dep = (xfs_dir2_data_entry_t *)ptr; + + p.namelen = dep->namelen; + + length = XFS_DIR2_DATA_ENTSIZE(p.namelen); + + p.cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length); + + p.ino = INT_GET(dep->inumber, ARCH_CONVERT); +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = (char *)dep->name; + + error = p.put(&p); + + /* + * Won't fit. Return to caller. + */ + if (!p.done) { + eof = 0; + break; + } + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + } + + /* + * All done. Set output offset value to current offset. + */ + *eofp = eof; + if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR)) + uio->uio_offset = XFS_DIR2_MAX_DATAPTR; + else + uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff); + kmem_free(map, map_size * sizeof(*map)); + if (bp) + xfs_da_brelse(tp, bp); + return error; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +int +xfs_dir2_leaf_init( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_db_t bno, /* directory block number */ + xfs_dabuf_t **bpp, /* out: leaf buffer */ + int magic) /* magic number for block */ +{ + xfs_dabuf_t *bp; /* leaf buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + ASSERT(dp != NULL); + tp = args->trans; + mp = dp->i_mount; + ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && + bno < XFS_DIR2_FREE_FIRSTDB(mp)); + /* + * Get the buffer for the block. + */ + error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp, + XFS_DATA_FORK); + if (error) { + return error; + } + ASSERT(bp != NULL); + leaf = bp->data; + /* + * Initialize the header. + */ + INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, magic); + leaf->hdr.info.forw = 0; + leaf->hdr.info.back = 0; + leaf->hdr.count = 0; + leaf->hdr.stale = 0; + xfs_dir2_leaf_log_header(tp, bp); + /* + * If it's a leaf-format directory initialize the tail. + * In this case our caller has the real bests table to copy into + * the block. + */ + if (magic == XFS_DIR2_LEAF1_MAGIC) { + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp->bestcount = 0; + xfs_dir2_leaf_log_tail(tp, bp); + } + *bpp = bp; + return 0; +} + +/* + * Log the bests entries indicated from a leaf1 block. + */ +void +xfs_dir2_leaf_log_bests( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_data_off_t *firstb; /* pointer to first entry */ + xfs_dir2_data_off_t *lastb; /* pointer to last entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC); + ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf); + firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first; + lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last; + xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); +} + +/* + * Log the leaf entries indicated from a leaf1 or leafn block. + */ +void +xfs_dir2_leaf_log_ents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ + xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC || + INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + firstlep = &leaf->ents[first]; + lastlep = &leaf->ents[last]; + xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); +} + +/* + * Log the header of the leaf1 or leafn block. + */ +void +xfs_dir2_leaf_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC || + INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), + (uint)(sizeof(leaf->hdr) - 1)); +} + +/* + * Log the tail of the leaf1 block. + */ +void +xfs_dir2_leaf_log_tail( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + + mp = tp->t_mountp; + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC); + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(mp->m_dirblksize - 1)); +} + +/* + * Look up the entry referred to by args in the leaf format directory. + * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which + * is also used by the node-format code. + */ +int +xfs_dir2_leaf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* found entry index */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args("leaf_lookup", args); + /* + * Look up name in the leaf block, returning both buffers and index. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + tp = args->trans; + dp = args->dp; + xfs_dir2_leaf_check(dp, lbp); + leaf = lbp->data; + /* + * Get to the leaf entry and contained data entry address. + */ + lep = &leaf->ents[index]; + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT))); + /* + * Return the found inode number. + */ + args->inumber = INT_GET(dep->inumber, ARCH_CONVERT); + xfs_da_brelse(tp, dbp); + xfs_da_brelse(tp, lbp); + return XFS_ERROR(EEXIST); +} + +/* + * Look up name/hash in the leaf block. + * Fill in indexp with the found index, and dbpp with the data buffer. + * If not found dbpp will be NULL, and ENOENT comes back. + * lbpp will always be filled in with the leaf buffer unless there's an error. + */ +static int /* error */ +xfs_dir2_leaf_lookup_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t **lbpp, /* out: leaf buffer */ + int *indexp, /* out: index in leaf block */ + xfs_dabuf_t **dbpp) /* out: data buffer */ +{ + xfs_dir2_db_t curdb; /* current data block number */ + xfs_dabuf_t *dbp; /* data buffer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index in leaf block */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + /* + * Read the leaf block into the buffer. + */ + if ((error = + xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, + XFS_DATA_FORK))) { + return error; + } + *lbpp = lbp; + leaf = lbp->data; + xfs_dir2_leaf_check(dp, lbp); + /* + * Look for the first leaf entry with our hash value. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + /* + * Loop over all the entries with the right hash value + * looking to match the name. + */ + for (lep = &leaf->ents[index], dbp = NULL, curdb = -1; + index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval; + lep++, index++) { + /* + * Skip over stale leaf entries. + */ + if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get the new data block number. + */ + newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT)); + /* + * If it's not the same as the old data block number, + * need to pitch the old one and read the new one. + */ + if (newdb != curdb) { + if (dbp) + xfs_da_brelse(tp, dbp); + if ((error = + xfs_da_read_buf(tp, dp, + XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp, + XFS_DATA_FORK))) { + xfs_da_brelse(tp, lbp); + return error; + } + xfs_dir2_data_check(dp, dbp); + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT))); + /* + * If it matches then return it. + */ + if (dep->namelen == args->namelen && + dep->name[0] == args->name[0] && + memcmp(dep->name, args->name, args->namelen) == 0) { + *dbpp = dbp; + *indexp = index; + return 0; + } + } + /* + * No match found, return ENOENT. + */ + ASSERT(args->oknoent); + if (dbp) + xfs_da_brelse(tp, dbp); + xfs_da_brelse(tp, lbp); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a leaf format directory. + */ +int /* error */ +xfs_dir2_leaf_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dir2_data_off_t *bestsp; /* leaf block best freespace */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t db; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry structure */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_db_t i; /* temporary data block # */ + int index; /* index into leaf entries */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_dir2_data_off_t oldbest; /* old value of best free */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args("leaf_removename", args); + /* + * Lookup the leaf entry, get the leaf and data blocks read in. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = lbp->data; + data = dbp->data; + xfs_dir2_data_check(dp, dbp); + /* + * Point to the leaf entry, use that to point to the data entry. + */ + lep = &leaf->ents[index]; + db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT)); + dep = (xfs_dir2_data_entry_t *) + ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT))); + needscan = needlog = 0; + oldbest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT); + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + ASSERT(INT_GET(bestsp[db], ARCH_CONVERT) == oldbest); + /* + * Mark the former data entry unused. + */ + xfs_dir2_data_make_free(tp, dbp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)data), + XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + /* + * We just mark the leaf entry stale by putting a null in it. + */ + INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1); + xfs_dir2_leaf_log_header(tp, lbp); + INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR); + xfs_dir2_leaf_log_ents(tp, lbp, index, index); + /* + * Scan the freespace in the data block again if necessary, + * log the data block header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog, NULL); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * If the longest freespace in the data block has changed, + * put the new value in the bests table and log that. + */ + if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) != oldbest) { + INT_COPY(bestsp[db], data->hdr.bestfree[0].length, ARCH_CONVERT); + xfs_dir2_leaf_log_bests(tp, lbp, db, db); + } + xfs_dir2_data_check(dp, dbp); + /* + * If the data block is now empty then get rid of the data block. + */ + if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) == + mp->m_dirblksize - (uint)sizeof(data->hdr)) { + ASSERT(db != mp->m_dirdatablk); + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + /* + * Nope, can't get rid of it because it caused + * allocation of a bmap btree block to do so. + * Just go on, returning success, leaving the + * empty block in place. + */ + if (error == ENOSPC && args->total == 0) { + xfs_da_buf_done(dbp); + error = 0; + } + xfs_dir2_leaf_check(dp, lbp); + xfs_da_buf_done(lbp); + return error; + } + dbp = NULL; + /* + * If this is the last data block then compact the + * bests table by getting rid of entries. + */ + if (db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1) { + /* + * Look for the last active entry (i). + */ + for (i = db - 1; i > 0; i--) { + if (INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF) + break; + } + /* + * Copy the table down so inactive entries at the + * end are removed. + */ + memmove(&bestsp[db - i], bestsp, + (INT_GET(ltp->bestcount, ARCH_CONVERT) - (db - i)) * sizeof(*bestsp)); + INT_MOD(ltp->bestcount, ARCH_CONVERT, -(db - i)); + xfs_dir2_leaf_log_tail(tp, lbp); + xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1); + } else + INT_SET(bestsp[db], ARCH_CONVERT, NULLDATAOFF); + } + /* + * If the data block was not the first one, drop it. + */ + else if (db != mp->m_dirdatablk && dbp != NULL) { + xfs_da_buf_done(dbp); + dbp = NULL; + } + xfs_dir2_leaf_check(dp, lbp); + /* + * See if we can convert to block form. + */ + return xfs_dir2_leaf_to_block(args, lbp, dbp); +} + +/* + * Replace the inode number in a leaf format directory entry. + */ +int /* error */ +xfs_dir2_leaf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index of leaf entry */ + xfs_dabuf_t *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args("leaf_replace", args); + /* + * Look up the entry. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + leaf = lbp->data; + /* + * Point to the leaf entry, get data address from it. + */ + lep = &leaf->ents[index]; + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->data + + XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT))); + ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT)); + /* + * Put the new inode number in, log it. + */ + INT_SET(dep->inumber, ARCH_CONVERT, args->inumber); + tp = args->trans; + xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_da_buf_done(dbp); + xfs_dir2_leaf_check(dp, lbp); + xfs_da_brelse(tp, lbp); + return 0; +} + +/* + * Return index in the leaf block (lbp) which is either the first + * one with this hash value, or if there are none, the insert point + * for that hash value. + */ +int /* index value */ +xfs_dir2_leaf_search_hash( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp) /* leaf buffer */ +{ + xfs_dahash_t hash=0; /* hash from this entry */ + xfs_dahash_t hashwant; /* hash value looking for */ + int high; /* high leaf index */ + int low; /* low leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int mid=0; /* current leaf index */ + + leaf = lbp->data; +#ifndef __KERNEL__ + if (!leaf->hdr.count) + return 0; +#endif + /* + * Note, the table cannot be empty, so we have to go through the loop. + * Binary search the leaf entries looking for our hash value. + */ + for (lep = leaf->ents, low = 0, high = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1, + hashwant = args->hashval; + low <= high; ) { + mid = (low + high) >> 1; + if ((hash = INT_GET(lep[mid].hashval, ARCH_CONVERT)) == hashwant) + break; + if (hash < hashwant) + low = mid + 1; + else + high = mid - 1; + } + /* + * Found one, back up through all the equal hash values. + */ + if (hash == hashwant) { + while (mid > 0 && INT_GET(lep[mid - 1].hashval, ARCH_CONVERT) == hashwant) { + mid--; + } + } + /* + * Need to point to an entry higher than ours. + */ + else if (hash < hashwant) + mid++; + return mid; +} + +/* + * Trim off a trailing data block. We know it's empty since the leaf + * freespace table says so. + */ +int /* error */ +xfs_dir2_leaf_trim_data( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp, /* leaf buffer */ + xfs_dir2_db_t db) /* data block number */ +{ + xfs_dir2_data_off_t *bestsp; /* leaf bests table */ +#ifdef DEBUG + xfs_dir2_data_t *data; /* data block structure */ +#endif + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Read the offending data block. We need its buffer. + */ + if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp, + XFS_DATA_FORK))) { + return error; + } +#ifdef DEBUG + data = dbp->data; + ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC); +#endif + /* this seems to be an error + * data is only valid if DEBUG is defined? + * RMC 09/08/1999 + */ + + leaf = lbp->data; + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) == + mp->m_dirblksize - (uint)sizeof(data->hdr)); + ASSERT(db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1); + /* + * Get rid of the data block. + */ + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + ASSERT(error != ENOSPC); + xfs_da_brelse(tp, dbp); + return error; + } + /* + * Eliminate the last bests entry from the table. + */ + bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + INT_MOD(ltp->bestcount, ARCH_CONVERT, -1); + memmove(&bestsp[1], &bestsp[0], INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(*bestsp)); + xfs_dir2_leaf_log_tail(tp, lbp); + xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1); + return 0; +} + +/* + * Convert node form directory to leaf form directory. + * The root of the node form dir needs to already be a LEAFN block. + * Just return if we can't do anything. + */ +int /* error */ +xfs_dir2_node_to_leaf( + xfs_da_state_t *state) /* directory operation state */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dabuf_t *fbp; /* buffer for freespace block */ + xfs_fileoff_t fo; /* freespace file offset */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_dabuf_t *lbp; /* buffer for leaf block */ + xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int rval; /* successful free trim? */ + xfs_trans_t *tp; /* transaction pointer */ + + /* + * There's more than a leaf level in the btree, so there must + * be multiple leafn blocks. Give up. + */ + if (state->path.active > 1) + return 0; + args = state->args; + xfs_dir2_trace_args("node_to_leaf", args); + mp = state->mp; + dp = args->dp; + tp = args->trans; + /* + * Get the last offset in the file. + */ + if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) { + return error; + } + fo -= mp->m_dirblkfsbs; + /* + * If there are freespace blocks other than the first one, + * take this opportunity to remove trailing empty freespace blocks + * that may have been left behind during no-space-reservation + * operations. + */ + while (fo > mp->m_dirfreeblk) { + if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { + return error; + } + if (rval) + fo -= mp->m_dirblkfsbs; + else + return 0; + } + /* + * Now find the block just before the freespace block. + */ + if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) { + return error; + } + /* + * If it's not the single leaf block, give up. + */ + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) + return 0; + lbp = state->path.blk[0].bp; + leaf = lbp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + /* + * Read the freespace block. + */ + if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, + XFS_DATA_FORK))) { + return error; + } + free = fbp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + ASSERT(!free->hdr.firstdb); + /* + * Now see if the leafn and free data will fit in a leaf1. + * If not, release the buffer and give up. + */ + if ((uint)sizeof(leaf->hdr) + + (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)) * (uint)sizeof(leaf->ents[0]) + + INT_GET(free->hdr.nvalid, ARCH_CONVERT) * (uint)sizeof(leaf->bests[0]) + + (uint)sizeof(leaf->tail) > + mp->m_dirblksize) { + xfs_da_brelse(tp, fbp); + return 0; + } + /* + * If the leaf has any stale entries in it, compress them out. + * The compact routine will log the header. + */ + if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) + xfs_dir2_leaf_compact(args, lbp); + else + xfs_dir2_leaf_log_header(tp, lbp); + INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAF1_MAGIC); + /* + * Set up the leaf tail from the freespace block. + */ + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + INT_COPY(ltp->bestcount, free->hdr.nvalid, ARCH_CONVERT); + /* + * Set up the leaf bests table. + */ + memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests, + INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(leaf->bests[0])); + xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1); + xfs_dir2_leaf_log_tail(tp, lbp); + xfs_dir2_leaf_check(dp, lbp); + /* + * Get rid of the freespace block. + */ + error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + if (error) { + /* + * This can't fail here because it can only happen when + * punching out the middle of an extent, and this is an + * isolated block. + */ + ASSERT(error != ENOSPC); + return error; + } + fbp = NULL; + /* + * Now see if we can convert the single-leaf directory + * down to a block form directory. + * This routine always kills the dabuf for the leaf, so + * eliminate it from the path. + */ + error = xfs_dir2_leaf_to_block(args, lbp, NULL); + state->path.blk[0].bp = NULL; + return error; +} diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h new file mode 100644 index 00000000000..7f20eee56a5 --- /dev/null +++ b/fs/xfs/xfs_dir2_leaf.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_LEAF_H__ +#define __XFS_DIR2_LEAF_H__ + +/* + * Directory version 2, leaf block structures. + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Constants. + */ + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_LEAF_FIRSTDB(mp) \ + XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET) + +/* + * Types. + */ + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Structures. + */ + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __uint16_t count; /* count of entries */ + __uint16_t stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + xfs_dahash_t hashval; /* hash value of name */ + xfs_dir2_dataptr_t address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __uint32_t bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + * bests and tail are at the end of the block for single-leaf only + * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC). + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t ents[1]; /* entries */ + /* ... */ + xfs_dir2_data_off_t bests[1]; /* best free counts */ + xfs_dir2_leaf_tail_t tail; /* leaf tail */ +} xfs_dir2_leaf_t; + +/* + * Macros. + * The DB blocks are logical directory block numbers, not filesystem blocks. + */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_MAX_LEAF_ENTS) +int +xfs_dir2_max_leaf_ents(struct xfs_mount *mp); +#define XFS_DIR2_MAX_LEAF_ENTS(mp) \ + xfs_dir2_max_leaf_ents(mp) +#else +#define XFS_DIR2_MAX_LEAF_ENTS(mp) \ + ((int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / \ + (uint)sizeof(xfs_dir2_leaf_entry_t))) +#endif + +/* + * Get address of the bestcount field in the single-leaf block. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_TAIL_P) +xfs_dir2_leaf_tail_t * +xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp); +#define XFS_DIR2_LEAF_TAIL_P(mp,lp) \ + xfs_dir2_leaf_tail_p(mp, lp) +#else +#define XFS_DIR2_LEAF_TAIL_P(mp,lp) \ + ((xfs_dir2_leaf_tail_t *)\ + ((char *)(lp) + (mp)->m_dirblksize - \ + (uint)sizeof(xfs_dir2_leaf_tail_t))) +#endif + +/* + * Get address of the bests array in the single-leaf block. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_BESTS_P) +xfs_dir2_data_off_t * +xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp); +#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp) +#else +#define XFS_DIR2_LEAF_BESTS_P(ltp) \ + ((xfs_dir2_data_off_t *)(ltp) - INT_GET((ltp)->bestcount, ARCH_CONVERT)) +#endif + +/* + * Convert dataptr to byte in file space + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE) +xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp); +#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp) +#else +#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) \ + ((xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG) +#endif + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR) +xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by); +#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by) +#else +#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) \ + ((xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG)) +#endif + +/* + * Convert dataptr to a block number + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_DB) +xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp); +#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp) +#else +#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) \ + XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)) +#endif + +/* + * Convert dataptr to a byte offset in a block + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_OFF) +xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp); +#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp) +#else +#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) \ + XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)) +#endif + +/* + * Convert block and offset to byte in space + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE) +xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o); +#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \ + xfs_dir2_db_off_to_byte(mp, db, o) +#else +#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \ + (((xfs_dir2_off_t)(db) << \ + ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o)) +#endif + +/* + * Convert byte in space to (DB) block + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DB) +xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by); +#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by) +#else +#define XFS_DIR2_BYTE_TO_DB(mp,by) \ + ((xfs_dir2_db_t)((by) >> \ + ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog))) +#endif + +/* + * Convert byte in space to (DA) block + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DA) +xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by); +#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by) +#else +#define XFS_DIR2_BYTE_TO_DA(mp,by) \ + XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by)) +#endif + +/* + * Convert byte in space to offset in a block + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_OFF) +xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by); +#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by) +#else +#define XFS_DIR2_BYTE_TO_OFF(mp,by) \ + ((xfs_dir2_data_aoff_t)((by) & \ + ((1 << ((mp)->m_sb.sb_blocklog + \ + (mp)->m_sb.sb_dirblklog)) - 1))) +#endif + +/* + * Convert block and offset to dataptr + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR) +xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o); +#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \ + xfs_dir2_db_off_to_dataptr(mp, db, o) +#else +#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \ + XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o)) +#endif + +/* + * Convert block (DB) to block (dablk) + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_DA) +xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db); +#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db) +#else +#define XFS_DIR2_DB_TO_DA(mp,db) \ + ((xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog)) +#endif + +/* + * Convert block (dablk) to block (DB) + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_DB) +xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da); +#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da) +#else +#define XFS_DIR2_DA_TO_DB(mp,da) \ + ((xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog)) +#endif + +/* + * Convert block (dablk) to byte offset in space + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_BYTE) +xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da); +#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da) +#else +#define XFS_DIR2_DA_TO_BYTE(mp,da) \ + XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0) +#endif + +/* + * Function declarations. + */ + +extern int + xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_dabuf *dbp); + +extern int + xfs_dir2_leaf_addname(struct xfs_da_args *args); + +extern void + xfs_dir2_leaf_compact(struct xfs_da_args *args, struct xfs_dabuf *bp); + +extern void + xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, + int *lowstalep, int *highstalep, int *lowlogp, + int *highlogp); + +extern int + xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp, + struct uio *uio, int *eofp, struct xfs_dirent *dbp, + xfs_dir2_put_t put); + +extern int + xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_dabuf **bpp, int magic); + +extern void + xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); + +extern void + xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); + +extern void + xfs_dir2_leaf_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp); + +extern void + xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); + +extern int + xfs_dir2_leaf_lookup(struct xfs_da_args *args); + +extern int + xfs_dir2_leaf_removename(struct xfs_da_args *args); + +extern int + xfs_dir2_leaf_replace(struct xfs_da_args *args); + +extern int + xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_dabuf *lbp); +extern int + xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_dabuf *lbp, xfs_dir2_db_t db); + +extern int + xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +#endif /* __XFS_DIR2_LEAF_H__ */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c new file mode 100644 index 00000000000..a7615d86bfb --- /dev/null +++ b/fs/xfs/xfs_dir2_node.c @@ -0,0 +1,2020 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_node.c + * XFS directory implementation, version 2, node form files + * See data structures in xfs_dir2_node.h and xfs_da_btree.h. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_node.h" +#include "xfs_dir2_trace.h" +#include "xfs_error.h" + +/* + * Function declarations. + */ +static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); +static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); +#ifdef DEBUG +static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#else +#define xfs_dir2_leafn_check(dp, bp) +#endif +static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, + int start_s, xfs_dabuf_t *bp_d, int start_d, + int count); +static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, + int index, xfs_da_state_blk_t *dblk, + int *rval); +static int xfs_dir2_node_addname_int(xfs_da_args_t *args, + xfs_da_state_blk_t *fblk); + +/* + * Log entries from a freespace block. + */ +void +xfs_dir2_free_log_bests( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp, /* freespace buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + xfs_da_log_buf(tp, bp, + (uint)((char *)&free->bests[first] - (char *)free), + (uint)((char *)&free->bests[last] - (char *)free + + sizeof(free->bests[0]) - 1)); +} + +/* + * Log header from a freespace block. + */ +static void +xfs_dir2_free_log_header( + xfs_trans_t *tp, /* transaction pointer */ + xfs_dabuf_t *bp) /* freespace buffer */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), + (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); +} + +/* + * Convert a leaf-format directory to a node-format directory. + * We need to change the magic number of the leaf block, and copy + * the freespace table out of the leaf block into its own block. + */ +int /* error */ +xfs_dir2_leaf_to_node( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *lbp) /* leaf buffer */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dabuf_t *fbp; /* freespace buffer */ + xfs_dir2_db_t fdb; /* freespace block number */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_dir2_data_off_t *from; /* pointer to freespace entry */ + int i; /* leaf freespace index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int n; /* count of live freespc ents */ + xfs_dir2_data_off_t off; /* freespace entry value */ + xfs_dir2_data_off_t *to; /* pointer to freespace entry */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_b("leaf_to_node", args, lbp); + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Add a freespace block to the directory. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { + return error; + } + ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + /* + * Get the buffer for the new freespace block. + */ + if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp, + XFS_DATA_FORK))) { + return error; + } + ASSERT(fbp != NULL); + free = fbp->data; + leaf = lbp->data; + ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + /* + * Initialize the freespace block header. + */ + INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC); + free->hdr.firstdb = 0; + ASSERT(INT_GET(ltp->bestcount, ARCH_CONVERT) <= (uint)dp->i_d.di_size / mp->m_dirblksize); + INT_COPY(free->hdr.nvalid, ltp->bestcount, ARCH_CONVERT); + /* + * Copy freespace entries from the leaf block to the new block. + * Count active entries. + */ + for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests; + i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++, from++, to++) { + if ((off = INT_GET(*from, ARCH_CONVERT)) != NULLDATAOFF) + n++; + INT_SET(*to, ARCH_CONVERT, off); + } + INT_SET(free->hdr.nused, ARCH_CONVERT, n); + INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAFN_MAGIC); + /* + * Log everything. + */ + xfs_dir2_leaf_log_header(tp, lbp); + xfs_dir2_free_log_header(tp, fbp); + xfs_dir2_free_log_bests(tp, fbp, 0, INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1); + xfs_da_buf_done(fbp); + xfs_dir2_leafn_check(dp, lbp); + return 0; +} + +/* + * Add a leaf entry to a leaf block in a node-form directory. + * The other work necessary is done from the caller. + */ +static int /* error */ +xfs_dir2_leafn_add( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int index) /* insertion pt for new entry */ +{ + int compact; /* compacting stale leaves */ + xfs_inode_t *dp; /* incore directory inode */ + int highstale; /* next stale entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int lfloghigh; /* high leaf entry logging */ + int lfloglow; /* low leaf entry logging */ + int lowstale; /* previous stale entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_sb("leafn_add", args, index, bp); + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + leaf = bp->data; + + /* + * Quick check just to make sure we are not going to index + * into other peoples memory + */ + if (index < 0) + return XFS_ERROR(EFSCORRUPTED); + + /* + * If there are already the maximum number of leaf entries in + * the block, if there are no stale entries it won't fit. + * Caller will do a split. If there are stale entries we'll do + * a compact. + */ + + if (INT_GET(leaf->hdr.count, ARCH_CONVERT) == XFS_DIR2_MAX_LEAF_ENTS(mp)) { + if (!leaf->hdr.stale) + return XFS_ERROR(ENOSPC); + compact = INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1; + } else + compact = 0; + ASSERT(index == 0 || INT_GET(leaf->ents[index - 1].hashval, ARCH_CONVERT) <= args->hashval); + ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || + INT_GET(leaf->ents[index].hashval, ARCH_CONVERT) >= args->hashval); + + if (args->justcheck) + return 0; + + /* + * Compact out all but one stale leaf entry. Leaves behind + * the entry closest to index. + */ + if (compact) { + xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, + &lfloglow, &lfloghigh); + } + /* + * Set impossible logging indices for this case. + */ + else if (leaf->hdr.stale) { + lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT); + lfloghigh = -1; + } + /* + * No stale entries, just insert a space for the new entry. + */ + if (!leaf->hdr.stale) { + lep = &leaf->ents[index]; + if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT)) + memmove(lep + 1, lep, + (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep)); + lfloglow = index; + lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT); + INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1); + } + /* + * There are stale entries. We'll use one for the new entry. + */ + else { + /* + * If we didn't do a compact then we need to figure out + * which stale entry will be used. + */ + if (compact == 0) { + /* + * Find first stale entry before our insertion point. + */ + for (lowstale = index - 1; + lowstale >= 0 && + INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != + XFS_DIR2_NULL_DATAPTR; + lowstale--) + continue; + /* + * Find next stale entry after insertion point. + * Stop looking if the answer would be worse than + * lowstale already found. + */ + for (highstale = index; + highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) && + INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != + XFS_DIR2_NULL_DATAPTR && + (lowstale < 0 || + index - lowstale - 1 >= highstale - index); + highstale++) + continue; + } + /* + * Using the low stale entry. + * Shift entries up toward the stale slot. + */ + if (lowstale >= 0 && + (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) || + index - lowstale - 1 < highstale - index)) { + ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) == + XFS_DIR2_NULL_DATAPTR); + ASSERT(index - lowstale - 1 >= 0); + if (index - lowstale - 1 > 0) + memmove(&leaf->ents[lowstale], + &leaf->ents[lowstale + 1], + (index - lowstale - 1) * sizeof(*lep)); + lep = &leaf->ents[index - 1]; + lfloglow = MIN(lowstale, lfloglow); + lfloghigh = MAX(index - 1, lfloghigh); + } + /* + * Using the high stale entry. + * Shift entries down toward the stale slot. + */ + else { + ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) == + XFS_DIR2_NULL_DATAPTR); + ASSERT(highstale - index >= 0); + if (highstale - index > 0) + memmove(&leaf->ents[index + 1], + &leaf->ents[index], + (highstale - index) * sizeof(*lep)); + lep = &leaf->ents[index]; + lfloglow = MIN(index, lfloglow); + lfloghigh = MAX(highstale, lfloghigh); + } + INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1); + } + /* + * Insert the new entry, log everything. + */ + INT_SET(lep->hashval, ARCH_CONVERT, args->hashval); + INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, args->blkno, args->index)); + xfs_dir2_leaf_log_header(tp, bp); + xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); + xfs_dir2_leafn_check(dp, bp); + return 0; +} + +#ifdef DEBUG +/* + * Check internal consistency of a leafn block. + */ +void +xfs_dir2_leafn_check( + xfs_inode_t *dp, /* incore directory inode */ + xfs_dabuf_t *bp) /* leaf buffer */ +{ + int i; /* leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int stale; /* count of stale leaves */ + + leaf = bp->data; + mp = dp->i_mount; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); + for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) { + if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) { + ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <= + INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT)); + } + if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale); +} +#endif /* DEBUG */ + +/* + * Return the last hash value in the leaf. + * Stale entries are ok. + */ +xfs_dahash_t /* hash value */ +xfs_dir2_leafn_lasthash( + xfs_dabuf_t *bp, /* leaf buffer */ + int *count) /* count of entries in leaf */ +{ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + if (count) + *count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + if (!leaf->hdr.count) + return 0; + return INT_GET(leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + xfs_dabuf_t *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + xfs_dabuf_t *curbp; /* current data/free buffer */ + xfs_dir2_db_t curdb; /* current data block number */ + xfs_dir2_db_t curfdb; /* current free block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int fi; /* free entry index */ + xfs_dir2_free_t *free=NULL; /* free block structure */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length=0; /* length of new data entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_dir2_db_t newfdb; /* new free block number */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); +#ifdef __KERNEL__ + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) > 0); +#endif + xfs_dir2_leafn_check(dp, bp); + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) + curbp = state->extrablk.bp; + else + curbp = NULL; + /* + * For addname, it's a free block buffer, get the block number. + */ + if (args->addname) { + curfdb = curbp ? state->extrablk.blkno : -1; + curdb = -1; + length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + if ((free = (curbp ? curbp->data : NULL))) + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + } + /* + * For others, it's a data block buffer, get the block number. + */ + else { + curfdb = -1; + curdb = curbp ? state->extrablk.blkno : -1; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &leaf->ents[index]; + index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Pull the data block number from the entry. + */ + newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT)); + /* + * For addname, we're looking for a place to put the new entry. + * We want to use a data block with an entry of equal + * hash value to ours if there is one with room. + */ + if (args->addname) { + /* + * If this block isn't the data block we already have + * in hand, take a look at it. + */ + if (newdb != curdb) { + curdb = newdb; + /* + * Convert the data block to the free block + * holding its freespace information. + */ + newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb); + /* + * If it's not the one we have in hand, + * read it in. + */ + if (newfdb != curfdb) { + /* + * If we had one before, drop it. + */ + if (curbp) + xfs_da_brelse(tp, curbp); + /* + * Read the free block. + */ + if ((error = xfs_da_read_buf(tp, dp, + XFS_DIR2_DB_TO_DA(mp, + newfdb), + -1, &curbp, + XFS_DATA_FORK))) { + return error; + } + curfdb = newfdb; + free = curbp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == + XFS_DIR2_FREE_MAGIC); + ASSERT((INT_GET(free->hdr.firstdb, ARCH_CONVERT) % + XFS_DIR2_MAX_FREE_BESTS(mp)) == + 0); + ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) <= curdb); + ASSERT(curdb < + INT_GET(free->hdr.firstdb, ARCH_CONVERT) + + INT_GET(free->hdr.nvalid, ARCH_CONVERT)); + } + /* + * Get the index for our entry. + */ + fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb); + /* + * If it has room, return it. + */ + if (unlikely(INT_GET(free->bests[fi], ARCH_CONVERT) == NULLDATAOFF)) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (INT_GET(free->bests[fi], ARCH_CONVERT) >= length) { + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.blkno = curfdb; + state->extrablk.index = fi; + state->extrablk.magic = + XFS_DIR2_FREE_MAGIC; + ASSERT(args->oknoent); + return XFS_ERROR(ENOENT); + } + } + } + /* + * Not adding a new entry, so we really want to find + * the name given to us. + */ + else { + /* + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before, drop it. + */ + if (curbp) + xfs_da_brelse(tp, curbp); + /* + * Read the data block. + */ + if ((error = + xfs_da_read_buf(tp, dp, + XFS_DIR2_DB_TO_DA(mp, newdb), -1, + &curbp, XFS_DATA_FORK))) { + return error; + } + xfs_dir2_data_check(dp, curbp); + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)curbp->data + + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT))); + /* + * Compare the entry, return it if it matches. + */ + if (dep->namelen == args->namelen && + dep->name[0] == args->name[0] && + memcmp(dep->name, args->name, args->namelen) == 0) { + args->inumber = INT_GET(dep->inumber, ARCH_CONVERT); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.blkno = curdb; + state->extrablk.index = + (int)((char *)dep - + (char *)curbp->data); + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + return XFS_ERROR(EEXIST); + } + } + } + /* + * Didn't find a match. + * If we are holding a buffer, give it back in case our caller + * finds it useful. + */ + if ((state->extravalid = (curbp != NULL))) { + state->extrablk.bp = curbp; + state->extrablk.index = -1; + /* + * For addname, giving back a free block. + */ + if (args->addname) { + state->extrablk.blkno = curfdb; + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } + /* + * For other callers, giving back a data block. + */ + else { + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + } + } + /* + * Return the final index, that will be the insertion point. + */ + *indexp = index; + ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent); + return XFS_ERROR(ENOENT); +} + +/* + * Move count leaf entries from source to destination leaf. + * Log entries and headers. Stale entries are preserved. + */ +static void +xfs_dir2_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp_s, /* source leaf buffer */ + int start_s, /* source leaf index */ + xfs_dabuf_t *bp_d, /* destination leaf buffer */ + int start_d, /* destination leaf index */ + int count) /* count of leaves to copy */ +{ + xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ + xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ + int stale; /* count stale leaves copied */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d, + start_d, count); + /* + * Silently return if nothing to do. + */ + if (count == 0) { + return; + } + tp = args->trans; + leaf_s = bp_s->data; + leaf_d = bp_d->data; + /* + * If the destination index is not the end of the current + * destination leaf entries, open up a hole in the destination + * to hold the new entries. + */ + if (start_d < INT_GET(leaf_d->hdr.count, ARCH_CONVERT)) { + memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], + (INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - start_d) * + sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, + count + INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - 1); + } + /* + * If the source has stale leaves, count the ones in the copy range + * so we can update the header correctly. + */ + if (leaf_s->hdr.stale) { + int i; /* temp leaf index */ + + for (i = start_s, stale = 0; i < start_s + count; i++) { + if (INT_GET(leaf_s->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) + stale++; + } + } else + stale = 0; + /* + * Copy the leaf entries from source to destination. + */ + memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + /* + * If there are source entries after the ones we copied, + * delete the ones we copied by sliding the next ones down. + */ + if (start_s + count < INT_GET(leaf_s->hdr.count, ARCH_CONVERT)) { + memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + } + /* + * Update the headers and log them. + */ + INT_MOD(leaf_s->hdr.count, ARCH_CONVERT, -(count)); + INT_MOD(leaf_s->hdr.stale, ARCH_CONVERT, -(stale)); + INT_MOD(leaf_d->hdr.count, ARCH_CONVERT, count); + INT_MOD(leaf_d->hdr.stale, ARCH_CONVERT, stale); + xfs_dir2_leaf_log_header(tp, bp_s); + xfs_dir2_leaf_log_header(tp, bp_d); + xfs_dir2_leafn_check(args->dp, bp_s); + xfs_dir2_leafn_check(args->dp, bp_d); +} + +/* + * Determine the sort order of two leaf blocks. + * Returns 1 if both are valid and leaf2 should be before leaf1, else 0. + */ +int /* sort order */ +xfs_dir2_leafn_order( + xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ + xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ +{ + xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ + xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0 && + INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0 && + (INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT) < INT_GET(leaf1->ents[0].hashval, ARCH_CONVERT) || + INT_GET(leaf2->ents[INT_GET(leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT) < + INT_GET(leaf1->ents[INT_GET(leaf1->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT))) + return 1; + return 0; +} + +/* + * Rebalance leaf entries between two leaf blocks. + * This is actually only called when the second block is new, + * though the code deals with the general case. + * A new entry will be inserted in one of the blocks, and that + * entry is taken into account when balancing. + */ +static void +xfs_dir2_leafn_rebalance( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *blk1, /* first btree block */ + xfs_da_state_blk_t *blk2) /* second btree block */ +{ + xfs_da_args_t *args; /* operation arguments */ + int count; /* count (& direction) leaves */ + int isleft; /* new goes in left leaf */ + xfs_dir2_leaf_t *leaf1; /* first leaf structure */ + xfs_dir2_leaf_t *leaf2; /* second leaf structure */ + int mid; /* midpoint leaf index */ +#ifdef DEBUG + int oldstale; /* old count of stale leaves */ +#endif + int oldsum; /* old total leaf count */ + int swap; /* swapped leaf blocks */ + + args = state->args; + /* + * If the block order is wrong, swap the arguments. + */ + if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) { + xfs_da_state_blk_t *tmp; /* temp for block swap */ + + tmp = blk1; + blk1 = blk2; + blk2 = tmp; + } + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + oldsum = INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT); +#ifdef DEBUG + oldstale = INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT); +#endif + mid = oldsum >> 1; + /* + * If the old leaf count was odd then the new one will be even, + * so we need to divide the new count evenly. + */ + if (oldsum & 1) { + xfs_dahash_t midhash; /* middle entry hash value */ + + if (mid >= INT_GET(leaf1->hdr.count, ARCH_CONVERT)) + midhash = INT_GET(leaf2->ents[mid - INT_GET(leaf1->hdr.count, ARCH_CONVERT)].hashval, ARCH_CONVERT); + else + midhash = INT_GET(leaf1->ents[mid].hashval, ARCH_CONVERT); + isleft = args->hashval <= midhash; + } + /* + * If the old count is even then the new count is odd, so there's + * no preferred side for the new entry. + * Pick the left one. + */ + else + isleft = 1; + /* + * Calculate moved entry count. Positive means left-to-right, + * negative means right-to-left. Then move the entries. + */ + count = INT_GET(leaf1->hdr.count, ARCH_CONVERT) - mid + (isleft == 0); + if (count > 0) + xfs_dir2_leafn_moveents(args, blk1->bp, + INT_GET(leaf1->hdr.count, ARCH_CONVERT) - count, blk2->bp, 0, count); + else if (count < 0) + xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, + INT_GET(leaf1->hdr.count, ARCH_CONVERT), count); + ASSERT(INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT) == oldsum); + ASSERT(INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT) == oldstale); + /* + * Mark whether we're inserting into the old or new leaf. + */ + if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) < INT_GET(leaf2->hdr.count, ARCH_CONVERT)) + state->inleaf = swap; + else if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > INT_GET(leaf2->hdr.count, ARCH_CONVERT)) + state->inleaf = !swap; + else + state->inleaf = + swap ^ (blk1->index <= INT_GET(leaf1->hdr.count, ARCH_CONVERT)); + /* + * Adjust the expected index for insertion. + */ + if (!state->inleaf) + blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT); + + /* + * Finally sanity check just to make sure we are not returning a negative index + */ + if(blk2->index < 0) { + state->inleaf = 1; + blk2->index = 0; + cmn_err(CE_ALERT, + "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting orignal leaf: " + "blk1->index %d\n", + blk1->index); + } +} + +/* + * Remove an entry from a node directory. + * This removes the leaf entry and the data entry, + * and updates the free block if necessary. + */ +static int /* error */ +xfs_dir2_leafn_remove( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp, /* leaf buffer */ + int index, /* leaf entry index */ + xfs_da_state_blk_t *dblk, /* data block */ + int *rval) /* resulting block needs join */ +{ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t db; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int longest; /* longest data free entry */ + int off; /* data block entry offset */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_trans_t *tp; /* transaction pointer */ + + xfs_dir2_trace_args_sb("leafn_remove", args, index, bp); + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + /* + * Point to the entry we're removing. + */ + lep = &leaf->ents[index]; + /* + * Extract the data block and offset from the entry. + */ + db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT)); + ASSERT(dblk->blkno == db); + off = XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)); + ASSERT(dblk->index == off); + /* + * Kill the leaf entry by marking it stale. + * Log the leaf block changes. + */ + INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1); + xfs_dir2_leaf_log_header(tp, bp); + INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR); + xfs_dir2_leaf_log_ents(tp, bp, index, index); + /* + * Make the data entry free. Keep track of the longest freespace + * in the data block in case it changes. + */ + dbp = dblk->bp; + data = dbp->data; + dep = (xfs_dir2_data_entry_t *)((char *)data + off); + longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT); + needlog = needscan = 0; + xfs_dir2_data_make_free(tp, dbp, off, + XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + /* + * Rescan the data block freespaces for bestfree. + * Log the data block header if needed. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog, NULL); + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_check(dp, dbp); + /* + * If the longest data block freespace changes, need to update + * the corresponding freeblock entry. + */ + if (longest < INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) { + int error; /* error return value */ + xfs_dabuf_t *fbp; /* freeblock buffer */ + xfs_dir2_db_t fdb; /* freeblock block number */ + int findex; /* index in freeblock entries */ + xfs_dir2_free_t *free; /* freeblock structure */ + int logfree; /* need to log free entry */ + + /* + * Convert the data block number to a free block, + * read in the free block. + */ + fdb = XFS_DIR2_DB_TO_FDB(mp, db); + if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), + -1, &fbp, XFS_DATA_FORK))) { + return error; + } + free = fbp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) == + XFS_DIR2_MAX_FREE_BESTS(mp) * + (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + /* + * Calculate which entry we need to fix. + */ + findex = XFS_DIR2_DB_TO_FDINDEX(mp, db); + longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT); + /* + * If the data block is now empty we can get rid of it + * (usually). + */ + if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) { + /* + * Try to punch out the data block. + */ + error = xfs_dir2_shrink_inode(args, db, dbp); + if (error == 0) { + dblk->bp = NULL; + data = NULL; + } + /* + * We can get ENOSPC if there's no space reservation. + * In this case just drop the buffer and some one else + * will eventually get rid of the empty block. + */ + else if (error == ENOSPC && args->total == 0) + xfs_da_buf_done(dbp); + else + return error; + } + /* + * If we got rid of the data block, we can eliminate that entry + * in the free block. + */ + if (data == NULL) { + /* + * One less used entry in the free table. + */ + INT_MOD(free->hdr.nused, ARCH_CONVERT, -1); + xfs_dir2_free_log_header(tp, fbp); + /* + * If this was the last entry in the table, we can + * trim the table size back. There might be other + * entries at the end referring to non-existent + * data blocks, get those too. + */ + if (findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; + i >= 0 && INT_GET(free->bests[i], ARCH_CONVERT) == NULLDATAOFF; + i--) + continue; + INT_SET(free->hdr.nvalid, ARCH_CONVERT, i + 1); + logfree = 0; + } + /* + * Not the last entry, just punch it out. + */ + else { + INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + } + /* + * Data block is not empty, just set the free entry to + * the new value. + */ + else { + INT_SET(free->bests[findex], ARCH_CONVERT, longest); + logfree = 1; + } + /* + * Log the free entry that changed, unless we got rid of it. + */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + /* + * Drop the buffer if we still have it. + */ + if (fbp) + xfs_da_buf_done(fbp); + } + xfs_dir2_leafn_check(dp, bp); + /* + * Return indication of whether this leaf block is emtpy enough + * to justify trying to join it with a neighbor. + */ + *rval = + ((uint)sizeof(leaf->hdr) + + (uint)sizeof(leaf->ents[0]) * + (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT))) < + mp->m_dir_magicpct; + return 0; +} + +/* + * Split the leaf entries in the old block into old and new blocks. + */ +int /* error */ +xfs_dir2_leafn_split( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *oldblk, /* original block */ + xfs_da_state_blk_t *newblk) /* newly created block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dablk_t blkno; /* new leaf block number */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + + /* + * Allocate space for a new leaf node. + */ + args = state->args; + mp = args->dp->i_mount; + ASSERT(args != NULL); + ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); + error = xfs_da_grow_inode(args, &blkno); + if (error) { + return error; + } + /* + * Initialize the new leaf block. + */ + error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) { + return error; + } + newblk->blkno = blkno; + newblk->magic = XFS_DIR2_LEAFN_MAGIC; + /* + * Rebalance the entries across the two leaves, link the new + * block into the leaves. + */ + xfs_dir2_leafn_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) { + return error; + } + /* + * Insert the new entry in the correct block. + */ + if (state->inleaf) + error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index); + else + error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index); + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); + xfs_dir2_leafn_check(args->dp, oldblk->bp); + xfs_dir2_leafn_check(args->dp, newblk->bp); + return error; +} + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +int /* error */ +xfs_dir2_leafn_toosmall( + xfs_da_state_t *state, /* btree cursor */ + int *action) /* resulting action to take */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dablk_t blkno; /* leaf block number */ + xfs_dabuf_t *bp; /* leaf buffer */ + int bytes; /* bytes in use */ + int count; /* leaf live entry count */ + int error; /* error return value */ + int forward; /* sibling block direction */ + int i; /* sibling counter */ + xfs_da_blkinfo_t *info; /* leaf block header */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int rval; /* result from path_shift */ + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[state->path.active - 1]; + info = blk->bp->data; + ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + leaf = (xfs_dir2_leaf_t *)info; + count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT); + bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); + if (bytes > (state->blocksize >> 1)) { + /* + * Blk over 50%, don't try to join. + */ + *action = 0; + return 0; + } + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = info->forw; + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, 0, + &rval); + if (error) + return error; + *action = rval ? 2 : 0; + return 0; + } + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + forward = INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT); + for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { + blkno = forward ?INT_GET( info->forw, ARCH_CONVERT) : INT_GET(info->back, ARCH_CONVERT); + if (blkno == 0) + continue; + /* + * Read the sibling leaf block. + */ + if ((error = + xfs_da_read_buf(state->args->trans, state->args->dp, blkno, + -1, &bp, XFS_DATA_FORK))) { + return error; + } + ASSERT(bp != NULL); + /* + * Count bytes in the two blocks combined. + */ + leaf = (xfs_dir2_leaf_t *)info; + count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT); + bytes = state->blocksize - (state->blocksize >> 2); + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + count += INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT); + bytes -= count * (uint)sizeof(leaf->ents[0]); + /* + * Fits with at least 25% to spare. + */ + if (bytes >= 0) + break; + xfs_da_brelse(state->args->trans, bp); + } + /* + * Didn't like either block, give up. + */ + if (i >= 2) { + *action = 0; + return 0; + } + /* + * Done with the sibling leaf block here, drop the dabuf + * so path_shift can get it. + */ + xfs_da_buf_done(bp); + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) + error = xfs_da_path_shift(state, &state->altpath, forward, 0, + &rval); + else + error = xfs_da_path_shift(state, &state->path, forward, 0, + &rval); + if (error) { + return error; + } + *action = rval ? 0 : 1; + return 0; +} + +/* + * Move all the leaf entries from drop_blk to save_blk. + * This is done as part of a join operation. + */ +void +xfs_dir2_leafn_unbalance( + xfs_da_state_t *state, /* cursor */ + xfs_da_state_blk_t *drop_blk, /* dead block */ + xfs_da_state_blk_t *save_blk) /* surviving block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ + xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + + args = state->args; + ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC); + /* + * If there are any stale leaf entries, take this opportunity + * to purge them. + */ + if (INT_GET(drop_leaf->hdr.stale, ARCH_CONVERT)) + xfs_dir2_leaf_compact(args, drop_blk->bp); + if (INT_GET(save_leaf->hdr.stale, ARCH_CONVERT)) + xfs_dir2_leaf_compact(args, save_blk->bp); + /* + * Move the entries from drop to the appropriate end of save. + */ + drop_blk->hashval = INT_GET(drop_leaf->ents[INT_GET(drop_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT); + if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) + xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, + INT_GET(drop_leaf->hdr.count, ARCH_CONVERT)); + else + xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, + INT_GET(save_leaf->hdr.count, ARCH_CONVERT), INT_GET(drop_leaf->hdr.count, ARCH_CONVERT)); + save_blk->hashval = INT_GET(save_leaf->ents[INT_GET(save_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT); + xfs_dir2_leafn_check(args->dp, save_blk->bp); +} + +/* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + xfs_dir2_trace_args("node_addname", args); + /* + * Allocate and initialize the state (btree cursor). + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. + */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!args->justcheck) + xfs_da_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int /* error */ +xfs_dir2_node_addname_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_da_state_blk_t *fblk) /* optional freespace block */ +{ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_db_t dbno; /* data block number */ + xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ + int error; /* error return value */ + xfs_dir2_db_t fbno; /* freespace block number */ + xfs_dabuf_t *fbp; /* freespace buffer */ + int findex; /* freespace entry index */ + xfs_dir2_free_t *free=NULL; /* freespace block structure */ + xfs_dir2_db_t ifbno; /* initial freespace block no */ + xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ + int length; /* length of the new entry */ + int logfree; /* need to log free entry */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_dir2_data_off_t *tagp; /* data entry tag pointer */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + /* + * If we came in with a freespace block that means that lookup + * found an entry with our hash value. This is the freespace + * block for that data entry. + */ + if (fblk) { + fbp = fblk->bp; + /* + * Remember initial freespace block number. + */ + ifbno = fblk->blkno; + free = fbp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + findex = fblk->index; + /* + * This means the free entry showed that the data block had + * space for our entry, so we remembered it. + * Use that data block. + */ + if (findex >= 0) { + ASSERT(findex < INT_GET(free->hdr.nvalid, ARCH_CONVERT)); + ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF); + ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) >= length); + dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex; + } + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. + */ + else { + dbno = -1; + findex = 0; + } + } + /* + * Didn't come in with a freespace block, so don't have a data block. + */ + else { + ifbno = dbno = -1; + fbp = NULL; + findex = 0; + } + /* + * If we don't have a data block yet, we're going to scan the + * freespace blocks looking for one. Figure out what the + * highest freespace block number is. + */ + if (dbno == -1) { + xfs_fileoff_t fo; /* freespace block number */ + + if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + return error; + lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo); + fbno = ifbno; + } + /* + * While we haven't identified a data block, search the freeblock + * data for a good data block. If we find a null freeblock entry, + * indicating a hole in the data blocks, remember that. + */ + while (dbno == -1) { + /* + * If we don't have a freeblock in hand, get the next one. + */ + if (fbp == NULL) { + /* + * Happens the first time through unless lookup gave + * us a freespace block to start with. + */ + if (++fbno == 0) + fbno = XFS_DIR2_FREE_FIRSTDB(mp); + /* + * If it's ifbno we already looked at it. + */ + if (fbno == ifbno) + fbno++; + /* + * If it's off the end we're done. + */ + if (fbno >= lastfbno) + break; + /* + * Read the block. There can be holes in the + * freespace blocks, so this might not succeed. + * This should be really rare, so there's no reason + * to avoid it. + */ + if ((error = xfs_da_read_buf(tp, dp, + XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, + XFS_DATA_FORK))) { + return error; + } + if (unlikely(fbp == NULL)) { + continue; + } + free = fbp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + findex = 0; + } + /* + * Look at the current free entry. Is it good enough? + */ + if (INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF && + INT_GET(free->bests[findex], ARCH_CONVERT) >= length) + dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex; + else { + /* + * Are we done with the freeblock? + */ + if (++findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT)) { + /* + * Drop the block. + */ + xfs_da_brelse(tp, fbp); + fbp = NULL; + if (fblk && fblk->bp) + fblk->bp = NULL; + } + } + } + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (unlikely(dbno == -1)) { + /* + * Not allowed to allocate, return failure. + */ + if (args->justcheck || args->total == 0) { + /* + * Drop the freespace buffer unless it came from our + * caller. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return XFS_ERROR(ENOSPC); + } + /* + * Allocate and initialize the new data block. + */ + if (unlikely((error = xfs_dir2_grow_inode(args, + XFS_DIR2_DATA_SPACE, + &dbno)) || + (error = xfs_dir2_data_init(args, dbno, &dbp)))) { + /* + * Drop the freespace buffer unless it came from our + * caller. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return error; + } + /* + * If (somehow) we have a freespace block, get rid of it. + */ + if (fbp) + xfs_da_brelse(tp, fbp); + if (fblk && fblk->bp) + fblk->bp = NULL; + + /* + * Get the freespace block corresponding to the data block + * that was just allocated. + */ + fbno = XFS_DIR2_DB_TO_FDB(mp, dbno); + if (unlikely(error = xfs_da_read_buf(tp, dp, + XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, + XFS_DATA_FORK))) { + xfs_da_buf_done(dbp); + return error; + } + /* + * If there wasn't a freespace block, the read will + * return a NULL fbp. Allocate and initialize a new one. + */ + if( fbp == NULL ) { + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno))) { + return error; + } + + if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) { + cmn_err(CE_ALERT, + "xfs_dir2_node_addname_int: dir ino " + "%llu needed freesp block %lld for\n" + " data block %lld, got %lld\n" + " ifbno %llu lastfbno %d\n", + (unsigned long long)dp->i_ino, + (long long)XFS_DIR2_DB_TO_FDB(mp, dbno), + (long long)dbno, (long long)fbno, + (unsigned long long)ifbno, lastfbno); + if (fblk) { + cmn_err(CE_ALERT, + " fblk 0x%p blkno %llu " + "index %d magic 0x%x\n", + fblk, + (unsigned long long)fblk->blkno, + fblk->index, + fblk->magic); + } else { + cmn_err(CE_ALERT, + " ... fblk is NULL\n"); + } + XFS_ERROR_REPORT("xfs_dir2_node_addname_int", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Get a buffer for the new block. + */ + if ((error = xfs_da_get_buf(tp, dp, + XFS_DIR2_DB_TO_DA(mp, fbno), + -1, &fbp, XFS_DATA_FORK))) { + return error; + } + ASSERT(fbp != NULL); + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + free = fbp->data; + INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC); + INT_SET(free->hdr.firstdb, ARCH_CONVERT, + (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * + XFS_DIR2_MAX_FREE_BESTS(mp)); + free->hdr.nvalid = 0; + free->hdr.nused = 0; + } else { + free = fbp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + } + + /* + * Set the freespace block index from the data block number. + */ + findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno); + /* + * If it's after the end of the current entries in the + * freespace block, extend that table. + */ + if (findex >= INT_GET(free->hdr.nvalid, ARCH_CONVERT)) { + ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp)); + INT_SET(free->hdr.nvalid, ARCH_CONVERT, findex + 1); + /* + * Tag new entry so nused will go up. + */ + INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF); + } + /* + * If this entry was for an empty data block + * (this should always be true) then update the header. + */ + if (INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) { + INT_MOD(free->hdr.nused, ARCH_CONVERT, +1); + xfs_dir2_free_log_header(tp, fbp); + } + /* + * Update the real value in the table. + * We haven't allocated the data entry yet so this will + * change again. + */ + data = dbp->data; + INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT); + logfree = 1; + } + /* + * We had a data block so we don't have to make a new one. + */ + else { + /* + * If just checking, we succeeded. + */ + if (args->justcheck) { + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return 0; + } + /* + * Read the data block in. + */ + if (unlikely( + error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno), + -1, &dbp, XFS_DATA_FORK))) { + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + return error; + } + data = dbp->data; + logfree = 0; + } + ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) >= length); + /* + * Point to the existing unused space. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT)); + needscan = needlog = 0; + /* + * Mark the first part of the unused space, inuse for us. + */ + xfs_dir2_data_use_free(tp, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + &needlog, &needscan); + /* + * Fill in the new entry and log it. + */ + dep = (xfs_dir2_data_entry_t *)dup; + INT_SET(dep->inumber, ARCH_CONVERT, args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); + INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data)); + xfs_dir2_data_log_entry(tp, dbp, dep); + /* + * Rescan the block for bestfree if needed. + */ + if (needscan) + xfs_dir2_data_freescan(mp, data, &needlog, NULL); + /* + * Log the data block header if needed. + */ + if (needlog) + xfs_dir2_data_log_header(tp, dbp); + /* + * If the freespace entry is now wrong, update it. + */ + if (INT_GET(free->bests[findex], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) { + INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT); + logfree = 1; + } + /* + * Log the freespace entry if needed. + */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + /* + * If the caller didn't hand us the freespace block, drop it. + */ + if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) + xfs_da_buf_done(fbp); + /* + * Return the data block and offset in args, then drop the data block. + */ + args->blkno = (xfs_dablk_t)dbno; + args->index = INT_GET(*tagp, ARCH_CONVERT); + xfs_da_buf_done(dbp); + return 0; +} + +/* + * Lookup an entry in a node-format directory. + * All the real work happens in xfs_da_node_lookup_int. + * The only real output is the inode number of the entry. + */ +int /* error */ +xfs_dir2_node_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + int error; /* error return value */ + int i; /* btree level */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + xfs_dir2_trace_args("node_lookup", args); + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Fill in the path to the entry in the cursor. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) + rval = error; + /* + * Release the btree blocks and leaf block. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + /* + * Release the data block if we have it. + */ + if (state->extravalid && state->extrablk.bp) { + xfs_da_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Remove an entry from a node-format directory. + */ +int /* error */ +xfs_dir2_node_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + int error; /* error return value */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + xfs_dir2_trace_args("node_removename", args); + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + /* + * Look up the entry we're deleting, set up the cursor. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) { + rval = error; + } + /* + * Didn't find it, upper layer screwed up. + */ + if (rval != EEXIST) { + xfs_da_state_free(state); + return rval; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(state->extravalid); + /* + * Remove the leaf and data entries. + * Extrablk refers to the data block. + */ + error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, + &state->extrablk, &rval); + if (error) { + return error; + } + /* + * Fix the hash values up the btree. + */ + xfs_da_fixhashpath(state, &state->path); + /* + * If we need to join leaf blocks, do it. + */ + if (rval && state->path.active > 1) + error = xfs_da_join(state); + /* + * If no errors so far, try conversion to leaf format. + */ + if (!error) + error = xfs_dir2_node_to_leaf(state); + xfs_da_state_free(state); + return error; +} + +/* + * Replace an entry's inode number in a node-format directory. + */ +int /* error */ +xfs_dir2_node_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_entry_t *dep; /* data entry changed */ + int error; /* error return value */ + int i; /* btree level */ + xfs_ino_t inum; /* new inode number */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ + int rval; /* internal return value */ + xfs_da_state_t *state; /* btree cursor */ + + xfs_dir2_trace_args("node_replace", args); + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + state->blocksize = state->mp->m_dirblksize; + state->node_ents = state->mp->m_dir_node_ents; + inum = args->inumber; + /* + * Lookup the entry to change in the btree. + */ + error = xfs_da_node_lookup_int(state, &rval); + if (error) { + rval = error; + } + /* + * It should be found, since the vnodeops layer has looked it up + * and locked it. But paranoia is good. + */ + if (rval == EEXIST) { + /* + * Find the leaf entry. + */ + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + leaf = blk->bp->data; + lep = &leaf->ents[blk->index]; + ASSERT(state->extravalid); + /* + * Point to the data entry. + */ + data = state->extrablk.bp->data; + ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC); + dep = (xfs_dir2_data_entry_t *) + ((char *)data + + XFS_DIR2_DATAPTR_TO_OFF(state->mp, INT_GET(lep->address, ARCH_CONVERT))); + ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT)); + /* + * Fill in the new inode number and log the entry. + */ + INT_SET(dep->inumber, ARCH_CONVERT, inum); + xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); + rval = 0; + } + /* + * Didn't find it, and we're holding a data block. Drop it. + */ + else if (state->extravalid) { + xfs_da_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + /* + * Release all the buffers in the cursor. + */ + for (i = 0; i < state->path.active; i++) { + xfs_da_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Trim off a trailing empty freespace block. + * Return (in rvalp) 1 if we did it, 0 if not. + */ +int /* error */ +xfs_dir2_node_trim_free( + xfs_da_args_t *args, /* operation arguments */ + xfs_fileoff_t fo, /* free block number */ + int *rvalp) /* out: did something */ +{ + xfs_dabuf_t *bp; /* freespace buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + mp = dp->i_mount; + tp = args->trans; + /* + * Read the freespace block. + */ + if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, + XFS_DATA_FORK))) { + return error; + } + + /* + * There can be holes in freespace. If fo is a hole, there's + * nothing to do. + */ + if (bp == NULL) { + return 0; + } + free = bp->data; + ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC); + /* + * If there are used entries, there's nothing to do. + */ + if (INT_GET(free->hdr.nused, ARCH_CONVERT) > 0) { + xfs_da_brelse(tp, bp); + *rvalp = 0; + return 0; + } + /* + * Blow the block away. + */ + if ((error = + xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo), + bp))) { + /* + * Can't fail with ENOSPC since that only happens with no + * space reservation, when breaking up an extent into two + * pieces. This is the last block of an extent. + */ + ASSERT(error != ENOSPC); + xfs_da_brelse(tp, bp); + return error; + } + /* + * Return that we succeeded. + */ + *rvalp = 1; + return 0; +} diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h new file mode 100644 index 00000000000..96db420c7c5 --- /dev/null +++ b/fs/xfs/xfs_dir2_node.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_NODE_H__ +#define __XFS_DIR2_NODE_H__ + +/* + * Directory version 2, btree node format structures + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_inode; +struct xfs_trans; + +/* + * Constants. + */ + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) +#define XFS_DIR2_FREE_FIRSTDB(mp) \ + XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET) + +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ + +/* + * Structures. + */ +typedef struct xfs_dir2_free_hdr { + __uint32_t magic; /* XFS_DIR2_FREE_MAGIC */ + __int32_t firstdb; /* db of first entry */ + __int32_t nvalid; /* count of valid entries */ + __int32_t nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + xfs_dir2_data_off_t bests[1]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; +#define XFS_DIR2_MAX_FREE_BESTS(mp) \ + (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \ + (uint)sizeof(xfs_dir2_data_off_t)) + +/* + * Macros. + */ + +/* + * Convert data space db to the corresponding free db. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDB) +xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db); +#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db) +#else +#define XFS_DIR2_DB_TO_FDB(mp,db) \ + (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp)) +#endif + +/* + * Convert data space db to the corresponding index in a free db. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDINDEX) +int +xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db); +#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db) +#else +#define XFS_DIR2_DB_TO_FDINDEX(mp,db) ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)) +#endif + +/* + * Functions. + */ + +extern void + xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); + +extern int + xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp); + +extern xfs_dahash_t + xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); + +extern int + xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); + +extern int + xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); + +extern int + xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); + +extern int + xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); + +extern void + xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); + +extern int + xfs_dir2_node_addname(struct xfs_da_args *args); + +extern int + xfs_dir2_node_lookup(struct xfs_da_args *args); + +extern int + xfs_dir2_node_removename(struct xfs_da_args *args); + +extern int + xfs_dir2_node_replace(struct xfs_da_args *args); + +extern int + xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); + +#endif /* __XFS_DIR2_NODE_H__ */ diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c new file mode 100644 index 00000000000..6bbc6167441 --- /dev/null +++ b/fs/xfs/xfs_dir2_sf.c @@ -0,0 +1,1317 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_sf.c + * Shortform directory implementation for v2 directories. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_error.h" +#include "xfs_dir2_data.h" +#include "xfs_dir2_leaf.h" +#include "xfs_dir2_block.h" +#include "xfs_dir2_trace.h" + +/* + * Prototypes for internal functions. + */ +static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args, + xfs_dir2_sf_entry_t *sfep, + xfs_dir2_data_aoff_t offset, + int new_isize); +static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange, + int new_isize); +static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange, + xfs_dir2_sf_entry_t **sfepp, + xfs_dir2_data_aoff_t *offsetp); +#ifdef DEBUG +static void xfs_dir2_sf_check(xfs_da_args_t *args); +#else +#define xfs_dir2_sf_check(args) +#endif /* DEBUG */ +#if XFS_BIG_INUMS +static void xfs_dir2_sf_toino4(xfs_da_args_t *args); +static void xfs_dir2_sf_toino8(xfs_da_args_t *args); +#endif /* XFS_BIG_INUMS */ + +/* + * Given a block directory (dp/block), calculate its size as a shortform (sf) + * directory and a header for the sf directory, if it will fit it the + * space currently present in the inode. If it won't fit, the output + * size is too big (but not accurate). + */ +int /* size for sf form */ +xfs_dir2_block_sfsize( + xfs_inode_t *dp, /* incore inode pointer */ + xfs_dir2_block_t *block, /* block directory data */ + xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */ + xfs_dir2_block_tail_t *btp; /* tail area of the block */ + int count; /* shortform entry count */ + xfs_dir2_data_entry_t *dep; /* data entry in the block */ + int i; /* block entry index */ + int i8count; /* count of big-inode entries */ + int isdot; /* entry is "." */ + int isdotdot; /* entry is ".." */ + xfs_mount_t *mp; /* mount structure pointer */ + int namelen; /* total name bytes */ + xfs_ino_t parent; /* parent inode number */ + int size=0; /* total computed size */ + + mp = dp->i_mount; + + count = i8count = namelen = 0; + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + blp = XFS_DIR2_BLOCK_LEAF_P(btp); + + /* + * Iterate over the block's data entries by using the leaf pointers. + */ + for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) { + if ((addr = INT_GET(blp[i].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Calculate the pointer to the entry at hand. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + /* + * Detect . and .., so we can special-case them. + * . is not included in sf directories. + * .. is included by just the parent inode number. + */ + isdot = dep->namelen == 1 && dep->name[0] == '.'; + isdotdot = + dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.'; +#if XFS_BIG_INUMS + if (!isdot) + i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM; +#endif + if (!isdot && !isdotdot) { + count++; + namelen += dep->namelen; + } else if (isdotdot) + parent = INT_GET(dep->inumber, ARCH_CONVERT); + /* + * Calculate the new size, see if we should give up yet. + */ + size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */ + count + /* namelen */ + count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ + namelen + /* name */ + (i8count ? /* inumber */ + (uint)sizeof(xfs_dir2_ino8_t) * count : + (uint)sizeof(xfs_dir2_ino4_t) * count); + if (size > XFS_IFORK_DSIZE(dp)) + return size; /* size value is a failure */ + } + /* + * Create the output header, if it worked. + */ + sfhp->count = count; + sfhp->i8count = i8count; + XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); + return size; +} + +/* + * Convert a block format directory to shortform. + * Caller has already checked that it will fit, and built us a header. + */ +int /* error */ +xfs_dir2_block_to_sf( + xfs_da_args_t *args, /* operation arguments */ + xfs_dabuf_t *bp, /* block buffer */ + int size, /* shortform directory size */ + xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ +{ + xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data pointer */ + char *endptr; /* end of data entries */ + int error; /* error return value */ + int logflags; /* inode logging flags */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data pointer */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_ino_t temp; + + xfs_dir2_trace_args_sb("block_to_sf", args, size, bp); + dp = args->dp; + mp = dp->i_mount; + + /* + * Make a copy of the block data, so we can shrink the inode + * and add local data. + */ + block = kmem_alloc(mp->m_dirblksize, KM_SLEEP); + memcpy(block, bp->data, mp->m_dirblksize); + logflags = XFS_ILOG_CORE; + if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { + ASSERT(error != ENOSPC); + goto out; + } + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + logflags |= XFS_ILOG_DDATA; + /* + * Copy the header into the newly allocate local space. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count)); + dp->i_d.di_size = size; + /* + * Set up to loop over the block's entries. + */ + btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); + ptr = (char *)block->u; + endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + /* + * Loop over the active and unused entries. + * Stop when we reach the leaf/tail portion of the block. + */ + while (ptr < endptr) { + /* + * If it's unused, just skip over it. + */ + dup = (xfs_dir2_data_unused_t *)ptr; + if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) { + ptr += INT_GET(dup->length, ARCH_CONVERT); + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + /* + * Skip . + */ + if (dep->namelen == 1 && dep->name[0] == '.') + ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino); + /* + * Skip .., but make sure the inode number is right. + */ + else if (dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.') + ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == + XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + /* + * Normal entry, copy it into shortform. + */ + else { + sfep->namelen = dep->namelen; + XFS_DIR2_SF_PUT_OFFSET(sfep, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)block)); + memcpy(sfep->name, dep->name, dep->namelen); + temp=INT_GET(dep->inumber, ARCH_CONVERT); + XFS_DIR2_SF_PUT_INUMBER(sfp, &temp, + XFS_DIR2_SF_INUMBERP(sfep)); + sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + } + ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + } + ASSERT((char *)sfep - (char *)sfp == size); + xfs_dir2_sf_check(args); +out: + xfs_trans_log_inode(args->trans, dp, logflags); + kmem_free(block, mp->m_dirblksize); + return error; +} + +/* + * Add a name to a shortform directory. + * There are two algorithms, "easy" and "hard" which we decide on + * before changing anything. + * Convert to block form if necessary, if the new entry won't fit. + */ +int /* error */ +xfs_dir2_sf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + int add_entsize; /* size of the new entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int incr_isize; /* total change in size */ + int new_isize; /* di_size after adding name */ + int objchange; /* changing to 8-byte inodes */ + xfs_dir2_data_aoff_t offset; /* offset for new entry */ + int old_isize; /* di_size before adding name */ + int pick; /* which algorithm to use */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + + xfs_dir2_trace_args("sf_addname", args); + ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Make sure the shortform value has some of its header. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + /* + * Compute entry (and change in) size. + */ + add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + incr_isize = add_entsize; + objchange = 0; +#if XFS_BIG_INUMS + /* + * Do we have to change to 8 byte inodes? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + /* + * Yes, adjust the entry size and the total size. + */ + add_entsize += + (uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t); + incr_isize += + (sfp->hdr.count + 2) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + objchange = 1; + } +#endif + old_isize = (int)dp->i_d.di_size; + new_isize = old_isize + incr_isize; + /* + * Won't fit as shortform any more (due to size), + * or the pick routine says it won't (due to offset values). + */ + if (new_isize > XFS_IFORK_DSIZE(dp) || + (pick = + xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { + /* + * Just checking or no space reservation, it doesn't fit. + */ + if (args->justcheck || args->total == 0) + return XFS_ERROR(ENOSPC); + /* + * Convert to block form then add the name. + */ + error = xfs_dir2_sf_to_block(args); + if (error) + return error; + return xfs_dir2_block_addname(args); + } + /* + * Just checking, it fits. + */ + if (args->justcheck) + return 0; + /* + * Do it the easy way - just add it at the end. + */ + if (pick == 1) + xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize); + /* + * Do it the hard way - look for a place to insert the new entry. + * Convert to 8 byte inode numbers first if necessary. + */ + else { + ASSERT(pick == 2); +#if XFS_BIG_INUMS + if (objchange) + xfs_dir2_sf_toino8(args); +#endif + xfs_dir2_sf_addname_hard(args, objchange, new_isize); + } + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Add the new entry the "easy" way. + * This is copying the old directory and adding the new entry at the end. + * Since it's sorted by "offset" we need room after the last offset + * that's already there, and then room to convert to a block directory. + * This is already checked by the pick routine. + */ +static void +xfs_dir2_sf_addname_easy( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */ + xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ + int new_isize) /* new directory size */ +{ + int byteoff; /* byte offset in sf dir */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + byteoff = (int)((char *)sfep - (char *)sfp); + /* + * Grow the in-inode space. + */ + xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen), + XFS_DATA_FORK); + /* + * Need to set up again due to realloc of the inode data. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); + /* + * Fill in the new entry. + */ + sfep->namelen = args->namelen; + XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, + XFS_DIR2_SF_INUMBERP(sfep)); + /* + * Update the header and inode. + */ + sfp->hdr.count++; +#if XFS_BIG_INUMS + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) + sfp->hdr.i8count++; +#endif + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Add the new entry the "hard" way. + * The caller has already converted to 8 byte inode numbers if necessary, + * in which case we need to leave the i8count at 1. + * Find a hole that the new entry will fit into, and copy + * the first part of the entries, the new entry, and the last part of + * the entries. + */ +/* ARGSUSED */ +static void +xfs_dir2_sf_addname_hard( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* changing inode number size */ + int new_isize) /* new directory size */ +{ + int add_datasize; /* data size need for new ent */ + char *buf; /* buffer for old */ + xfs_inode_t *dp; /* incore directory inode */ + int eof; /* reached end of old dir */ + int nbytes; /* temp for byte copies */ + xfs_dir2_data_aoff_t new_offset; /* next offset value */ + xfs_dir2_data_aoff_t offset; /* current offset value */ + int old_isize; /* previous di_size */ + xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ + xfs_dir2_sf_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ + xfs_dir2_sf_t *sfp; /* new shortform dir */ + + /* + * Copy the old directory to the stack buffer. + */ + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + old_isize = (int)dp->i_d.di_size; + buf = kmem_alloc(old_isize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)buf; + memcpy(oldsfp, sfp, old_isize); + /* + * Loop over the old directory finding the place we're going + * to insert the new entry. + * If it's going to end up at the end then oldsfep will point there. + */ + for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp), + add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen), + eof = (char *)oldsfep == &buf[old_isize]; + !eof; + offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen), + oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep), + eof = (char *)oldsfep == &buf[old_isize]) { + new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep); + if (offset + add_datasize <= new_offset) + break; + } + /* + * Get rid of the old directory, then allocate space for + * the new one. We do this so xfs_idata_realloc won't copy + * the data. + */ + xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); + xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* + * Reset the pointer since the buffer was reallocated. + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Copy the first part of the directory, including the header. + */ + nbytes = (int)((char *)oldsfep - (char *)oldsfp); + memcpy(sfp, oldsfp, nbytes); + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes); + /* + * Fill in the new entry, and update the header counts. + */ + sfep->namelen = args->namelen; + XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, + XFS_DIR2_SF_INUMBERP(sfep)); + sfp->hdr.count++; +#if XFS_BIG_INUMS + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) + sfp->hdr.i8count++; +#endif + /* + * If there's more left to copy, do that. + */ + if (!eof) { + sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + memcpy(sfep, oldsfep, old_isize - nbytes); + } + kmem_free(buf, old_isize); + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Decide if the new entry will fit at all. + * If it will fit, pick between adding the new entry to the end (easy) + * or somewhere else (hard). + * Return 0 (won't fit), 1 (easy), 2 (hard). + */ +/*ARGSUSED*/ +static int /* pick result */ +xfs_dir2_sf_addname_pick( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* inode # size changes */ + xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ + xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int holefit; /* found hole it will fit in */ + int i; /* entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_aoff_t offset; /* data block offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + int size; /* entry's data size */ + int used; /* data bytes used */ + + dp = args->dp; + mp = dp->i_mount; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + size = XFS_DIR2_DATA_ENTSIZE(args->namelen); + offset = XFS_DIR2_DATA_FIRST_OFFSET; + sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + holefit = 0; + /* + * Loop over sf entries. + * Keep track of data offset and whether we've seen a place + * to insert the new entry. + */ + for (i = 0; i < sfp->hdr.count; i++) { + if (!holefit) + holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep); + offset = XFS_DIR2_SF_GET_OFFSET(sfep) + + XFS_DIR2_DATA_ENTSIZE(sfep->namelen); + sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + } + /* + * Calculate data bytes used excluding the new entry, if this + * was a data block (block form directory). + */ + used = offset + + (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t); + /* + * If it won't fit in a block form then we can't insert it, + * we'll go back, convert to block, then try the insert and convert + * to leaf. + */ + if (used + (holefit ? 0 : size) > mp->m_dirblksize) + return 0; + /* + * If changing the inode number size, do it the hard way. + */ +#if XFS_BIG_INUMS + if (objchange) { + return 2; + } +#else + ASSERT(objchange == 0); +#endif + /* + * If it won't fit at the end then do it the hard way (use the hole). + */ + if (used + size > mp->m_dirblksize) + return 2; + /* + * Do it the easy way. + */ + *sfepp = sfep; + *offsetp = offset; + return 1; +} + +#ifdef DEBUG +/* + * Check consistency of shortform directory, assert if bad. + */ +static void +xfs_dir2_sf_check( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry number */ + int i8count; /* number of big inode#s */ + xfs_ino_t ino; /* entry inode number */ + int offset; /* data offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + offset = XFS_DIR2_DATA_FIRST_OFFSET; + ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset); + ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + offset = + XFS_DIR2_SF_GET_OFFSET(sfep) + + XFS_DIR2_DATA_ENTSIZE(sfep->namelen); + } + ASSERT(i8count == sfp->hdr.i8count); + ASSERT(XFS_BIG_INUMS || i8count == 0); + ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT(offset + + (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= + dp->i_mount->m_dirblksize); +} +#endif /* DEBUG */ + +/* + * Create a new (shortform) directory. + */ +int /* error, always 0 */ +xfs_dir2_sf_create( + xfs_da_args_t *args, /* operation arguments */ + xfs_ino_t pino) /* parent inode number */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i8count; /* parent inode is an 8-byte number */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + int size; /* directory size */ + + xfs_dir2_trace_args_i("sf_create", args, pino); + dp = args->dp; + + ASSERT(dp != NULL); + ASSERT(dp->i_d.di_size == 0); + /* + * If it's currently a zero-length extent file, + * convert it to local format. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + dp->i_df.if_flags |= XFS_IFINLINE; + } + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_bytes == 0); + i8count = pino > XFS_DIR2_MAX_SHORT_INUM; + size = XFS_DIR2_SF_HDR_SIZE(i8count); + /* + * Make a buffer for the data. + */ + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + /* + * Fill in the header, + */ + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp->hdr.i8count = i8count; + /* + * Now can put in the inode number, since i8count is set. + */ + XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent); + sfp->hdr.count = 0; + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +int /* error */ +xfs_dir2_sf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + uio_t *uio, /* caller's buffer control */ + int *eofp, /* eof reached? (out) */ + xfs_dirent_t *dbp, /* caller's buffer */ + xfs_dir2_put_t put) /* abi's formatting function */ +{ + int error; /* error return value */ + int i; /* shortform entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_put_args_t p; /* arg package for put rtn */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_off_t dir_offset; + + mp = dp->i_mount; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + + dir_offset = uio->uio_offset; + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) { + *eofp = 1; + return 0; + } + + /* + * Set up putargs structure. + */ + p.dbp = dbp; + p.put = put; + p.uio = uio; + /* + * Put . entry unless we're starting past it. + */ + if (dir_offset <= + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOT_OFFSET)) { + p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0, + XFS_DIR2_DATA_DOTDOT_OFFSET); + p.ino = dp->i_ino; +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = "."; + p.namelen = 1; + + error = p.put(&p); + + if (!p.done) { + uio->uio_offset = + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOT_OFFSET); + return error; + } + } + + /* + * Put .. entry unless we're starting past it. + */ + if (dir_offset <= + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOTDOT_OFFSET)) { + p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_FIRST_OFFSET); + p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = ".."; + p.namelen = 2; + + error = p.put(&p); + + if (!p.done) { + uio->uio_offset = + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_DATA_DOTDOT_OFFSET); + return error; + } + } + + /* + * Loop while there are more entries and put'ing works. + */ + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + + off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_SF_GET_OFFSET(sfep)); + + if (dir_offset > off) + continue; + + p.namelen = sfep->namelen; + + p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, + XFS_DIR2_SF_GET_OFFSET(sfep) + + XFS_DIR2_DATA_ENTSIZE(p.namelen)); + + p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = (char *)sfep->name; + + error = p.put(&p); + + if (!p.done) { + uio->uio_offset = off; + return error; + } + } + + /* + * They all fit. + */ + *eofp = 1; + + uio->uio_offset = + XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); + + return 0; +} + +/* + * Lookup an entry in a shortform directory. + * Returns EEXIST if found, ENOENT if not found. + */ +int /* error */ +xfs_dir2_sf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + xfs_dir2_trace_args("sf_lookup", args); + xfs_dir2_sf_check(args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + /* + * Special case for . + */ + if (args->namelen == 1 && args->name[0] == '.') { + args->inumber = dp->i_ino; + return XFS_ERROR(EEXIST); + } + /* + * Special case for .. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + return XFS_ERROR(EEXIST); + } + /* + * Loop over all the entries trying to match ours. + */ + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + if (sfep->namelen == args->namelen && + sfep->name[0] == args->name[0] && + memcmp(args->name, sfep->name, args->namelen) == 0) { + args->inumber = + XFS_DIR2_SF_GET_INUMBER(sfp, + XFS_DIR2_SF_INUMBERP(sfep)); + return XFS_ERROR(EEXIST); + } + } + /* + * Didn't find it. + */ + ASSERT(args->oknoent); + return XFS_ERROR(ENOENT); +} + +/* + * Remove an entry from a shortform directory. + */ +int /* error */ +xfs_dir2_sf_removename( + xfs_da_args_t *args) +{ + int byteoff; /* offset of removed entry */ + xfs_inode_t *dp; /* incore directory inode */ + int entsize; /* this entry's size */ + int i; /* shortform entry index */ + int newsize; /* new inode size */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + xfs_dir2_trace_args("sf_removename", args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + oldsize = (int)dp->i_d.di_size; + /* + * Bail out if the directory is way too short. + */ + if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == oldsize); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + /* + * Loop over the old directory entries. + * Find the one we're deleting. + */ + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + if (sfep->namelen == args->namelen && + sfep->name[0] == args->name[0] && + memcmp(sfep->name, args->name, args->namelen) == 0) { + ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp, + XFS_DIR2_SF_INUMBERP(sfep)) == + args->inumber); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->hdr.count) { + return XFS_ERROR(ENOENT); + } + /* + * Calculate sizes. + */ + byteoff = (int)((char *)sfep - (char *)sfp); + entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + newsize = oldsize - entsize; + /* + * Copy the part if any after the removed entry, sliding it down. + */ + if (byteoff + entsize < oldsize) + memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize, + oldsize - (byteoff + entsize)); + /* + * Fix up the header and file size. + */ + sfp->hdr.count--; + dp->i_d.di_size = newsize; + /* + * Reallocate, making it smaller. + */ + xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; +#if XFS_BIG_INUMS + /* + * Are we changing inode number size? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + if (sfp->hdr.i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->hdr.i8count--; + } +#endif + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Replace the inode number of an entry in a shortform directory. + */ +int /* error */ +xfs_dir2_sf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ +#if XFS_BIG_INUMS || defined(DEBUG) + xfs_ino_t ino=0; /* entry old inode number */ +#endif +#if XFS_BIG_INUMS + int i8elevated; /* sf_toino8 set i8count=1 */ +#endif + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_t *sfp; /* shortform structure */ + + xfs_dir2_trace_args("sf_replace", args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the shortform directory is way too small. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); +#if XFS_BIG_INUMS + /* + * New inode number is large, and need to convert to 8-byte inodes. + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + int error; /* error return value */ + int newsize; /* new inode size */ + + newsize = + dp->i_df.if_bytes + + (sfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + /* + * Won't fit as shortform, convert to block then do replace. + */ + if (newsize > XFS_IFORK_DSIZE(dp)) { + error = xfs_dir2_sf_to_block(args); + if (error) { + return error; + } + return xfs_dir2_block_replace(args); + } + /* + * Still fits, convert to 8-byte now. + */ + xfs_dir2_sf_toino8(args); + i8elevated = 1; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + } else + i8elevated = 0; +#endif + ASSERT(args->namelen != 1 || args->name[0] != '.'); + /* + * Replace ..'s entry. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { +#if XFS_BIG_INUMS || defined(DEBUG) + ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + ASSERT(args->inumber != ino); +#endif + XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent); + } + /* + * Normal entry, look for the name. + */ + else { + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { + if (sfep->namelen == args->namelen && + sfep->name[0] == args->name[0] && + memcmp(args->name, sfep->name, args->namelen) == 0) { +#if XFS_BIG_INUMS || defined(DEBUG) + ino = XFS_DIR2_SF_GET_INUMBER(sfp, + XFS_DIR2_SF_INUMBERP(sfep)); + ASSERT(args->inumber != ino); +#endif + XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, + XFS_DIR2_SF_INUMBERP(sfep)); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->hdr.count) { + ASSERT(args->oknoent); +#if XFS_BIG_INUMS + if (i8elevated) + xfs_dir2_sf_toino4(args); +#endif + return XFS_ERROR(ENOENT); + } + } +#if XFS_BIG_INUMS + /* + * See if the old number was large, the new number is small. + */ + if (ino > XFS_DIR2_MAX_SHORT_INUM && + args->inumber <= XFS_DIR2_MAX_SHORT_INUM) { + /* + * And the old count was one, so need to convert to small. + */ + if (sfp->hdr.i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->hdr.i8count--; + } + /* + * See if the old number was small, the new number is large. + */ + if (ino <= XFS_DIR2_MAX_SHORT_INUM && + args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + /* + * add to the i8count unless we just converted to 8-byte + * inodes (which does an implied i8count = 1) + */ + ASSERT(sfp->hdr.i8count != 0); + if (!i8elevated) + sfp->hdr.i8count++; + } +#endif + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return 0; +} + +#if XFS_BIG_INUMS +/* + * Convert from 8-byte inode numbers to 4-byte inode numbers. + * The last 8-byte inode number is gone, but the count is still 1. + */ +static void +xfs_dir2_sf_toino4( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino; /* entry inode number */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_t *sfp; /* new sf directory */ + + xfs_dir2_trace_args("sf_toino4", args); + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->hdr.i8count == 1); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize - + (oldsfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_t *)buf; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->hdr.count = oldsfp->hdr.count; + sfp->hdr.i8count = 0; + ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); + XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), + oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), + oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, + XFS_DIR2_SF_INUMBERP(oldsfep)); + XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf, oldsize); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} + +/* + * Convert from 4-byte inode numbers to 8-byte inode numbers. + * The new 8-byte inode number is not there yet, we leave with the + * count 1 but no corresponding entry. + */ +static void +xfs_dir2_sf_toino8( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino; /* entry inode number */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_t *sfp; /* new sf directory */ + + xfs_dir2_trace_args("sf_toino8", args); + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->hdr.i8count == 0); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize + + (oldsfp->hdr.count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_t *)buf; + sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->hdr.count = oldsfp->hdr.count; + sfp->hdr.i8count = 1; + ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); + XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), + oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); + i < sfp->hdr.count; + i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), + oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, + XFS_DIR2_SF_INUMBERP(oldsfep)); + XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf, oldsize); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} +#endif /* XFS_BIG_INUMS */ diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h new file mode 100644 index 00000000000..bac6f5a2a31 --- /dev/null +++ b/fs/xfs/xfs_dir2_sf.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_SF_H__ +#define __XFS_DIR2_SF_H__ + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to + * fit into the literal area of the inode. + */ + +struct uio; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_dir2_block; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Maximum size of a shortform directory. + */ +#define XFS_DIR2_SF_MAX_SIZE \ + (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \ + (uint)sizeof(xfs_agino_t)) + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t; + +/* + * The parent directory has a dedicated field, and the self-pointer must + * be calculated on the fly. + * + * Entries are packed toward the top as tightly as possible. The header + * and the elements must be memcpy'd out into a work area to get correct + * alignment for the inode number fields. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __uint8_t namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __uint8_t name[1]; /* name, variable size */ + xfs_dir2_inou_t inumber; /* inode number, var. offset */ +} xfs_dir2_sf_entry_t; + +typedef struct xfs_dir2_sf { + xfs_dir2_sf_hdr_t hdr; /* shortform header */ + xfs_dir2_sf_entry_t list[1]; /* shortform entries */ +} xfs_dir2_sf_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_HDR_SIZE) +int xfs_dir2_sf_hdr_size(int i8count); +#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count) +#else +#define XFS_DIR2_SF_HDR_SIZE(i8count) \ + ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ + ((i8count) == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_INUMBERP) +xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep); +#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep) +#else +#define XFS_DIR2_SF_INUMBERP(sfep) \ + ((xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_INUMBER) +xfs_intino_t xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from); +#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \ + xfs_dir2_sf_get_inumber(sfp, from) + +#else +#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \ + ((sfp)->hdr.i8count == 0 ? \ + (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \ + (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_INUMBER) +void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, + xfs_dir2_inou_t *to); +#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \ + xfs_dir2_sf_put_inumber(sfp,from,to) +#else +#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \ + if ((sfp)->hdr.i8count == 0) { \ + XFS_PUT_DIR_INO4(*(from), (to)->i4); \ + } else { \ + XFS_PUT_DIR_INO8(*(from), (to)->i8); \ + } +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_OFFSET) +xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep); +#define XFS_DIR2_SF_GET_OFFSET(sfep) \ + xfs_dir2_sf_get_offset(sfep) +#else +#define XFS_DIR2_SF_GET_OFFSET(sfep) \ + INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_OFFSET) +void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, + xfs_dir2_data_aoff_t off); +#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \ + xfs_dir2_sf_put_offset(sfep,off) +#else +#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \ + INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i,off) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME) +int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len); +#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \ + xfs_dir2_sf_entsize_byname(sfp,len) +#else +#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) /* space a name uses */ \ + ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ + ((sfp)->hdr.i8count == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY) +int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep); +#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \ + xfs_dir2_sf_entsize_byentry(sfp,sfep) +#else +#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) /* space an entry uses */ \ + ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \ + ((sfp)->hdr.i8count == 0) * \ + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_FIRSTENTRY) +xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp); +#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp) +#else +#define XFS_DIR2_SF_FIRSTENTRY(sfp) /* first entry in struct */ \ + ((xfs_dir2_sf_entry_t *) \ + ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_NEXTENTRY) +xfs_dir2_sf_entry_t *xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, + xfs_dir2_sf_entry_t *sfep); +#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep) +#else +#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) /* next entry in struct */ \ + ((xfs_dir2_sf_entry_t *) \ + ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep))) +#endif + +/* + * Functions. + */ + +extern int + xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_block *block, + xfs_dir2_sf_hdr_t *sfhp); + +extern int + xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); + +extern int + xfs_dir2_sf_addname(struct xfs_da_args *args); + +extern int + xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); + +extern int + xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp, + struct xfs_dirent *dbp, xfs_dir2_put_t put); + +extern int + xfs_dir2_sf_lookup(struct xfs_da_args *args); + +extern int + xfs_dir2_sf_removename(struct xfs_da_args *args); + +extern int + xfs_dir2_sf_replace(struct xfs_da_args *args); + +#endif /* __XFS_DIR2_SF_H__ */ diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c new file mode 100644 index 00000000000..9d641739314 --- /dev/null +++ b/fs/xfs/xfs_dir2_trace.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir2_trace.c + * Tracing for xfs v2 directories. + */ +#include "xfs.h" + +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_trace.h" + +#ifdef XFS_DIR2_TRACE +ktrace_t *xfs_dir2_trace_buf; + +/* + * Enter something in the trace buffers. + */ +static void +xfs_dir2_trace_enter( + xfs_inode_t *dp, + int type, + char *where, + char *name, + int namelen, + void *a0, + void *a1, + void *a2, + void *a3, + void *a4, + void *a5, + void *a6, + void *a7) +{ + void *n[5]; + + ASSERT(xfs_dir2_trace_buf); + ASSERT(dp->i_dir_trace); + if (name) + memcpy(n, name, min((int)sizeof(n), namelen)); + else + memset((char *)n, 0, sizeof(n)); + ktrace_enter(xfs_dir2_trace_buf, + (void *)(long)type, (void *)where, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)(long)namelen, + (void *)n[0], (void *)n[1], (void *)n[2], + (void *)n[3], (void *)n[4]); + ktrace_enter(dp->i_dir_trace, + (void *)(long)type, (void *)where, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)(long)namelen, + (void *)n[0], (void *)n[1], (void *)n[2], + (void *)n[3], (void *)n[4]); +} + +void +xfs_dir2_trace_args( + char *where, + xfs_da_args_t *args) +{ + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, NULL, NULL); +} + +void +xfs_dir2_trace_args_b( + char *where, + xfs_da_args_t *args, + xfs_dabuf_t *bp) +{ + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, + (void *)(bp ? bp->bps[0] : NULL), NULL); +} + +void +xfs_dir2_trace_args_bb( + char *where, + xfs_da_args_t *args, + xfs_dabuf_t *lbp, + xfs_dabuf_t *dbp) +{ + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, + (void *)(lbp ? lbp->bps[0] : NULL), + (void *)(dbp ? dbp->bps[0] : NULL)); +} + +void +xfs_dir2_trace_args_bibii( + char *where, + xfs_da_args_t *args, + xfs_dabuf_t *bs, + int ss, + xfs_dabuf_t *bd, + int sd, + int c) +{ + xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL; + xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL; + + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where, + (char *)args->name, (int)args->namelen, + (void *)args->dp, (void *)args->trans, + (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd, + (void *)(long)c, NULL); +} + +void +xfs_dir2_trace_args_db( + char *where, + xfs_da_args_t *args, + xfs_dir2_db_t db, + xfs_dabuf_t *bp) +{ + xfs_buf_t *dbp = bp ? bp->bps[0] : NULL; + + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, (void *)(long)db, + (void *)dbp); +} + +void +xfs_dir2_trace_args_i( + char *where, + xfs_da_args_t *args, + xfs_ino_t i) +{ + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, + (void *)((unsigned long)(i >> 32)), + (void *)((unsigned long)(i & 0xFFFFFFFF))); +} + +void +xfs_dir2_trace_args_s( + char *where, + xfs_da_args_t *args, + int s) +{ + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL); +} + +void +xfs_dir2_trace_args_sb( + char *where, + xfs_da_args_t *args, + int s, + xfs_dabuf_t *bp) +{ + xfs_buf_t *dbp = bp ? bp->bps[0] : NULL; + + xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where, + (char *)args->name, (int)args->namelen, + (void *)(unsigned long)args->hashval, + (void *)((unsigned long)(args->inumber >> 32)), + (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), + (void *)args->dp, (void *)args->trans, + (void *)(unsigned long)args->justcheck, (void *)(long)s, + (void *)dbp); +} +#endif /* XFS_DIR2_TRACE */ diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h new file mode 100644 index 00000000000..0a178bffa80 --- /dev/null +++ b/fs/xfs/xfs_dir2_trace.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR2_TRACE_H__ +#define __XFS_DIR2_TRACE_H__ + +/* + * Tracing for xfs v2 directories. + */ + +#if defined(XFS_DIR2_TRACE) + +struct ktrace; +struct xfs_dabuf; +struct xfs_da_args; + +#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */ +#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */ +extern struct ktrace *xfs_dir2_trace_buf; + +#define XFS_DIR2_KTRACE_ARGS 1 /* args only */ +#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */ +#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */ +#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */ +#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */ +#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */ +#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */ +#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */ + +void xfs_dir2_trace_args(char *where, struct xfs_da_args *args); +void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args, + struct xfs_dabuf *bp); +void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args, + struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); +void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args, + struct xfs_dabuf *bs, int ss, + struct xfs_dabuf *bd, int sd, int c); +void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args, + xfs_dir2_db_t db, struct xfs_dabuf *bp); +void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i); +void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s); +void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s, + struct xfs_dabuf *bp); + +#else /* XFS_DIR2_TRACE */ + +#define xfs_dir2_trace_args(where, args) +#define xfs_dir2_trace_args_b(where, args, bp) +#define xfs_dir2_trace_args_bb(where, args, lbp, dbp) +#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c) +#define xfs_dir2_trace_args_db(where, args, db, bp) +#define xfs_dir2_trace_args_i(where, args, i) +#define xfs_dir2_trace_args_s(where, args, s) +#define xfs_dir2_trace_args_sb(where, args, s, bp) + +#endif /* XFS_DIR2_TRACE */ + +#endif /* __XFS_DIR2_TRACE_H__ */ diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c new file mode 100644 index 00000000000..617018d6bbd --- /dev/null +++ b/fs/xfs/xfs_dir_leaf.c @@ -0,0 +1,2231 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * xfs_dir_leaf.c + * + * GROT: figure out how to recover gracefully when bmap returns ENOSPC. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" +#include "xfs_error.h" + +/* + * xfs_dir_leaf.c + * + * Routines to implement leaf blocks of directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, + int insertion_index, + int freemap_index); +STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer, + int musthave, int justcheck); +STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + xfs_da_state_blk_t *leaf_blk_2, + int *number_entries_in_blk1, + int *number_namebytes_in_blk1); + +/* + * Utility routines. + */ +STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf, + int src_start, + xfs_dir_leafblock_t *dst_leaf, + int dst_start, int move_count, + xfs_mount_t *mp); + + +/*======================================================================== + * External routines when dirsize < XFS_IFORK_DSIZE(dp). + *========================================================================*/ + + +/* + * Validate a given inode number. + */ +int +xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino) +{ + xfs_agblock_t agblkno; + xfs_agino_t agino; + xfs_agnumber_t agno; + int ino_ok; + int ioff; + + agno = XFS_INO_TO_AGNO(mp, ino); + agblkno = XFS_INO_TO_AGBNO(mp, ino); + ioff = XFS_INO_TO_OFFSET(mp, ino); + agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); + ino_ok = + agno < mp->m_sb.sb_agcount && + agblkno < mp->m_sb.sb_agblocks && + agblkno != 0 && + ioff < (1 << mp->m_sb.sb_inopblog) && + XFS_AGINO_TO_INO(mp, agno, agino) == ino; + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, + XFS_RANDOM_DIR_INO_VALIDATE))) { + xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", + (unsigned long long) ino); + XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Create the initial contents of a shortform directory. + */ +int +xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent) +{ + xfs_dir_sf_hdr_t *hdr; + xfs_inode_t *dp; + + dp = args->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_d.di_size == 0); + if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + dp->i_df.if_flags |= XFS_IFINLINE; + } + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK); + hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data; + XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent); + + hdr->count = 0; + dp->i_d.di_size = sizeof(*hdr); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return(0); +} + +/* + * Add a name to the shortform directory structure. + * Overflow from the inode has already been checked for. + */ +int +xfs_dir_shortform_addname(xfs_da_args_t *args) +{ + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + int i, offset, size; + xfs_inode_t *dp; + + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Catch the case where the conversion from shortform to leaf + * failed part way through. + */ + if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data; + sfe = &sf->list[0]; + for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) { + if (sfe->namelen == args->namelen && + args->name[0] == sfe->name[0] && + memcmp(args->name, sfe->name, args->namelen) == 0) + return(XFS_ERROR(EEXIST)); + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + + offset = (int)((char *)sfe - (char *)sf); + size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data; + sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset); + + XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber); + sfe->namelen = args->namelen; + memcpy(sfe->name, args->name, sfe->namelen); + INT_MOD(sf->hdr.count, ARCH_CONVERT, +1); + + dp->i_d.di_size += size; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + + return(0); +} + +/* + * Remove a name from the shortform directory structure. + */ +int +xfs_dir_shortform_removename(xfs_da_args_t *args) +{ + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + int base, size = 0, i; + xfs_inode_t *dp; + + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Catch the case where the conversion from shortform to leaf + * failed part way through. + */ + if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + base = sizeof(xfs_dir_sf_hdr_t); + sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data; + sfe = &sf->list[0]; + for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) { + size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe); + if (sfe->namelen == args->namelen && + sfe->name[0] == args->name[0] && + memcmp(sfe->name, args->name, args->namelen) == 0) + break; + base += size; + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + if (i < 0) { + ASSERT(args->oknoent); + return(XFS_ERROR(ENOENT)); + } + + if ((base + size) != dp->i_d.di_size) { + memmove(&((char *)sf)[base], &((char *)sf)[base+size], + dp->i_d.di_size - (base+size)); + } + INT_MOD(sf->hdr.count, ARCH_CONVERT, -1); + + xfs_idata_realloc(dp, -size, XFS_DATA_FORK); + dp->i_d.di_size -= size; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + + return(0); +} + +/* + * Look up a name in a shortform directory structure. + */ +int +xfs_dir_shortform_lookup(xfs_da_args_t *args) +{ + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + int i; + xfs_inode_t *dp; + + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Catch the case where the conversion from shortform to leaf + * failed part way through. + */ + if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data; + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber); + return(XFS_ERROR(EEXIST)); + } + if (args->namelen == 1 && args->name[0] == '.') { + args->inumber = dp->i_ino; + return(XFS_ERROR(EEXIST)); + } + sfe = &sf->list[0]; + for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) { + if (sfe->namelen == args->namelen && + sfe->name[0] == args->name[0] && + memcmp(args->name, sfe->name, args->namelen) == 0) { + XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber); + return(XFS_ERROR(EEXIST)); + } + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + ASSERT(args->oknoent); + return(XFS_ERROR(ENOENT)); +} + +/* + * Convert from using the shortform to the leaf. + */ +int +xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs) +{ + xfs_inode_t *dp; + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + xfs_da_args_t args; + xfs_ino_t inumber; + char *tmpbuffer; + int retval, i, size; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + dp = iargs->dp; + /* + * Catch the case where the conversion from shortform to leaf + * failed part way through. + */ + if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + size = dp->i_df.if_bytes; + tmpbuffer = kmem_alloc(size, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + + memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size); + + sf = (xfs_dir_shortform_t *)tmpbuffer; + XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber); + + xfs_idata_realloc(dp, -size, XFS_DATA_FORK); + dp->i_d.di_size = 0; + xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE); + retval = xfs_da_grow_inode(iargs, &blkno); + if (retval) + goto out; + + ASSERT(blkno == 0); + retval = xfs_dir_leaf_create(iargs, blkno, &bp); + if (retval) + goto out; + xfs_da_buf_done(bp); + + args.name = "."; + args.namelen = 1; + args.hashval = xfs_dir_hash_dot; + args.inumber = dp->i_ino; + args.dp = dp; + args.firstblock = iargs->firstblock; + args.flist = iargs->flist; + args.total = iargs->total; + args.whichfork = XFS_DATA_FORK; + args.trans = iargs->trans; + args.justcheck = 0; + args.addname = args.oknoent = 1; + retval = xfs_dir_leaf_addname(&args); + if (retval) + goto out; + + args.name = ".."; + args.namelen = 2; + args.hashval = xfs_dir_hash_dotdot; + args.inumber = inumber; + retval = xfs_dir_leaf_addname(&args); + if (retval) + goto out; + + sfe = &sf->list[0]; + for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + args.name = (char *)(sfe->name); + args.namelen = sfe->namelen; + args.hashval = xfs_da_hashname((char *)(sfe->name), + sfe->namelen); + XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber); + retval = xfs_dir_leaf_addname(&args); + if (retval) + goto out; + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + retval = 0; + +out: + kmem_free(tmpbuffer, size); + return(retval); +} + +STATIC int +xfs_dir_shortform_compare(const void *a, const void *b) +{ + xfs_dir_sf_sort_t *sa, *sb; + + sa = (xfs_dir_sf_sort_t *)a; + sb = (xfs_dir_sf_sort_t *)b; + if (sa->hash < sb->hash) + return -1; + else if (sa->hash > sb->hash) + return 1; + else + return sa->entno - sb->entno; +} + +/* + * Copy out directory entries for getdents(), for shortform directories. + */ +/*ARGSUSED*/ +int +xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp, + xfs_dirent_t *dbp, xfs_dir_put_t put) +{ + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + int retval, i, sbsize, nsbuf, lastresid=0, want_entno; + xfs_mount_t *mp; + xfs_dahash_t cookhash, hash; + xfs_dir_put_args_t p; + xfs_dir_sf_sort_t *sbuf, *sbp; + + mp = dp->i_mount; + sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data; + cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset); + want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset); + nsbuf = INT_GET(sf->hdr.count, ARCH_CONVERT) + 2; + sbsize = (nsbuf + 1) * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); + + xfs_dir_trace_g_du("sf: start", dp, uio); + + /* + * Collect all the entries into the buffer. + * Entry 0 is . + */ + sbp->entno = 0; + sbp->seqno = 0; + sbp->hash = xfs_dir_hash_dot; + sbp->ino = dp->i_ino; + sbp->name = "."; + sbp->namelen = 1; + sbp++; + + /* + * Entry 1 is .. + */ + sbp->entno = 1; + sbp->seqno = 0; + sbp->hash = xfs_dir_hash_dotdot; + sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent); + sbp->name = ".."; + sbp->namelen = 2; + sbp++; + + /* + * Scan the directory data for the rest of the entries. + */ + for (i = 0, sfe = &sf->list[0]; + i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) { + + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) { + xfs_dir_trace_g_du("sf: corrupted", dp, uio); + XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents", + XFS_ERRLEVEL_LOW, mp, sfe); + kmem_free(sbuf, sbsize); + return XFS_ERROR(EFSCORRUPTED); + } + + sbp->entno = i + 2; + sbp->seqno = 0; + sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen); + sbp->ino = XFS_GET_DIR_INO8(sfe->inumber); + sbp->name = (char *)sfe->name; + sbp->namelen = sfe->namelen; + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + sbp++; + } + + /* + * Sort the entries on hash then entno. + */ + qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare); + /* + * Stuff in last entry. + */ + sbp->entno = nsbuf; + sbp->hash = XFS_DA_MAXHASH; + sbp->seqno = 0; + /* + * Figure out the sequence numbers in case there's a hash duplicate. + */ + for (hash = sbuf->hash, sbp = sbuf + 1; + sbp < &sbuf[nsbuf + 1]; sbp++) { + if (sbp->hash == hash) + sbp->seqno = sbp[-1].seqno + 1; + else + hash = sbp->hash; + } + + /* + * Set up put routine. + */ + p.dbp = dbp; + p.put = put; + p.uio = uio; + + /* + * Find our place. + */ + for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) { + if (sbp->hash > cookhash || + (sbp->hash == cookhash && sbp->seqno >= want_entno)) + break; + } + + /* + * Did we fail to find anything? We stop at the last entry, + * the one we put maxhash into. + */ + if (sbp == &sbuf[nsbuf]) { + kmem_free(sbuf, sbsize); + xfs_dir_trace_g_du("sf: hash beyond end", dp, uio); + uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH); + *eofp = 1; + return 0; + } + + /* + * Loop putting entries into the user buffer. + */ + while (sbp < &sbuf[nsbuf]) { + /* + * Save the first resid in a run of equal-hashval entries + * so that we can back them out if they don't all fit. + */ + if (sbp->seqno == 0 || sbp == sbuf) + lastresid = uio->uio_resid; + XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash); + p.ino = sbp->ino; +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = sbp->name; + p.namelen = sbp->namelen; + retval = p.put(&p); + if (!p.done) { + uio->uio_offset = + XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash); + kmem_free(sbuf, sbsize); + uio->uio_resid = lastresid; + xfs_dir_trace_g_du("sf: E-O-B", dp, uio); + return retval; + } + sbp++; + } + kmem_free(sbuf, sbsize); + uio->uio_offset = p.cook.o; + *eofp = 1; + xfs_dir_trace_g_du("sf: E-O-F", dp, uio); + return 0; +} + +/* + * Look up a name in a shortform directory structure, replace the inode number. + */ +int +xfs_dir_shortform_replace(xfs_da_args_t *args) +{ + xfs_dir_shortform_t *sf; + xfs_dir_sf_entry_t *sfe; + xfs_inode_t *dp; + int i; + + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Catch the case where the conversion from shortform to leaf + * failed part way through. + */ + if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data; + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + /* XXX - replace assert? */ + XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return(0); + } + ASSERT(args->namelen != 1 || args->name[0] != '.'); + sfe = &sf->list[0]; + for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) { + if (sfe->namelen == args->namelen && + sfe->name[0] == args->name[0] && + memcmp(args->name, sfe->name, args->namelen) == 0) { + ASSERT(memcmp((char *)&args->inumber, + (char *)&sfe->inumber, sizeof(xfs_ino_t))); + XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return(0); + } + sfe = XFS_DIR_SF_NEXTENTRY(sfe); + } + ASSERT(args->oknoent); + return(XFS_ERROR(ENOENT)); +} + +/* + * Convert a leaf directory to shortform structure + */ +int +xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_hdr_t *hdr; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + xfs_da_args_t args; + xfs_inode_t *dp; + xfs_ino_t parent; + char *tmpbuffer; + int retval, i; + xfs_dabuf_t *bp; + + dp = iargs->dp; + tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + ASSERT(tmpbuffer != NULL); + + retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp, + XFS_DATA_FORK); + if (retval) + goto out; + ASSERT(bp != NULL); + memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_dir_leafblock_t *)tmpbuffer; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + + /* + * Find and special case the parent inode number + */ + hdr = &leaf->hdr; + entry = &leaf->entries[0]; + for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) { + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + if ((entry->namelen == 2) && + (namest->name[0] == '.') && + (namest->name[1] == '.')) { + XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent); + entry->nameidx = 0; + } else if ((entry->namelen == 1) && (namest->name[0] == '.')) { + entry->nameidx = 0; + } + } + retval = xfs_da_shrink_inode(iargs, 0, bp); + if (retval) + goto out; + retval = xfs_dir_shortform_create(iargs, parent); + if (retval) + goto out; + + /* + * Copy the rest of the filenames + */ + entry = &leaf->entries[0]; + args.dp = dp; + args.firstblock = iargs->firstblock; + args.flist = iargs->flist; + args.total = iargs->total; + args.whichfork = XFS_DATA_FORK; + args.trans = iargs->trans; + args.justcheck = 0; + args.addname = args.oknoent = 1; + for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) { + if (!entry->nameidx) + continue; + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + args.name = (char *)(namest->name); + args.namelen = entry->namelen; + args.hashval = INT_GET(entry->hashval, ARCH_CONVERT); + XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber); + xfs_dir_shortform_addname(&args); + } + +out: + kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); + return(retval); +} + +/* + * Convert from using a single leaf to a root node and a leaf. + */ +int +xfs_dir_leaf_to_node(xfs_da_args_t *args) +{ + xfs_dir_leafblock_t *leaf; + xfs_da_intnode_t *node; + xfs_inode_t *dp; + xfs_dabuf_t *bp1, *bp2; + xfs_dablk_t blkno; + int retval; + + dp = args->dp; + retval = xfs_da_grow_inode(args, &blkno); + ASSERT(blkno == 1); + if (retval) + return(retval); + retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, + XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp1 != NULL); + retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2, + XFS_DATA_FORK); + if (retval) { + xfs_da_buf_done(bp1); + return(retval); + } + ASSERT(bp2 != NULL); + memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); + xfs_da_buf_done(bp1); + xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + + /* + * Set up the new root node. + */ + retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK); + if (retval) { + xfs_da_buf_done(bp2); + return(retval); + } + node = bp1->data; + leaf = bp2->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + INT_SET(node->btree[0].hashval, ARCH_CONVERT, INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)); + xfs_da_buf_done(bp2); + INT_SET(node->btree[0].before, ARCH_CONVERT, blkno); + INT_SET(node->hdr.count, ARCH_CONVERT, 1); + xfs_da_log_buf(args->trans, bp1, + XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0]))); + xfs_da_buf_done(bp1); + + return(retval); +} + + +/*======================================================================== + * Routines used for growing the Btree. + *========================================================================*/ + +/* + * Create the initial contents of a leaf directory + * or a leaf in a node directory. + */ +int +xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_hdr_t *hdr; + xfs_inode_t *dp; + xfs_dabuf_t *bp; + int retval; + + dp = args->dp; + ASSERT(dp != NULL); + retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK); + if (retval) + return(retval); + ASSERT(bp != NULL); + leaf = bp->data; + memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); + hdr = &leaf->hdr; + INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_DIR_LEAF_MAGIC); + INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount)); + if (!hdr->firstused) + INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1); + INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t)); + INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT)); + + xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + + *bpp = bp; + return(0); +} + +/* + * Split the leaf node, rebalance, then add the new entry. + */ +int +xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, + xfs_da_state_blk_t *newblk) +{ + xfs_dablk_t blkno; + xfs_da_args_t *args; + int error; + + /* + * Allocate space for a new leaf node. + */ + args = state->args; + ASSERT(args != NULL); + ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC); + error = xfs_da_grow_inode(args, &blkno); + if (error) + return(error); + error = xfs_dir_leaf_create(args, blkno, &newblk->bp); + if (error) + return(error); + newblk->blkno = blkno; + newblk->magic = XFS_DIR_LEAF_MAGIC; + + /* + * Rebalance the entries across the two leaves. + */ + xfs_dir_leaf_rebalance(state, oldblk, newblk); + error = xfs_da_blk_link(state, oldblk, newblk); + if (error) + return(error); + + /* + * Insert the new entry in the correct block. + */ + if (state->inleaf) { + error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index); + } else { + error = xfs_dir_leaf_add(newblk->bp, args, newblk->index); + } + + /* + * Update last hashval in each block since we added the name. + */ + oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL); + return(error); +} + +/* + * Add a name to the leaf directory structure. + * + * Must take into account fragmented leaves and leaves where spacemap has + * lost some freespace information (ie: holes). + */ +int +xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_hdr_t *hdr; + xfs_dir_leaf_map_t *map; + int tablesize, entsize, sum, i, tmp, error; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT))); + hdr = &leaf->hdr; + entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen); + + /* + * Search through freemap for first-fit on new name length. + * (may need to figure in size of entry struct too) + */ + tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t) + + (uint)sizeof(xfs_dir_leaf_hdr_t); + map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1]; + for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { + if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) { + sum += INT_GET(map->size, ARCH_CONVERT); + continue; + } + if (!map->size) + continue; /* no space in this map */ + tmp = entsize; + if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT)) + tmp += (uint)sizeof(xfs_dir_leaf_entry_t); + if (INT_GET(map->size, ARCH_CONVERT) >= tmp) { + if (!args->justcheck) + xfs_dir_leaf_add_work(bp, args, index, i); + return(0); + } + sum += INT_GET(map->size, ARCH_CONVERT); + } + + /* + * If there are no holes in the address space of the block, + * and we don't have enough freespace, then compaction will do us + * no good and we should just give up. + */ + if (!hdr->holes && (sum < entsize)) + return(XFS_ERROR(ENOSPC)); + + /* + * Compact the entries to coalesce free space. + * Pass the justcheck flag so the checking pass can return + * an error, without changing anything, if it won't fit. + */ + error = xfs_dir_leaf_compact(args->trans, bp, + args->total == 0 ? + entsize + + (uint)sizeof(xfs_dir_leaf_entry_t) : 0, + args->justcheck); + if (error) + return(error); + /* + * After compaction, the block is guaranteed to have only one + * free region, in freemap[0]. If it is not big enough, give up. + */ + if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) < + (entsize + (uint)sizeof(xfs_dir_leaf_entry_t))) + return(XFS_ERROR(ENOSPC)); + + if (!args->justcheck) + xfs_dir_leaf_add_work(bp, args, index, 0); + return(0); +} + +/* + * Add a name to a leaf directory structure. + */ +STATIC void +xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index, + int mapindex) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_hdr_t *hdr; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + xfs_dir_leaf_map_t *map; + /* REFERENCED */ + xfs_mount_t *mp; + int tmp, i; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + hdr = &leaf->hdr; + ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE)); + ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT))); + + /* + * Force open some space in the entry array and fill it in. + */ + entry = &leaf->entries[index]; + if (index < INT_GET(hdr->count, ARCH_CONVERT)) { + tmp = INT_GET(hdr->count, ARCH_CONVERT) - index; + tmp *= (uint)sizeof(xfs_dir_leaf_entry_t); + memmove(entry + 1, entry, tmp); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry))); + } + INT_MOD(hdr->count, ARCH_CONVERT, +1); + + /* + * Allocate space for the new string (at the end of the run). + */ + map = &hdr->freemap[mapindex]; + mp = args->trans->t_mountp; + ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp)); + ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)); + ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp)); + INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen))); + INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)); + INT_SET(entry->hashval, ARCH_CONVERT, args->hashval); + entry->namelen = args->namelen; + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + + /* + * Copy the string and inode number into the new space. + */ + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber); + memcpy(namest->name, args->name, args->namelen); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry))); + + /* + * Update the control info for this leaf node + */ + if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT)) + INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT); + ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr))); + tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t) + + (uint)sizeof(xfs_dir_leaf_hdr_t); + map = &hdr->freemap[0]; + for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) { + if (INT_GET(map->base, ARCH_CONVERT) == tmp) { + INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t)); + INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t))); + } + } + INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen); + xfs_da_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); +} + +/* + * Garbage collect a leaf directory block by copying it to a new buffer. + */ +STATIC int +xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave, + int justcheck) +{ + xfs_dir_leafblock_t *leaf_s, *leaf_d; + xfs_dir_leaf_hdr_t *hdr_s, *hdr_d; + xfs_mount_t *mp; + char *tmpbuffer; + char *tmpbuffer2=NULL; + int rval; + int lbsize; + + mp = trans->t_mountp; + lbsize = XFS_LBSIZE(mp); + tmpbuffer = kmem_alloc(lbsize, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memcpy(tmpbuffer, bp->data, lbsize); + + /* + * Make a second copy in case xfs_dir_leaf_moveents() + * below destroys the original. + */ + if (musthave || justcheck) { + tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP); + memcpy(tmpbuffer2, bp->data, lbsize); + } + memset(bp->data, 0, lbsize); + + /* + * Copy basic information + */ + leaf_s = (xfs_dir_leafblock_t *)tmpbuffer; + leaf_d = bp->data; + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + hdr_d->info = hdr_s->info; /* struct copy */ + INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize); + if (!hdr_d->firstused) + INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1); + hdr_d->namebytes = 0; + hdr_d->count = 0; + hdr_d->holes = 0; + INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t)); + INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT)); + + /* + * Copy all entry's in the same (sorted) order, + * but allocate filenames packed and in sequence. + * This changes the source (leaf_s) as well. + */ + xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp); + + if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave) + rval = XFS_ERROR(ENOSPC); + else + rval = 0; + + if (justcheck || rval == ENOSPC) { + ASSERT(tmpbuffer2); + memcpy(bp->data, tmpbuffer2, lbsize); + } else { + xfs_da_log_buf(trans, bp, 0, lbsize - 1); + } + + kmem_free(tmpbuffer, lbsize); + if (musthave || justcheck) + kmem_free(tmpbuffer2, lbsize); + return(rval); +} + +/* + * Redistribute the directory entries between two leaf nodes, + * taking into account the size of the new entry. + * + * NOTE: if new block is empty, then it will get the upper half of old block. + */ +STATIC void +xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2) +{ + xfs_da_state_blk_t *tmp_blk; + xfs_dir_leafblock_t *leaf1, *leaf2; + xfs_dir_leaf_hdr_t *hdr1, *hdr2; + int count, totallen, max, space, swap; + + /* + * Set up environment. + */ + ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC); + ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC); + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + + /* + * Check ordering of blocks, reverse if it makes things simpler. + */ + swap = 0; + if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) { + tmp_blk = blk1; + blk1 = blk2; + blk2 = tmp_blk; + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + swap = 1; + } + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. Then get + * the direction to copy and the number of elements to move. + */ + state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2, + &count, &totallen); + if (swap) + state->inleaf = !state->inleaf; + + /* + * Move any entries required from leaf to leaf: + */ + if (count < INT_GET(hdr1->count, ARCH_CONVERT)) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + count = INT_GET(hdr1->count, ARCH_CONVERT) - count; /* number entries being moved */ + space = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen; + space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1); + space += count * (uint)sizeof(xfs_dir_leaf_entry_t); + + /* + * leaf2 is the destination, compact it if it looks tight. + */ + max = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t); + max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t); + if (space > max) { + xfs_dir_leaf_compact(state->args->trans, blk2->bp, + 0, 0); + } + + /* + * Move high entries from leaf1 to low end of leaf2. + */ + xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count, + leaf2, 0, count, state->mp); + + xfs_da_log_buf(state->args->trans, blk1->bp, 0, + state->blocksize-1); + xfs_da_log_buf(state->args->trans, blk2->bp, 0, + state->blocksize-1); + + } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + count -= INT_GET(hdr1->count, ARCH_CONVERT); /* number entries being moved */ + space = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT); + space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1); + space += count * (uint)sizeof(xfs_dir_leaf_entry_t); + + /* + * leaf1 is the destination, compact it if it looks tight. + */ + max = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t); + max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t); + if (space > max) { + xfs_dir_leaf_compact(state->args->trans, blk1->bp, + 0, 0); + } + + /* + * Move low entries from leaf2 to high end of leaf1. + */ + xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT), + count, state->mp); + + xfs_da_log_buf(state->args->trans, blk1->bp, 0, + state->blocksize-1); + xfs_da_log_buf(state->args->trans, blk2->bp, 0, + state->blocksize-1); + } + + /* + * Copy out last hashval in each block for B-tree code. + */ + blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); + + /* + * Adjust the expected index for insertion. + * GROT: this doesn't work unless blk2 was originally empty. + */ + if (!state->inleaf) { + blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT); + } +} + +/* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + * GROT: Is this really necessary? With other than a 512 byte blocksize, + * GROT: there will always be enough room in either block for a new entry. + * GROT: Do a double-split for this case? + */ +STATIC int +xfs_dir_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2, + int *countarg, int *namebytesarg) +{ + xfs_dir_leafblock_t *leaf1, *leaf2; + xfs_dir_leaf_hdr_t *hdr1, *hdr2; + xfs_dir_leaf_entry_t *entry; + int count, max, totallen, half; + int lastdelta, foundit, tmp; + + /* + * Set up environment. + */ + leaf1 = blk1->bp->data; + leaf2 = blk2->bp->data; + hdr1 = &leaf1->hdr; + hdr2 = &leaf2->hdr; + foundit = 0; + totallen = 0; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + */ + max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT); + half = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1); + half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen; + half /= 2; + lastdelta = state->blocksize; + entry = &leaf1->entries[0]; + for (count = 0; count < max; entry++, count++) { + +#define XFS_DIR_ABS(A) (((A) < 0) ? -(A) : (A)) + /* + * The new entry is in the first block, account for it. + */ + if (count == blk1->index) { + tmp = totallen + (uint)sizeof(*entry) + + XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen); + if (XFS_DIR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_DIR_ABS(half - tmp); + totallen = tmp; + foundit = 1; + } + + /* + * Wrap around into the second block if necessary. + */ + if (count == INT_GET(hdr1->count, ARCH_CONVERT)) { + leaf1 = leaf2; + entry = &leaf1->entries[0]; + } + + /* + * Figure out if next leaf entry would be too much. + */ + tmp = totallen + (uint)sizeof(*entry) + + XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry); + if (XFS_DIR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_DIR_ABS(half - tmp); + totallen = tmp; +#undef XFS_DIR_ABS + } + + /* + * Calculate the number of namebytes that will end up in lower block. + * If new entry not in lower block, fix up the count. + */ + totallen -= + count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1); + if (foundit) { + totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) + + state->args->namelen; + } + + *countarg = count; + *namebytesarg = totallen; + return(foundit); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +int +xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action) +{ + xfs_dir_leafblock_t *leaf; + xfs_da_state_blk_t *blk; + xfs_da_blkinfo_t *info; + int count, bytes, forward, error, retval, i; + xfs_dablk_t blkno; + xfs_dabuf_t *bp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->data; + ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + leaf = (xfs_dir_leafblock_t *)info; + count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) + + count * (uint)sizeof(xfs_dir_leaf_entry_t) + + count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) + + INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); + if (bytes > (state->blocksize >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return(0); + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (aribtrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = info->forw; + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 2; + } + return(0); + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + forward = (INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT)); /* start with smaller blk num */ + for (i = 0; i < 2; forward = !forward, i++) { + if (forward) + blkno = INT_GET(info->forw, ARCH_CONVERT); + else + blkno = INT_GET(info->back, ARCH_CONVERT); + if (blkno == 0) + continue; + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, + XFS_DATA_FORK); + if (error) + return(error); + ASSERT(bp != NULL); + + leaf = (xfs_dir_leafblock_t *)info; + count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + bytes = state->blocksize - (state->blocksize>>2); + bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + count += INT_GET(leaf->hdr.count, ARCH_CONVERT); + bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); + bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1); + bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t); + bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t); + if (bytes >= 0) + break; /* fits with at least 25% to spare */ + + xfs_da_brelse(state->args->trans, bp); + } + if (i >= 2) { + *action = 0; + return(0); + } + xfs_da_buf_done(bp); + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return(error); + if (retval) { + *action = 0; + } else { + *action = 1; + } + return(0); +} + +/* + * Remove a name from the leaf directory structure. + * + * Return 1 if leaf is less than 37% full, 0 if >= 37% full. + * If two leaves are 37% full, when combined they will leave 25% free. + */ +int +xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_hdr_t *hdr; + xfs_dir_leaf_map_t *map; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + int before, after, smallest, entsize; + int tablesize, tmp, i; + xfs_mount_t *mp; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + hdr = &leaf->hdr; + mp = trans->t_mountp; + ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8))); + ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT))); + ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr))); + entry = &leaf->entries[index]; + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT)); + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp)); + + /* + * Scan through free region table: + * check for adjacency of free'd entry with an existing one, + * find smallest free region in case we need to replace it, + * adjust any map that borders the entry table, + */ + tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t) + + (uint)sizeof(xfs_dir_leaf_hdr_t); + map = &hdr->freemap[0]; + tmp = INT_GET(map->size, ARCH_CONVERT); + before = after = -1; + smallest = XFS_DIR_LEAF_MAPSIZE - 1; + entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry); + for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) { + ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp)); + ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp)); + if (INT_GET(map->base, ARCH_CONVERT) == tablesize) { + INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t))); + INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t)); + } + + if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) { + before = i; + } else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) { + after = i; + } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) { + tmp = INT_GET(map->size, ARCH_CONVERT); + smallest = i; + } + } + + /* + * Coalesce adjacent freemap regions, + * or replace the smallest region. + */ + if ((before >= 0) || (after >= 0)) { + if ((before >= 0) && (after >= 0)) { + map = &hdr->freemap[before]; + INT_MOD(map->size, ARCH_CONVERT, entsize); + INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT)); + hdr->freemap[after].base = 0; + hdr->freemap[after].size = 0; + } else if (before >= 0) { + map = &hdr->freemap[before]; + INT_MOD(map->size, ARCH_CONVERT, entsize); + } else { + map = &hdr->freemap[after]; + INT_COPY(map->base, entry->nameidx, ARCH_CONVERT); + INT_MOD(map->size, ARCH_CONVERT, entsize); + } + } else { + /* + * Replace smallest region (if it is smaller than free'd entry) + */ + map = &hdr->freemap[smallest]; + if (INT_GET(map->size, ARCH_CONVERT) < entsize) { + INT_COPY(map->base, entry->nameidx, ARCH_CONVERT); + INT_SET(map->size, ARCH_CONVERT, entsize); + } + } + + /* + * Did we remove the first entry? + */ + if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT)) + smallest = 1; + else + smallest = 0; + + /* + * Compress the remaining entries and zero out the removed stuff. + */ + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + memset((char *)namest, 0, entsize); + xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize)); + + INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen)); + tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t); + memmove(entry, entry + 1, tmp); + INT_MOD(hdr->count, ARCH_CONVERT, -1); + xfs_da_log_buf(trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry))); + entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)]; + memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t)); + + /* + * If we removed the first entry, re-find the first used byte + * in the name area. Note that if the entry was the "firstused", + * then we don't have a "hole" in our block resulting from + * removing the name. + */ + if (smallest) { + tmp = XFS_LBSIZE(mp); + entry = &leaf->entries[0]; + for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) { + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT)); + ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp)); + if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp) + tmp = INT_GET(entry->nameidx, ARCH_CONVERT); + } + INT_SET(hdr->firstused, ARCH_CONVERT, tmp); + if (!hdr->firstused) + INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1); + } else { + hdr->holes = 1; /* mark as needing compaction */ + } + + xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + + /* + * Check if leaf is less than 50% full, caller may want to + * "join" the leaf with a sibling if so. + */ + tmp = (uint)sizeof(xfs_dir_leaf_hdr_t); + tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t); + tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1); + tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT); + if (tmp < mp->m_dir_magicpct) + return(1); /* leaf is < 37% full */ + return(0); +} + +/* + * Move all the directory entries from drop_leaf into save_leaf. + */ +void +xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk) +{ + xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; + xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; + xfs_mount_t *mp; + char *tmpbuffer; + + /* + * Set up environment. + */ + mp = state->mp; + ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC); + ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC); + drop_leaf = drop_blk->bp->data; + save_leaf = save_blk->bp->data; + ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + drop_hdr = &drop_leaf->hdr; + save_hdr = &save_leaf->hdr; + + /* + * Save last hashval from dying block for later Btree fixup. + */ + drop_blk->hashval = INT_GET(drop_leaf->entries[ drop_leaf->hdr.count-1 ].hashval, ARCH_CONVERT); + + /* + * Check if we need a temp buffer, or can we do it in place. + * Note that we don't check "leaf" for holes because we will + * always be dropping it, toosmall() decided that for us already. + */ + if (save_hdr->holes == 0) { + /* + * dest leaf has no holes, so we add there. May need + * to make some room in the entry array. + */ + if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0, + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp); + } else { + xfs_dir_leaf_moveents(drop_leaf, 0, + save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT), + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp); + } + } else { + /* + * Destination has holes, so we make a temporary copy + * of the leaf and add them both to that. + */ + tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); + ASSERT(tmpbuffer != NULL); + memset(tmpbuffer, 0, state->blocksize); + tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer; + tmp_hdr = &tmp_leaf->hdr; + tmp_hdr->info = save_hdr->info; /* struct copy */ + tmp_hdr->count = 0; + INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize); + if (!tmp_hdr->firstused) + INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1); + tmp_hdr->namebytes = 0; + if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) { + xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp); + xfs_dir_leaf_moveents(save_leaf, 0, + tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT), + (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp); + } else { + xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0, + (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp); + xfs_dir_leaf_moveents(drop_leaf, 0, + tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT), + (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp); + } + memcpy(save_leaf, tmp_leaf, state->blocksize); + kmem_free(tmpbuffer, state->blocksize); + } + + xfs_da_log_buf(state->args->trans, save_blk->bp, 0, + state->blocksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Look up a name in a leaf directory structure. + * This is the internal routine, it uses the caller's buffer. + * + * Note that duplicate keys are allowed, but only check within the + * current leaf node. The Btree code must check in adjacent leaf nodes. + * + * Return in *index the index into the entry[] array of either the found + * entry, or where the entry should have been (insert before that entry). + * + * Don't change the args->inumber unless we find the filename. + */ +int +xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + int probe, span; + xfs_dahash_t hashval; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8)); + + /* + * Binary search. (note: small blocks will skip this loop) + */ + hashval = args->hashval; + probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2; + for (entry = &leaf->entries[probe]; span > 4; + entry = &leaf->entries[probe]) { + span /= 2; + if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval) + probe += span; + else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && \ + ((!leaf->hdr.count) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)))); + ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first matching + * hashval in the leaf. + */ + while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) { + entry--; + probe--; + } + while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) { + entry++; + probe++; + } + if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) { + *index = probe; + ASSERT(args->oknoent); + return(XFS_ERROR(ENOENT)); + } + + /* + * Duplicate keys may be present, so search all of them for a match. + */ + while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) { + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT)); + if (entry->namelen == args->namelen && + namest->name[0] == args->name[0] && + memcmp(args->name, namest->name, args->namelen) == 0) { + XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber); + *index = probe; + return(XFS_ERROR(EEXIST)); + } + entry++; + probe++; + } + *index = probe; + ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent); + return(XFS_ERROR(ENOENT)); +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. + */ +/* ARGSUSED */ +STATIC void +xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s, + xfs_dir_leafblock_t *leaf_d, int start_d, + int count, xfs_mount_t *mp) +{ + xfs_dir_leaf_hdr_t *hdr_s, *hdr_d; + xfs_dir_leaf_entry_t *entry_s, *entry_d; + int tmp, i; + + /* + * Check for nothing to do. + */ + if (count == 0) + return; + + /* + * Set up environment. + */ + ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + hdr_s = &leaf_s->hdr; + hdr_d = &leaf_d->hdr; + ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8))); + ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >= + ((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s))); + ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)); + ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= + ((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d))); + + ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT)); + ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT)); + ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT)); + + /* + * Move the entries in the destination leaf up to make a hole? + */ + if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) { + tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d; + tmp *= (uint)sizeof(xfs_dir_leaf_entry_t); + entry_s = &leaf_d->entries[start_d]; + entry_d = &leaf_d->entries[start_d + count]; + memcpy(entry_d, entry_s, tmp); + } + + /* + * Copy all entry's in the same (sorted) order, + * but allocate filenames packed and in sequence. + */ + entry_s = &leaf_s->entries[start_s]; + entry_d = &leaf_d->entries[start_d]; + for (i = 0; i < count; entry_s++, entry_d++, i++) { + ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT)); + tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s); + INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp)); + entry_d->hashval = entry_s->hashval; /* INT_: direct copy */ + INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT); + entry_d->namelen = entry_s->namelen; + ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp)); + memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)), + XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp); + ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp)); + memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), + 0, tmp); + INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen)); + INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen); + INT_MOD(hdr_s->count, ARCH_CONVERT, -1); + INT_MOD(hdr_d->count, ARCH_CONVERT, +1); + tmp = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t) + + (uint)sizeof(xfs_dir_leaf_hdr_t); + ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp); + + } + + /* + * Zero out the entries we just copied. + */ + if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) { + tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t); + entry_s = &leaf_s->entries[start_s]; + ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp)); + memset((char *)entry_s, 0, tmp); + } else { + /* + * Move the remaining entries down to fill the hole, + * then zero the entries at the top. + */ + tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count; + tmp *= (uint)sizeof(xfs_dir_leaf_entry_t); + entry_s = &leaf_s->entries[start_s + count]; + entry_d = &leaf_s->entries[start_s]; + memcpy(entry_d, entry_s, tmp); + + tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t); + entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)]; + ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp)); + memset((char *)entry_s, 0, tmp); + } + + /* + * Fill in the freemap information + */ + INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t)); + INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)); + INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT)); + INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0)); + INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0)); + hdr_s->holes = 1; /* leaf may not be compact */ +} + +/* + * Compare two leaf blocks "order". + */ +int +xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) +{ + xfs_dir_leafblock_t *leaf1, *leaf2; + + leaf1 = leaf1_bp->data; + leaf2 = leaf2_bp->data; + ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) && + (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC)); + if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) && + ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) < + INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) || + (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) < + INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) { + return(1); + } + return(0); +} + +/* + * Pick up the last hashvalue from a leaf block. + */ +xfs_dahash_t +xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count) +{ + xfs_dir_leafblock_t *leaf; + + leaf = bp->data; + ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC); + if (count) + *count = INT_GET(leaf->hdr.count, ARCH_CONVERT); + if (!leaf->hdr.count) + return(0); + return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)); +} + +/* + * Copy out directory entries for getdents(), for leaf directories. + */ +int +xfs_dir_leaf_getdents_int( + xfs_dabuf_t *bp, + xfs_inode_t *dp, + xfs_dablk_t bno, + uio_t *uio, + int *eobp, + xfs_dirent_t *dbp, + xfs_dir_put_t put, + xfs_daddr_t nextda) +{ + xfs_dir_leafblock_t *leaf; + xfs_dir_leaf_entry_t *entry; + xfs_dir_leaf_name_t *namest; + int entno, want_entno, i, nextentno; + xfs_mount_t *mp; + xfs_dahash_t cookhash; + xfs_dahash_t nexthash = 0; +#if (BITS_PER_LONG == 32) + xfs_dahash_t lasthash = XFS_DA_MAXHASH; +#endif + xfs_dir_put_args_t p; + + mp = dp->i_mount; + leaf = bp->data; + if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) { + *eobp = 1; + return(XFS_ERROR(ENOENT)); /* XXX wrong code */ + } + + want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset); + + cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset); + + xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf); + + /* + * Re-find our place. + */ + for (i = entno = 0, entry = &leaf->entries[0]; + i < INT_GET(leaf->hdr.count, ARCH_CONVERT); + entry++, i++) { + + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, + INT_GET(entry->nameidx, ARCH_CONVERT)); + + if (unlikely( + ((char *)namest < (char *)leaf) || + ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) { + XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)", + XFS_ERRLEVEL_LOW, mp, leaf); + xfs_dir_trace_g_du("leaf: corrupted", dp, uio); + return XFS_ERROR(EFSCORRUPTED); + } + if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) { + if ( entno < want_entno + && INT_GET(entry->hashval, ARCH_CONVERT) + == cookhash) { + /* + * Trying to get to a particular offset in a + * run of equal-hashval entries. + */ + entno++; + } else if ( want_entno > 0 + && entno == want_entno + && INT_GET(entry->hashval, ARCH_CONVERT) + == cookhash) { + break; + } else { + entno = 0; + break; + } + } + } + + if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) { + xfs_dir_trace_g_du("leaf: hash not found", dp, uio); + if (!INT_GET(leaf->hdr.info.forw, ARCH_CONVERT)) + uio->uio_offset = + XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH); + /* + * Don't set uio_offset if there's another block: + * the node code will be setting uio_offset anyway. + */ + *eobp = 0; + return(0); + } + xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry); + + p.dbp = dbp; + p.put = put; + p.uio = uio; + + /* + * We're synchronized, start copying entries out to the user. + */ + for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT); + entry++, i++, (entno = nextentno)) { + int lastresid=0, retval; + xfs_dircook_t lastoffset; + xfs_dahash_t thishash; + + /* + * Check for a damaged directory leaf block and pick up + * the inode number from this entry. + */ + namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, + INT_GET(entry->nameidx, ARCH_CONVERT)); + + if (unlikely( + ((char *)namest < (char *)leaf) || + ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) { + XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)", + XFS_ERRLEVEL_LOW, mp, leaf); + xfs_dir_trace_g_du("leaf: corrupted", dp, uio); + return XFS_ERROR(EFSCORRUPTED); + } + + xfs_dir_trace_g_duc("leaf: middle cookie ", + dp, uio, p.cook.o); + + if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) { + nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT); + + if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT)) + nextentno = entno + 1; + else + nextentno = 0; + XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash); + xfs_dir_trace_g_duc("leaf: middle cookie ", + dp, uio, p.cook.o); + + } else if ((thishash = INT_GET(leaf->hdr.info.forw, + ARCH_CONVERT))) { + xfs_dabuf_t *bp2; + xfs_dir_leafblock_t *leaf2; + + ASSERT(nextda != -1); + + retval = xfs_da_read_buf(dp->i_transp, dp, thishash, + nextda, &bp2, XFS_DATA_FORK); + if (retval) + return(retval); + + ASSERT(bp2 != NULL); + + leaf2 = bp2->data; + + if (unlikely( + (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) + != XFS_DIR_LEAF_MAGIC) + || (INT_GET(leaf2->hdr.info.back, ARCH_CONVERT) + != bno))) { /* GROT */ + XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)", + XFS_ERRLEVEL_LOW, mp, + leaf2); + xfs_da_brelse(dp->i_transp, bp2); + + return(XFS_ERROR(EFSCORRUPTED)); + } + + nexthash = INT_GET(leaf2->entries[0].hashval, + ARCH_CONVERT); + nextentno = -1; + XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash); + xfs_da_brelse(dp->i_transp, bp2); + xfs_dir_trace_g_duc("leaf: next blk cookie", + dp, uio, p.cook.o); + } else { + nextentno = -1; + XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH); + } + + /* + * Save off the cookie so we can fall back should the + * 'put' into the outgoing buffer fails. To handle a run + * of equal-hashvals, the off_t structure on 64bit + * builds has entno built into the cookie to ID the + * entry. On 32bit builds, we only have space for the + * hashval so we can't ID specific entries within a group + * of same hashval entries. For this, lastoffset is set + * to the first in the run of equal hashvals so we don't + * include any entries unless we can include all entries + * that share the same hashval. Hopefully the buffer + * provided is big enough to handle it (see pv763517). + */ +#if (BITS_PER_LONG == 32) + if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT)) + != lasthash) { + XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash); + lastresid = uio->uio_resid; + lasthash = thishash; + } else { + xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped", + dp, uio, p.cook.o); + } +#else + thishash = INT_GET(entry->hashval, ARCH_CONVERT); + XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash); + lastresid = uio->uio_resid; +#endif /* BITS_PER_LONG == 32 */ + + /* + * Put the current entry into the outgoing buffer. If we fail + * then restore the UIO to the first entry in the current + * run of equal-hashval entries (probably one 1 entry long). + */ + p.ino = XFS_GET_DIR_INO8(namest->inumber); +#if XFS_BIG_INUMS + p.ino += mp->m_inoadd; +#endif + p.name = (char *)namest->name; + p.namelen = entry->namelen; + + retval = p.put(&p); + + if (!p.done) { + uio->uio_offset = lastoffset.o; + uio->uio_resid = lastresid; + + *eobp = 1; + + xfs_dir_trace_g_du("leaf: E-O-B", dp, uio); + + return(retval); + } + } + + uio->uio_offset = p.cook.o; + + *eobp = 0; + + xfs_dir_trace_g_du("leaf: E-O-F", dp, uio); + + return(0); +} + +/* + * Format a dirent64 structure and copy it out the the user's buffer. + */ +int +xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa) +{ + iovec_t *iovp; + int reclen, namelen; + xfs_dirent_t *idbp; + uio_t *uio; + + namelen = pa->namelen; + reclen = DIRENTSIZE(namelen); + uio = pa->uio; + if (reclen > uio->uio_resid) { + pa->done = 0; + return 0; + } + iovp = uio->uio_iov; + idbp = (xfs_dirent_t *)iovp->iov_base; + iovp->iov_base = (char *)idbp + reclen; + iovp->iov_len -= reclen; + uio->uio_resid -= reclen; + idbp->d_reclen = reclen; + idbp->d_ino = pa->ino; + idbp->d_off = pa->cook.o; + idbp->d_name[namelen] = '\0'; + pa->done = 1; + memcpy(idbp->d_name, pa->name, namelen); + return 0; +} + +/* + * Format a dirent64 structure and copy it out the the user's buffer. + */ +int +xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa) +{ + int retval, reclen, namelen; + xfs_dirent_t *idbp; + uio_t *uio; + + namelen = pa->namelen; + reclen = DIRENTSIZE(namelen); + uio = pa->uio; + if (reclen > uio->uio_resid) { + pa->done = 0; + return 0; + } + idbp = pa->dbp; + idbp->d_reclen = reclen; + idbp->d_ino = pa->ino; + idbp->d_off = pa->cook.o; + idbp->d_name[namelen] = '\0'; + memcpy(idbp->d_name, pa->name, namelen); + retval = uio_read((caddr_t)idbp, reclen, uio); + pa->done = (retval == 0); + return retval; +} diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h new file mode 100644 index 00000000000..00d68d33cc7 --- /dev/null +++ b/fs/xfs/xfs_dir_leaf.h @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR_LEAF_H__ +#define __XFS_DIR_LEAF_H__ + +/* + * Directory layout, internal structure, access macros, etc. + * + * Large directories are structured around Btrees where all the data + * elements are in the leaf nodes. Filenames are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of a filename may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + */ + +struct uio; +struct xfs_bmap_free; +struct xfs_dabuf; +struct xfs_da_args; +struct xfs_da_state; +struct xfs_da_state_blk; +struct xfs_dir_put_args; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/*======================================================================== + * Directory Structure when equal to XFS_LBSIZE(mp) bytes. + *========================================================================*/ + +/* + * This is the structure of the leaf nodes in the Btree. + * + * Struct leaf_entry's are packed from the top. Names grow from the bottom + * but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the namelist area and try again. If we + * still don't have enough space, then we have to split the block. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual string. The root and intermediate node search always takes + * the first-in-the-block key match found, so we should only have to work + * "forw"ard. If none matches, continue with the "forw"ard leaf nodes + * until the hash key changes or the filename is found. + * + * The parent directory and the self-pointer are explicitly represented + * (ie: there are entries for "." and ".."). + * + * Note that the count being a __uint16_t limits us to something like a + * blocksize of 1.3MB in the face of worst case (short) filenames. + */ +#define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_dir_leafblock { + struct xfs_dir_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __uint16_t count; /* count of active leaf_entry's */ + __uint16_t namebytes; /* num bytes of name strings stored */ + __uint16_t firstused; /* first used byte in name area */ + __uint8_t holes; /* != 0 if blk needs compaction */ + __uint8_t pad1; + struct xfs_dir_leaf_map {/* RLE map of free bytes */ + __uint16_t base; /* base of free region */ + __uint16_t size; /* run length of free region */ + } freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */ + } hdr; + struct xfs_dir_leaf_entry { /* sorted on key, not name */ + xfs_dahash_t hashval; /* hash value of name */ + __uint16_t nameidx; /* index into buffer of name */ + __uint8_t namelen; /* length of name string */ + __uint8_t pad2; + } entries[1]; /* var sized array */ + struct xfs_dir_leaf_name { + xfs_dir_ino_t inumber; /* inode number for this key */ + __uint8_t name[1]; /* name string itself */ + } namelist[1]; /* grows from bottom of buf */ +} xfs_dir_leafblock_t; +typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t; +typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t; +typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t; +typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t; + +/* + * Length of name for which a 512-byte block filesystem + * can get a double split. + */ +#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN \ + (512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \ + (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \ + (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1) + +typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa); + +typedef union { + xfs_off_t o; /* offset (cookie) */ + /* + * Watch the order here (endian-ness dependent). + */ + struct { +#if __BYTE_ORDER == __LITTLE_ENDIAN + xfs_dahash_t h; /* hash value */ + __uint32_t be; /* block and entry */ +#else /* __BYTE_ORDER == __BIG_ENDIAN */ + __uint32_t be; /* block and entry */ + xfs_dahash_t h; /* hash value */ +#endif /* __BYTE_ORDER == __BIG_ENDIAN */ + } s; +} xfs_dircook_t; + +#define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \ + ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash)) + +typedef struct xfs_dir_put_args +{ + xfs_dircook_t cook; /* cookie of (next) entry */ + xfs_intino_t ino; /* inode number */ + struct xfs_dirent *dbp; /* buffer pointer */ + char *name; /* directory entry name */ + int namelen; /* length of name */ + int done; /* output: set if value was stored */ + xfs_dir_put_t put; /* put function ptr (i/o) */ + struct uio *uio; /* uio control structure */ +} xfs_dir_put_args_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME) +int xfs_dir_leaf_entsize_byname(int len); +#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) xfs_dir_leaf_entsize_byname(len) +#else +#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) /* space a name will use */ \ + ((uint)sizeof(xfs_dir_leaf_name_t)-1 + len) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY) +int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry); +#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) \ + xfs_dir_leaf_entsize_byentry(entry) +#else +#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) /* space an entry will use */ \ + ((uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_NAMESTRUCT) +xfs_dir_leaf_name_t * +xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset); +#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) \ + xfs_dir_leaf_namestruct(leafp,offset) +#else +#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) /* point to name struct */ \ + ((xfs_dir_leaf_name_t *)&((char *)(leafp))[offset]) +#endif + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when dirsize < XFS_LITINO(mp). + */ +int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent); +int xfs_dir_shortform_addname(struct xfs_da_args *args); +int xfs_dir_shortform_lookup(struct xfs_da_args *args); +int xfs_dir_shortform_to_leaf(struct xfs_da_args *args); +int xfs_dir_shortform_removename(struct xfs_da_args *args); +int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp, + struct xfs_dirent *dbp, xfs_dir_put_t put); +int xfs_dir_shortform_replace(struct xfs_da_args *args); + +/* + * Internal routines when dirsize == XFS_LBSIZE(mp). + */ +int xfs_dir_leaf_to_node(struct xfs_da_args *args); +int xfs_dir_leaf_to_shortform(struct xfs_da_args *args); + +/* + * Routines used for growing the Btree. + */ +int xfs_dir_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, + struct xfs_dabuf **bpp); +int xfs_dir_leaf_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +int xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args, int insertion_index); +int xfs_dir_leaf_addname(struct xfs_da_args *args); +int xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer, + struct xfs_da_args *args, + int *index_found_at); +int xfs_dir_leaf_remove(struct xfs_trans *trans, + struct xfs_dabuf *leaf_buffer, + int index_to_remove); +int xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp, + xfs_dablk_t bno, struct uio *uio, + int *eobp, struct xfs_dirent *dbp, + xfs_dir_put_t put, xfs_daddr_t nextda); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_dir_leaf_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); + +/* + * Utility routines. + */ +uint xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count); +int xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp, + struct xfs_dabuf *leaf2_bp); +int xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa); +int xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa); +int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + + +/* + * Global data. + */ +extern xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +#endif /* __XFS_DIR_LEAF_H__ */ diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h new file mode 100644 index 00000000000..a61bcfc2a87 --- /dev/null +++ b/fs/xfs/xfs_dir_sf.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DIR_SF_H__ +#define __XFS_DIR_SF_H__ + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to + * fit into the literal area of the inode. + */ + +typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t; + +/* + * The parent directory has a dedicated field, and the self-pointer must + * be calculated on the fly. + * + * Entries are packed toward the top as tight as possible. The header + * and the elements much be memcpy'd out into a work area to get correct + * alignment for the inode number fields. + */ +typedef struct xfs_dir_shortform { + struct xfs_dir_sf_hdr { /* constant-structure header block */ + xfs_dir_ino_t parent; /* parent dir inode number */ + __uint8_t count; /* count of active entries */ + } hdr; + struct xfs_dir_sf_entry { + xfs_dir_ino_t inumber; /* referenced inode number */ + __uint8_t namelen; /* actual length of name (no NULL) */ + __uint8_t name[1]; /* name */ + } list[1]; /* variable sized array */ +} xfs_dir_shortform_t; +typedef struct xfs_dir_sf_hdr xfs_dir_sf_hdr_t; +typedef struct xfs_dir_sf_entry xfs_dir_sf_entry_t; + +/* + * We generate this then sort it, so that readdirs are returned in + * hash-order. Else seekdir won't work. + */ +typedef struct xfs_dir_sf_sort { + __uint8_t entno; /* .=0, ..=1, else entry# + 2 */ + __uint8_t seqno; /* sequence # with same hash value */ + __uint8_t namelen; /* length of name value (no null) */ + xfs_dahash_t hash; /* this entry's hash value */ + xfs_intino_t ino; /* this entry's inode number */ + char *name; /* name value, pointer into buffer */ +} xfs_dir_sf_sort_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_GET_DIRINO) +void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to); +#define XFS_DIR_SF_GET_DIRINO(from,to) xfs_dir_sf_get_dirino(from, to) +#else +#define XFS_DIR_SF_GET_DIRINO(from,to) (*(to) = XFS_GET_DIR_INO8(*from)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_PUT_DIRINO) +void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to); +#define XFS_DIR_SF_PUT_DIRINO(from,to) xfs_dir_sf_put_dirino(from, to) +#else +#define XFS_DIR_SF_PUT_DIRINO(from,to) XFS_PUT_DIR_INO8(*(from), *(to)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME) +int xfs_dir_sf_entsize_byname(int len); +#define XFS_DIR_SF_ENTSIZE_BYNAME(len) xfs_dir_sf_entsize_byname(len) +#else +#define XFS_DIR_SF_ENTSIZE_BYNAME(len) /* space a name uses */ \ + ((uint)sizeof(xfs_dir_sf_entry_t)-1 + (len)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY) +int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep); +#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) xfs_dir_sf_entsize_byentry(sfep) +#else +#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) /* space an entry uses */ \ + ((uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_NEXTENTRY) +xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep); +#define XFS_DIR_SF_NEXTENTRY(sfep) xfs_dir_sf_nextentry(sfep) +#else +#define XFS_DIR_SF_NEXTENTRY(sfep) /* next entry in struct */ \ + ((xfs_dir_sf_entry_t *) \ + ((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ALLFIT) +int xfs_dir_sf_allfit(int count, int totallen); +#define XFS_DIR_SF_ALLFIT(count,totallen) \ + xfs_dir_sf_allfit(count,totallen) +#else +#define XFS_DIR_SF_ALLFIT(count,totallen) /* will all entries fit? */ \ + ((uint)sizeof(xfs_dir_sf_hdr_t) + \ + ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen)) +#endif + +#if defined(XFS_DIR_TRACE) + +/* + * Kernel tracing support for directories. + */ +struct uio; +struct xfs_inode; +struct xfs_da_intnode; +struct xfs_dinode; +struct xfs_dir_leafblock; +struct xfs_dir_leaf_entry; + +#define XFS_DIR_TRACE_SIZE 4096 /* size of global trace buffer */ +extern ktrace_t *xfs_dir_trace_buf; + +/* + * Trace record types. + */ +#define XFS_DIR_KTRACE_G_DU 1 /* dp, uio */ +#define XFS_DIR_KTRACE_G_DUB 2 /* dp, uio, bno */ +#define XFS_DIR_KTRACE_G_DUN 3 /* dp, uio, node */ +#define XFS_DIR_KTRACE_G_DUL 4 /* dp, uio, leaf */ +#define XFS_DIR_KTRACE_G_DUE 5 /* dp, uio, leaf entry */ +#define XFS_DIR_KTRACE_G_DUC 6 /* dp, uio, cookie */ + +void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio); +void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio, + xfs_dablk_t bno); +void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio, + struct xfs_da_intnode *node); +void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio, + struct xfs_dir_leafblock *leaf); +void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio, + struct xfs_dir_leaf_entry *entry); +void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio, + xfs_off_t cookie); +void xfs_dir_trace_enter(int type, char *where, + void *a0, void *a1, void *a2, void *a3, + void *a4, void *a5, void *a6, void *a7, + void *a8, void *a9, void *a10, void *a11); +#else +#define xfs_dir_trace_g_du(w,d,u) +#define xfs_dir_trace_g_dub(w,d,u,b) +#define xfs_dir_trace_g_dun(w,d,u,n) +#define xfs_dir_trace_g_dul(w,d,u,l) +#define xfs_dir_trace_g_due(w,d,u,e) +#define xfs_dir_trace_g_duc(w,d,u,c) +#endif /* DEBUG */ + +#endif /* __XFS_DIR_SF_H__ */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h new file mode 100644 index 00000000000..55ae3e67d24 --- /dev/null +++ b/fs/xfs/xfs_dmapi.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_DMAPI_H__ +#define __XFS_DMAPI_H__ + +/* Values used to define the on-disk version of dm_attrname_t. All + * on-disk attribute names start with the 8-byte string "SGI_DMI_". + * + * In the on-disk inode, DMAPI attribute names consist of the user-provided + * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be + * changed. + */ + +#define DMATTR_PREFIXLEN 8 +#define DMATTR_PREFIXSTRING "SGI_DMI_" + +typedef enum { + DM_EVENT_INVALID = -1, + DM_EVENT_CANCEL = 0, /* not supported */ + DM_EVENT_MOUNT = 1, + DM_EVENT_PREUNMOUNT = 2, + DM_EVENT_UNMOUNT = 3, + DM_EVENT_DEBUT = 4, /* not supported */ + DM_EVENT_CREATE = 5, + DM_EVENT_CLOSE = 6, /* not supported */ + DM_EVENT_POSTCREATE = 7, + DM_EVENT_REMOVE = 8, + DM_EVENT_POSTREMOVE = 9, + DM_EVENT_RENAME = 10, + DM_EVENT_POSTRENAME = 11, + DM_EVENT_LINK = 12, + DM_EVENT_POSTLINK = 13, + DM_EVENT_SYMLINK = 14, + DM_EVENT_POSTSYMLINK = 15, + DM_EVENT_READ = 16, + DM_EVENT_WRITE = 17, + DM_EVENT_TRUNCATE = 18, + DM_EVENT_ATTRIBUTE = 19, + DM_EVENT_DESTROY = 20, + DM_EVENT_NOSPACE = 21, + DM_EVENT_USER = 22, + DM_EVENT_MAX = 23 +} dm_eventtype_t; +#define HAVE_DM_EVENTTYPE_T + +typedef enum { + DM_RIGHT_NULL, + DM_RIGHT_SHARED, + DM_RIGHT_EXCL +} dm_right_t; +#define HAVE_DM_RIGHT_T + +/* Defines for determining if an event message should be sent. */ +#define DM_EVENT_ENABLED(vfsp, ip, event) ( \ + unlikely ((vfsp)->vfs_flag & VFS_DMI) && \ + ( ((ip)->i_d.di_dmevmask & (1 << event)) || \ + ((ip)->i_mount->m_dmevmask & (1 << event)) ) \ + ) + +#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \ + unlikely ((vfsp)->vfs_flag & VFS_DMI) && \ + ( ((io)->io_dmevmask & (1 << event)) || \ + ((io)->io_mount->m_dmevmask & (1 << event)) ) \ + ) + +#define DM_XFS_VALID_FS_EVENTS ( \ + (1 << DM_EVENT_PREUNMOUNT) | \ + (1 << DM_EVENT_UNMOUNT) | \ + (1 << DM_EVENT_NOSPACE) | \ + (1 << DM_EVENT_DEBUT) | \ + (1 << DM_EVENT_CREATE) | \ + (1 << DM_EVENT_POSTCREATE) | \ + (1 << DM_EVENT_REMOVE) | \ + (1 << DM_EVENT_POSTREMOVE) | \ + (1 << DM_EVENT_RENAME) | \ + (1 << DM_EVENT_POSTRENAME) | \ + (1 << DM_EVENT_LINK) | \ + (1 << DM_EVENT_POSTLINK) | \ + (1 << DM_EVENT_SYMLINK) | \ + (1 << DM_EVENT_POSTSYMLINK) | \ + (1 << DM_EVENT_ATTRIBUTE) | \ + (1 << DM_EVENT_DESTROY) ) + +/* Events valid in dm_set_eventlist() when called with a file handle for + a regular file or a symlink. These events are persistent. +*/ + +#define DM_XFS_VALID_FILE_EVENTS ( \ + (1 << DM_EVENT_ATTRIBUTE) | \ + (1 << DM_EVENT_DESTROY) ) + +/* Events valid in dm_set_eventlist() when called with a file handle for + a directory. These events are persistent. +*/ + +#define DM_XFS_VALID_DIRECTORY_EVENTS ( \ + (1 << DM_EVENT_CREATE) | \ + (1 << DM_EVENT_POSTCREATE) | \ + (1 << DM_EVENT_REMOVE) | \ + (1 << DM_EVENT_POSTREMOVE) | \ + (1 << DM_EVENT_RENAME) | \ + (1 << DM_EVENT_POSTRENAME) | \ + (1 << DM_EVENT_LINK) | \ + (1 << DM_EVENT_POSTLINK) | \ + (1 << DM_EVENT_SYMLINK) | \ + (1 << DM_EVENT_POSTSYMLINK) | \ + (1 << DM_EVENT_ATTRIBUTE) | \ + (1 << DM_EVENT_DESTROY) ) + +/* Events supported by the XFS filesystem. */ +#define DM_XFS_SUPPORTED_EVENTS ( \ + (1 << DM_EVENT_MOUNT) | \ + (1 << DM_EVENT_PREUNMOUNT) | \ + (1 << DM_EVENT_UNMOUNT) | \ + (1 << DM_EVENT_NOSPACE) | \ + (1 << DM_EVENT_CREATE) | \ + (1 << DM_EVENT_POSTCREATE) | \ + (1 << DM_EVENT_REMOVE) | \ + (1 << DM_EVENT_POSTREMOVE) | \ + (1 << DM_EVENT_RENAME) | \ + (1 << DM_EVENT_POSTRENAME) | \ + (1 << DM_EVENT_LINK) | \ + (1 << DM_EVENT_POSTLINK) | \ + (1 << DM_EVENT_SYMLINK) | \ + (1 << DM_EVENT_POSTSYMLINK) | \ + (1 << DM_EVENT_READ) | \ + (1 << DM_EVENT_WRITE) | \ + (1 << DM_EVENT_TRUNCATE) | \ + (1 << DM_EVENT_ATTRIBUTE) | \ + (1 << DM_EVENT_DESTROY) ) + + +/* + * Definitions used for the flags field on dm_send_*_event(). + */ + +#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ +#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ +#define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,21) +/* i_alloc_sem was added in 2.4.22-pre1 */ +#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ +#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ +#endif +#endif + +/* + * Based on IO_ISDIRECT, decide which i_ flag is set. + */ +#ifdef DM_FLAGS_IALLOCSEM_RD +#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ + DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) +#else +#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ + 0 : DM_FLAGS_ISEM) +#define DM_SEM_FLAG_WR (DM_FLAGS_ISEM) +#endif + +/* + * Macros to turn caller specified delay/block flags into + * dm_send_xxxx_event flag DM_FLAGS_NDELAY. + */ + +#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ + DM_FLAGS_NDELAY : 0) +#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) + + +extern struct bhv_vfsops xfs_dmops; + +#ifdef CONFIG_XFS_DMAPI +void xfs_dm_init(struct file_system_type *); +void xfs_dm_exit(struct file_system_type *); +#define XFS_DM_INIT(fstype) xfs_dm_init(fstype) +#define XFS_DM_EXIT(fstype) xfs_dm_exit(fstype) +#else +#define XFS_DM_INIT(fstype) +#define XFS_DM_EXIT(fstype) +#endif + +#endif /* __XFS_DMAPI_H__ */ diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c new file mode 100644 index 00000000000..cec54ba800e --- /dev/null +++ b/fs/xfs/xfs_dmops.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + +xfs_dmops_t xfs_dmcore_stub = { + .xfs_send_data = (xfs_send_data_t)fs_nosys, + .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr, + .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys, + .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys, + .xfs_send_unmount = (xfs_send_unmount_t)fs_noval, +}; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c new file mode 100644 index 00000000000..bbe1dea11c0 --- /dev/null +++ b/fs/xfs/xfs_error.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_sb.h" +#include "xfs_trans.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_utils.h" +#include "xfs_error.h" + +#ifdef DEBUG + +int xfs_etrap[XFS_ERROR_NTRAP] = { + 0, +}; + +int +xfs_error_trap(int e) +{ + int i; + + if (!e) + return 0; + for (i = 0; i < XFS_ERROR_NTRAP; i++) { + if (xfs_etrap[i] == 0) + break; + if (e != xfs_etrap[i]) + continue; + cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); + debug_stop_all_cpus((void *)-1LL); + BUG(); + break; + } + return e; +} +#endif + +#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) + +int xfs_etest[XFS_NUM_INJECT_ERROR]; +int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; +char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; + +void +xfs_error_test_init(void) +{ + memset(xfs_etest, 0, sizeof(xfs_etest)); + memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid)); + memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname)); +} + +int +xfs_error_test(int error_tag, int *fsidp, char *expression, + int line, char *file, unsigned long randfactor) +{ + int i; + int64_t fsid; + + if (random() % randfactor) + return 0; + + memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { + cmn_err(CE_WARN, + "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, xfs_etest_fsname[i]); + return 1; + } + } + + return 0; +} + +int +xfs_errortag_add(int error_tag, xfs_mount_t *mp) +{ + int i; + int len; + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { + cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); + return 0; + } + } + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == 0) { + cmn_err(CE_WARN, "Turned on XFS error tag #%d", + error_tag); + xfs_etest[i] = error_tag; + xfs_etest_fsid[i] = fsid; + len = strlen(mp->m_fsname); + xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); + strcpy(xfs_etest_fsname[i], mp->m_fsname); + return 0; + } + } + + cmn_err(CE_WARN, "error tag overflow, too many turned on"); + + return 1; +} + +int +xfs_errortag_clear(int error_tag, xfs_mount_t *mp) +{ + int i; + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { + xfs_etest[i] = 0; + xfs_etest_fsid[i] = 0LL; + kmem_free(xfs_etest_fsname[i], + strlen(xfs_etest_fsname[i]) + 1); + xfs_etest_fsname[i] = NULL; + cmn_err(CE_WARN, "Cleared XFS error tag #%d", + error_tag); + return 0; + } + } + + cmn_err(CE_WARN, "XFS error tag %d not on", error_tag); + + return 1; +} + +int +xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud) +{ + int i; + int cleared = 0; + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && + xfs_etest[i] != 0) { + cleared = 1; + cmn_err(CE_WARN, "Clearing XFS error tag #%d", + xfs_etest[i]); + xfs_etest[i] = 0; + xfs_etest_fsid[i] = 0LL; + kmem_free(xfs_etest_fsname[i], + strlen(xfs_etest_fsname[i]) + 1); + xfs_etest_fsname[i] = NULL; + } + } + + if (loud || cleared) + cmn_err(CE_WARN, + "Cleared all XFS error tags for filesystem \"%s\"", + fsname); + + return 0; +} + +int +xfs_errortag_clearall(xfs_mount_t *mp) +{ + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1); +} +#endif /* DEBUG || INDUCE_IO_ERROR */ + +static void +xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) +{ + if (mp != NULL) { + char *newfmt; + int len = 16 + mp->m_fsname_len + strlen(fmt); + + newfmt = kmem_alloc(len, KM_SLEEP); + sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); + icmn_err(level, newfmt, ap); + kmem_free(newfmt, len); + } else { + icmn_err(level, fmt, ap); + } +} + +void +xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + xfs_fs_vcmn_err(level, mp, fmt, ap); + va_end(ap); +} + +void +xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) +{ + va_list ap; + +#ifdef DEBUG + xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; +#endif + + if (xfs_panic_mask && (xfs_panic_mask & panic_tag) + && (level & CE_ALERT)) { + level &= ~CE_ALERT; + level |= CE_PANIC; + cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); + } + va_start(ap, fmt); + xfs_fs_vcmn_err(level, mp, fmt, ap); + va_end(ap); +} + +void +xfs_error_report( + char *tag, + int level, + xfs_mount_t *mp, + char *fname, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) { + xfs_cmn_err(XFS_PTAG_ERROR_REPORT, + CE_ALERT, mp, + "XFS internal error %s at line %d of file %s. Caller 0x%p\n", + tag, linenum, fname, ra); + + xfs_stack_trace(); + } +} + +void +xfs_hex_dump(void *p, int length) +{ + __uint8_t *uip = (__uint8_t*)p; + int i; + char sbuf[128], *s; + + s = sbuf; + *s = '\0'; + for (i=0; im_fixedfsid, #expr, __LINE__, __FILE__, \ + (rf))) +#else +#define XFS_TEST_ERROR(expr, mp, tag, rf) \ + ((expr) || \ + xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ + (rf))) +#endif /* __ANSI_CPP__ */ + +int xfs_errortag_add(int error_tag, xfs_mount_t *mp); +int xfs_errortag_clear(int error_tag, xfs_mount_t *mp); + +int xfs_errortag_clearall(xfs_mount_t *mp); +int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, + int loud); +#else +#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) +#define xfs_errortag_add(tag, mp) (ENOSYS) +#define xfs_errortag_clearall(mp) (ENOSYS) +#endif /* (DEBUG || INDUCE_IO_ERROR) */ + +/* + * XFS panic tags -- allow a call to xfs_cmn_err() be turned into + * a panic by setting xfs_panic_mask in a + * sysctl. update xfs_max[XFS_PARAM] if + * more are added. + */ +#define XFS_NO_PTAG 0 +#define XFS_PTAG_IFLUSH 0x00000001 +#define XFS_PTAG_LOGRES 0x00000002 +#define XFS_PTAG_AILDELETE 0x00000004 +#define XFS_PTAG_ERROR_REPORT 0x00000008 +#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 +#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 +#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 + +struct xfs_mount; +/* PRINTFLIKE4 */ +void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, + char *fmt, ...); +/* PRINTFLIKE3 */ +void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); + +#endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c new file mode 100644 index 00000000000..5eafd5b6321 --- /dev/null +++ b/fs/xfs/xfs_extfree_item.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains the implementation of the xfs_efi_log_item + * and xfs_efd_log_item items. + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + + +kmem_zone_t *xfs_efi_zone; +kmem_zone_t *xfs_efd_zone; + +STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); +STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *); +STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *); + + + +/* + * This returns the number of iovecs needed to log the given efi item. + * We only need 1 iovec for an efi item. It just logs the efi_log_format + * structure. + */ +/*ARGSUSED*/ +STATIC uint +xfs_efi_item_size(xfs_efi_log_item_t *efip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efi log item. We use only 1 iovec, and we point that + * at the efi_log_format structure embedded in the efi item. + * It is at this point that we assert that all of the extent + * slots in the efi item have been filled. + */ +STATIC void +xfs_efi_item_format(xfs_efi_log_item_t *efip, + xfs_log_iovec_t *log_vector) +{ + uint size; + + ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); + + efip->efi_format.efi_type = XFS_LI_EFI; + + size = sizeof(xfs_efi_log_format_t); + size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); + efip->efi_format.efi_size = 1; + + log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); + log_vector->i_len = size; + ASSERT(size >= sizeof(xfs_efi_log_format_t)); +} + + +/* + * Pinning has no meaning for an efi item, so just return. + */ +/*ARGSUSED*/ +STATIC void +xfs_efi_item_pin(xfs_efi_log_item_t *efip) +{ + return; +} + + +/* + * While EFIs cannot really be pinned, the unpin operation is the + * last place at which the EFI is manipulated during a transaction. + * Here we coordinate with xfs_efi_cancel() to determine who gets to + * free the EFI. + */ +/*ARGSUSED*/ +STATIC void +xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) +{ + int nexts; + int size; + xfs_mount_t *mp; + SPLDECL(s); + + mp = efip->efi_item.li_mountp; + AIL_LOCK(mp, s); + if (efip->efi_flags & XFS_EFI_CANCELED) { + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + + nexts = efip->efi_format.efi_nextents; + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efi_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efip, size); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } + } else { + efip->efi_flags |= XFS_EFI_COMMITTED; + AIL_UNLOCK(mp, s); + } + + return; +} + +/* + * like unpin only we have to also clear the xaction descriptor + * pointing the log item if we free the item. This routine duplicates + * unpin because efi_flags is protected by the AIL lock. Freeing + * the descriptor and then calling unpin would force us to drop the AIL + * lock which would open up a race condition. + */ +STATIC void +xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) +{ + int nexts; + int size; + xfs_mount_t *mp; + xfs_log_item_desc_t *lidp; + SPLDECL(s); + + mp = efip->efi_item.li_mountp; + AIL_LOCK(mp, s); + if (efip->efi_flags & XFS_EFI_CANCELED) { + /* + * free the xaction descriptor pointing to this item + */ + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); + xfs_trans_free_item(tp, lidp); + /* + * pull the item off the AIL. + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + /* + * now free the item itself + */ + nexts = efip->efi_format.efi_nextents; + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efi_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efip, size); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } + } else { + efip->efi_flags |= XFS_EFI_COMMITTED; + AIL_UNLOCK(mp, s); + } + + return; +} + +/* + * Efi items have no locking or pushing. However, since EFIs are + * pulled from the AIL when their corresponding EFDs are committed + * to disk, their situation is very similar to being pinned. Return + * XFS_ITEM_PINNED so that the caller will eventually flush the log. + * This should help in getting the EFI out of the AIL. + */ +/*ARGSUSED*/ +STATIC uint +xfs_efi_item_trylock(xfs_efi_log_item_t *efip) +{ + return XFS_ITEM_PINNED; +} + +/* + * Efi items have no locking, so just return. + */ +/*ARGSUSED*/ +STATIC void +xfs_efi_item_unlock(xfs_efi_log_item_t *efip) +{ + if (efip->efi_item.li_flags & XFS_LI_ABORTED) + xfs_efi_item_abort(efip); + return; +} + +/* + * The EFI is logged only once and cannot be moved in the log, so + * simply return the lsn at which it's been logged. The canceled + * flag is not paid any attention here. Checking for that is delayed + * until the EFI is unpinned. + */ +/*ARGSUSED*/ +STATIC xfs_lsn_t +xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * This is called when the transaction logging the EFI is aborted. + * Free up the EFI and return. No need to clean up the slot for + * the item in the transaction. That was done by the unpin code + * which is called prior to this routine in the abort/fs-shutdown path. + */ +STATIC void +xfs_efi_item_abort(xfs_efi_log_item_t *efip) +{ + int nexts; + int size; + + nexts = efip->efi_format.efi_nextents; + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efi_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efip, size); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } + return; +} + +/* + * There isn't much you can do to push on an efi item. It is simply + * stuck waiting for all of its corresponding efd items to be + * committed to disk. + */ +/*ARGSUSED*/ +STATIC void +xfs_efi_item_push(xfs_efi_log_item_t *efip) +{ + return; +} + +/* + * The EFI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +/*ARGSUSED*/ +STATIC void +xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +{ + return; +} + +/* + * This is the ops vector shared by all efi log items. + */ +struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_efi_item_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, + .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) + xfs_efi_item_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_efi_item_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort, + .iop_pushbuf = NULL, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_efi_item_committing +}; + + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +xfs_efi_log_item_t * +xfs_efi_init(xfs_mount_t *mp, + uint nextents) + +{ + xfs_efi_log_item_t *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efi_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP); + } else { + efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone, + KM_SLEEP); + } + + efip->efi_item.li_type = XFS_LI_EFI; + efip->efi_item.li_ops = &xfs_efi_item_ops; + efip->efi_item.li_mountp = mp; + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (__psint_t)(void*)efip; + + return (efip); +} + +/* + * This is called by the efd item code below to release references to + * the given efi item. Each efd calls this with the number of + * extents that it has logged, and when the sum of these reaches + * the total number of extents logged by this efi item we can free + * the efi item. + * + * Freeing the efi item requires that we remove it from the AIL. + * We'll use the AIL lock to protect our counters as well as + * the removal from the AIL. + */ +void +xfs_efi_release(xfs_efi_log_item_t *efip, + uint nextents) +{ + xfs_mount_t *mp; + int extents_left; + uint size; + int nexts; + SPLDECL(s); + + mp = efip->efi_item.li_mountp; + ASSERT(efip->efi_next_extent > 0); + ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); + + AIL_LOCK(mp, s); + ASSERT(efip->efi_next_extent >= nextents); + efip->efi_next_extent -= nextents; + extents_left = efip->efi_next_extent; + if (extents_left == 0) { + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + } else { + AIL_UNLOCK(mp, s); + } + + if (extents_left == 0) { + nexts = efip->efi_format.efi_nextents; + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efi_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efip, size); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } + } +} + +/* + * This is called when the transaction that should be committing the + * EFD corresponding to the given EFI is aborted. The committed and + * canceled flags are used to coordinate the freeing of the EFI and + * the references by the transaction that committed it. + */ +STATIC void +xfs_efi_cancel( + xfs_efi_log_item_t *efip) +{ + int nexts; + int size; + xfs_mount_t *mp; + SPLDECL(s); + + mp = efip->efi_item.li_mountp; + AIL_LOCK(mp, s); + if (efip->efi_flags & XFS_EFI_COMMITTED) { + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + + nexts = efip->efi_format.efi_nextents; + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efi_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efip, size); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } + } else { + efip->efi_flags |= XFS_EFI_CANCELED; + AIL_UNLOCK(mp, s); + } + + return; +} + + + + + +/* + * This returns the number of iovecs needed to log the given efd item. + * We only need 1 iovec for an efd item. It just logs the efd_log_format + * structure. + */ +/*ARGSUSED*/ +STATIC uint +xfs_efd_item_size(xfs_efd_log_item_t *efdp) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efd log item. We use only 1 iovec, and we point that + * at the efd_log_format structure embedded in the efd item. + * It is at this point that we assert that all of the extent + * slots in the efd item have been filled. + */ +STATIC void +xfs_efd_item_format(xfs_efd_log_item_t *efdp, + xfs_log_iovec_t *log_vector) +{ + uint size; + + ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + + efdp->efd_format.efd_type = XFS_LI_EFD; + + size = sizeof(xfs_efd_log_format_t); + size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); + efdp->efd_format.efd_size = 1; + + log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); + log_vector->i_len = size; + ASSERT(size >= sizeof(xfs_efd_log_format_t)); +} + + +/* + * Pinning has no meaning for an efd item, so just return. + */ +/*ARGSUSED*/ +STATIC void +xfs_efd_item_pin(xfs_efd_log_item_t *efdp) +{ + return; +} + + +/* + * Since pinning has no meaning for an efd item, unpinning does + * not either. + */ +/*ARGSUSED*/ +STATIC void +xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) +{ + return; +} + +/*ARGSUSED*/ +STATIC void +xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp) +{ + return; +} + +/* + * Efd items have no locking, so just return success. + */ +/*ARGSUSED*/ +STATIC uint +xfs_efd_item_trylock(xfs_efd_log_item_t *efdp) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Efd items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +/*ARGSUSED*/ +STATIC void +xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) +{ + if (efdp->efd_item.li_flags & XFS_LI_ABORTED) + xfs_efd_item_abort(efdp); + return; +} + +/* + * When the efd item is committed to disk, all we need to do + * is delete our reference to our partner efi item and then + * free ourselves. Since we're freeing ourselves we must + * return -1 to keep the transaction code from further referencing + * this item. + */ +/*ARGSUSED*/ +STATIC xfs_lsn_t +xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) +{ + uint size; + int nexts; + + /* + * If we got a log I/O error, it's always the case that the LR with the + * EFI got unpinned and freed before the EFD got aborted. + */ + if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) + xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + + nexts = efdp->efd_format.efd_nextents; + if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efd_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efdp, size); + } else { + kmem_zone_free(xfs_efd_zone, efdp); + } + + return (xfs_lsn_t)-1; +} + +/* + * The transaction of which this EFD is a part has been aborted. + * Inform its companion EFI of this fact and then clean up after + * ourselves. No need to clean up the slot for the item in the + * transaction. That was done by the unpin code which is called + * prior to this routine in the abort/fs-shutdown path. + */ +STATIC void +xfs_efd_item_abort(xfs_efd_log_item_t *efdp) +{ + int nexts; + int size; + + /* + * If we got a log I/O error, it's always the case that the LR with the + * EFI got unpinned and freed before the EFD got aborted. So don't + * reference the EFI at all in that case. + */ + if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) + xfs_efi_cancel(efdp->efd_efip); + + nexts = efdp->efd_format.efd_nextents; + if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { + size = sizeof(xfs_efd_log_item_t); + size += (nexts - 1) * sizeof(xfs_extent_t); + kmem_free(efdp, size); + } else { + kmem_zone_free(xfs_efd_zone, efdp); + } + return; +} + +/* + * There isn't much you can do to push on an efd item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +/*ARGSUSED*/ +STATIC void +xfs_efd_item_push(xfs_efd_log_item_t *efdp) +{ + return; +} + +/* + * The EFD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +/*ARGSUSED*/ +STATIC void +xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) +{ + return; +} + +/* + * This is the ops vector shared by all efd log items. + */ +struct xfs_item_ops xfs_efd_item_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_efd_item_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, + .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) + xfs_efd_item_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_efd_item_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort, + .iop_pushbuf = NULL, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_efd_item_committing +}; + + +/* + * Allocate and initialize an efd item with the given number of extents. + */ +xfs_efd_log_item_t * +xfs_efd_init(xfs_mount_t *mp, + xfs_efi_log_item_t *efip, + uint nextents) + +{ + xfs_efd_log_item_t *efdp; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efd_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP); + } else { + efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone, + KM_SLEEP); + } + + efdp->efd_item.li_type = XFS_LI_EFD; + efdp->efd_item.li_ops = &xfs_efd_item_ops; + efdp->efd_item.li_mountp = mp; + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = nextents; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return (efdp); +} diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h new file mode 100644 index 00000000000..7122d6101d1 --- /dev/null +++ b/fs/xfs/xfs_extfree_item.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_EXTFREE_ITEM_H__ +#define __XFS_EXTFREE_ITEM_H__ + +struct xfs_mount; +struct kmem_zone; + +typedef struct xfs_extent { + xfs_dfsbno_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. + */ +typedef struct xfs_efi_log_format { + unsigned short efi_type; /* efi log item type */ + unsigned short efi_size; /* size of this item */ + uint efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + unsigned short efd_type; /* efd log item type */ + unsigned short efd_size; /* size of this item */ + uint efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + + +#ifdef __KERNEL__ + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFI_MAX_FAST_EXTENTS 16 + +/* + * Define EFI flags. + */ +#define XFS_EFI_RECOVERED 0x1 +#define XFS_EFI_COMMITTED 0x2 +#define XFS_EFI_CANCELED 0x4 + +/* + * This is the "extent free intention" log item. It is used + * to log the fact that some extents need to be free. It is + * used in conjunction with the "extent free done" log item + * described below. + */ +typedef struct xfs_efi_log_item { + xfs_log_item_t efi_item; + uint efi_flags; /* misc flags */ + uint efi_next_extent; + xfs_efi_log_format_t efi_format; +} xfs_efi_log_item_t; + +/* + * This is the "extent free done" log item. It is used to log + * the fact that some extents earlier mentioned in an efi item + * have been freed. + */ +typedef struct xfs_efd_log_item { + xfs_log_item_t efd_item; + xfs_efi_log_item_t *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; +} xfs_efd_log_item_t; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFD_MAX_FAST_EXTENTS 16 + +extern struct kmem_zone *xfs_efi_zone; +extern struct kmem_zone *xfs_efd_zone; + +xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); +xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, + uint); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h new file mode 100644 index 00000000000..6ee8443bf9d --- /dev/null +++ b/fs/xfs/xfs_fs.h @@ -0,0 +1,527 @@ +/* + * Copyright (c) 1995-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, + * USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_FS_H__ +#define __XFS_FS_H__ + +/* + * SGI's XFS filesystem's major stuff (constants, structures) + */ + +#define XFS_NAME "xfs" + +/* + * Direct I/O attribute record used with XFS_IOC_DIOINFO + * d_miniosz is the min xfer size, xfer size multiple and file seek offset + * alignment. + */ +#ifndef HAVE_DIOATTR +struct dioattr { + __u32 d_mem; /* data buffer memory alignment */ + __u32 d_miniosz; /* min xfer size */ + __u32 d_maxiosz; /* max xfer size */ +}; +#endif + +/* + * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. + */ +#ifndef HAVE_FSXATTR +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + unsigned char fsx_pad[16]; +}; +#endif + +/* + * Flags for the bs_xflags/fsx_xflags field + * There should be a one-to-one correspondence between these flags and the + * XFS_DIFLAG_s. + */ +#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + +/* + * Structure for XFS_IOC_GETBMAP. + * On input, fill in bmv_offset and bmv_length of the first structure + * to indicate the area of interest in the file, and bmv_entry with the + * number of array elements given. The first structure is updated on + * return to give the offset and length for the next call. + */ +#ifndef HAVE_GETBMAP +struct getbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output) */ +}; +#endif + +/* + * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries + * are used exactly as in the getbmap structure. The getbmapx structure + * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field + * is only used for the first structure. It contains input flags + * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled + * in by the XFS_IOC_GETBMAPX command for each returned structure after + * the first. + */ +#ifndef HAVE_GETBMAPX +struct getbmapx { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output). */ + __s32 bmv_iflags; /* input flags (1st structure) */ + __s32 bmv_oflags; /* output flags (after 1st structure)*/ + __s32 bmv_unused1; /* future use */ + __s32 bmv_unused2; /* future use */ +}; +#endif + +/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */ +#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ +#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) +#ifdef __KERNEL__ +#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */ +#endif + +/* bmv_oflags values - returned for for each non-header segment */ +#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ + +/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ +#define GETBMAP_CONVERT(p1,p2) { \ + p2.bmv_offset = p1.bmv_offset; \ + p2.bmv_block = p1.bmv_block; \ + p2.bmv_length = p1.bmv_length; \ + p2.bmv_count = p1.bmv_count; \ + p2.bmv_entries = p1.bmv_entries; } + + +/* + * Structure for XFS_IOC_FSSETDM. + * For use by backup and restore programs to set the XFS on-disk inode + * fields di_dmevmask and di_dmstate. These must be set to exactly and + * only values previously obtained via xfs_bulkstat! (Specifically the + * xfs_bstat_t fields bs_dmevmask and bs_dmstate.) + */ +#ifndef HAVE_FSDMIDATA +struct fsdmidata { + __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ + __u16 fsd_padding; + __u16 fsd_dmstate; /* corresponds to di_dmstate */ +}; +#endif + +/* + * File segment locking set data type for 64 bit access. + * Also used for all the RESV/FREE interfaces. + */ +typedef struct xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} xfs_flock64_t; + +/* + * Output for XFS_IOC_FSGEOMETRY_V1 + */ +typedef struct xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} xfs_fsop_geom_v1_t; + +/* + * Output for XFS_IOC_FSGEOMETRY + */ +typedef struct xfs_fsop_geom { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +} xfs_fsop_geom_t; + +/* Output for XFS_FS_COUNTS */ +typedef struct xfs_fsop_counts { + __u64 freedata; /* free data section blocks */ + __u64 freertx; /* free rt extents */ + __u64 freeino; /* free inodes */ + __u64 allocino; /* total allocated inodes */ +} xfs_fsop_counts_t; + +/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */ +typedef struct xfs_fsop_resblks { + __u64 resblks; + __u64 resblks_avail; +} xfs_fsop_resblks_t; + +#define XFS_FSOP_GEOM_VERSION 0 + +#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ + + +/* + * Minimum and maximum sizes need for growth checks + */ +#define XFS_MIN_AG_BLOCKS 64 +#define XFS_MIN_LOG_BLOCKS 512 +#define XFS_MAX_LOG_BLOCKS (64 * 1024) +#define XFS_MIN_LOG_BYTES (256 * 1024) +#define XFS_MAX_LOG_BYTES (128 * 1024 * 1024) + +/* + * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT + */ +typedef struct xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} xfs_growfs_data_t; + +typedef struct xfs_growfs_log { + __u32 newblocks; /* new log size, fsblocks */ + __u32 isint; /* 1 if new log is internal */ +} xfs_growfs_log_t; + +typedef struct xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} xfs_growfs_rt_t; + + +/* + * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE + */ +typedef struct xfs_bstime { + time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} xfs_bstime_t; + +typedef struct xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + xfs_bstime_t bs_atime; /* access time */ + xfs_bstime_t bs_mtime; /* modify time */ + xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid; /* project id */ + unsigned char bs_pad[14]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} xfs_bstat_t; + +/* + * The user-level BulkStat Request interface structure. + */ +typedef struct xfs_fsop_bulkreq { + __u64 __user *lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + void __user *ubuffer;/* user buffer for inode desc. */ + __s32 __user *ocount; /* output count pointer */ +} xfs_fsop_bulkreq_t; + + +/* + * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS). + */ +typedef struct xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} xfs_inogrp_t; + + +/* + * Error injection. + */ +typedef struct xfs_error_injection { + __s32 fd; + __s32 errtag; +} xfs_error_injection_t; + + +/* + * The user-level Handle Request interface structure. + */ +typedef struct xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + void __user *path; /* user pathname */ + __u32 oflags; /* open flags */ + void __user *ihandle;/* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + void __user *ohandle;/* user buffer for handle */ + __u32 __user *ohandlen;/* user buffer length */ +} xfs_fsop_handlereq_t; + +/* + * Compound structures for passing args through Handle Request interfaces + * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and + * XFS_IOC_ATTRMULTI_BY_HANDLE + */ + +typedef struct xfs_fsop_setdm_handlereq { + struct xfs_fsop_handlereq hreq; /* handle information */ + struct fsdmidata __user *data; /* DMAPI data */ +} xfs_fsop_setdm_handlereq_t; + +typedef struct xfs_attrlist_cursor { + __u32 opaque[4]; +} xfs_attrlist_cursor_t; + +typedef struct xfs_fsop_attrlist_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + void __user *buffer; /* returned names */ +} xfs_fsop_attrlist_handlereq_t; + +typedef struct xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + void __user *am_attrname; + void __user *am_attrvalue; + __u32 am_length; + __u32 am_flags; +} xfs_attr_multiop_t; + +typedef struct xfs_fsop_attrmulti_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + struct xfs_attr_multiop __user *ops; /* attr_multi data */ +} xfs_fsop_attrmulti_handlereq_t; + +/* + * per machine unique filesystem identifier types. + */ +typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ + + +#ifndef HAVE_FID +#define MAXFIDSZ 46 + +typedef struct fid { + __u16 fid_len; /* length of data in bytes */ + unsigned char fid_data[MAXFIDSZ]; /* data (fid_len worth) */ +} fid_t; +#endif + +typedef struct xfs_fid { + __u16 xfs_fid_len; /* length of remainder */ + __u16 xfs_fid_pad; + __u32 xfs_fid_gen; /* generation number */ + __u64 xfs_fid_ino; /* 64 bits inode number */ +} xfs_fid_t; + +typedef struct xfs_fid2 { + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; /* padding, must be zero */ + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* inode number */ +} xfs_fid2_t; + +typedef struct xfs_handle { + union { + __s64 align; /* force alignment of ha_fid */ + xfs_fsid_t _ha_fsid; /* unique file system identifier */ + } ha_u; + xfs_fid_t ha_fid; /* file system specific file ID */ +} xfs_handle_t; +#define ha_fsid ha_u._ha_fsid + +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.xfs_fid_pad \ + - (char *) &(handle)) \ + + (handle).ha_fid.xfs_fid_len) + +#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t)) + +#define FSHSIZE sizeof(fsid_t) + +/* + * Flags for going down operation + */ +#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * ioctl commands that are used by Linux filesystems + */ +#define XFS_IOC_GETXFLAGS _IOR('f', 1, long) +#define XFS_IOC_SETXFLAGS _IOW('f', 2, long) +#define XFS_IOC_GETVERSION _IOR('v', 1, long) + +/* + * ioctl commands that replace IRIX fcntl()'s + * For 'documentation' purposed more than anything else, + * the "cmd #" field reflects the IRIX fcntl number. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) +#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) +#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) +#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) +#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64) +#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap) +#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr) +/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ +/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ +#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) + +/* + * ioctl commands that replace IRIX syssgi()'s + */ +#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1) +#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq) +#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq) +#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext) +#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data) +#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log) +#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt) +#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts) +#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks) +#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks) +#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) +#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) +/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ +#define XFS_IOC_FREEZE _IOWR('X', 119, int) +#define XFS_IOC_THAW _IOWR('X', 120, int) +#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) +#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +/* XFS_IOC_GETFSUUID ---------- deprecated 140 */ + + +#ifndef HAVE_BBMACROS +/* + * Block I/O parameterization. A basic block (BB) is the lowest size of + * filesystem allocation, and must equal 512. Length units given to bio + * routines are in BB's. + */ +#define BBSHIFT 9 +#define BBSIZE (1<> BBSHIFT) +#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT) +#define BBTOB(bbs) ((bbs) << BBSHIFT) +#endif + +#endif /* __XFS_FS_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c new file mode 100644 index 00000000000..21213057c27 --- /dev/null +++ b/fs/xfs/xfs_fsops.c @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_fsops.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_refcache.h" +#include "xfs_trans_space.h" +#include "xfs_rtalloc.h" +#include "xfs_dir2.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" + +/* + * File system operations + */ + +int +xfs_fs_geometry( + xfs_mount_t *mp, + xfs_fsop_geom_t *geo, + int new_version) +{ + geo->blocksize = mp->m_sb.sb_blocksize; + geo->rtextsize = mp->m_sb.sb_rextsize; + geo->agblocks = mp->m_sb.sb_agblocks; + geo->agcount = mp->m_sb.sb_agcount; + geo->logblocks = mp->m_sb.sb_logblocks; + geo->sectsize = mp->m_sb.sb_sectsize; + geo->inodesize = mp->m_sb.sb_inodesize; + geo->imaxpct = mp->m_sb.sb_imax_pct; + geo->datablocks = mp->m_sb.sb_dblocks; + geo->rtblocks = mp->m_sb.sb_rblocks; + geo->rtextents = mp->m_sb.sb_rextents; + geo->logstart = mp->m_sb.sb_logstart; + ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid)); + memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid)); + if (new_version >= 2) { + geo->sunit = mp->m_sb.sb_unit; + geo->swidth = mp->m_sb.sb_width; + } + if (new_version >= 3) { + geo->version = XFS_FSOP_GEOM_VERSION; + geo->flags = + (XFS_SB_VERSION_HASATTR(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR : 0) | + (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_NLINK : 0) | + (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | + (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | + (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | + (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SHARED : 0) | + (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | + (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | + (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SECTOR : 0); + geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? + mp->m_sb.sb_logsectsize : BBSIZE; + geo->rtsectsize = mp->m_sb.sb_blocksize; + geo->dirblocksize = mp->m_dirblksize; + } + if (new_version >= 4) { + geo->flags |= + (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); + geo->logsunit = mp->m_sb.sb_logsunit; + } + return 0; +} + +static int +xfs_growfs_data_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_data_t *in) /* growfs data input struct */ +{ + xfs_agf_t *agf; + xfs_agi_t *agi; + xfs_agnumber_t agno; + xfs_extlen_t agsize; + xfs_extlen_t tmpsize; + xfs_alloc_rec_t *arec; + xfs_btree_sblock_t *block; + xfs_buf_t *bp; + int bucket; + int dpct; + int error; + xfs_agnumber_t nagcount; + xfs_agnumber_t nagimax = 0; + xfs_rfsblock_t nb, nb_mod; + xfs_rfsblock_t new; + xfs_rfsblock_t nfree; + xfs_agnumber_t oagcount; + int pct; + xfs_sb_t *sbp; + xfs_trans_t *tp; + + nb = in->newblocks; + pct = in->imaxpct; + if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + return XFS_ERROR(EINVAL); + dpct = pct - mp->m_sb.sb_imax_pct; + error = xfs_read_buf(mp, mp->m_ddev_targp, + XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; + ASSERT(bp); + xfs_buf_relse(bp); + + new = nb; /* use new as a temporary here */ + nb_mod = do_div(new, mp->m_sb.sb_agblocks); + nagcount = new + (nb_mod != 0); + if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { + nagcount--; + nb = nagcount * mp->m_sb.sb_agblocks; + if (nb < mp->m_sb.sb_dblocks) + return XFS_ERROR(EINVAL); + } + new = nb - mp->m_sb.sb_dblocks; + oagcount = mp->m_sb.sb_agcount; + if (nagcount > oagcount) { + down_write(&mp->m_peraglock); + mp->m_perag = kmem_realloc(mp->m_perag, + sizeof(xfs_perag_t) * nagcount, + sizeof(xfs_perag_t) * oagcount, + KM_SLEEP); + memset(&mp->m_perag[oagcount], 0, + (nagcount - oagcount) * sizeof(xfs_perag_t)); + mp->m_flags |= XFS_MOUNT_32BITINODES; + nagimax = xfs_initialize_perag(mp, nagcount); + up_write(&mp->m_peraglock); + } + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); + if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), + XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { + xfs_trans_cancel(tp, 0); + return error; + } + + nfree = 0; + for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + /* + * AG freelist header block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + agf = XFS_BUF_TO_AGF(bp); + memset(agf, 0, mp->m_sb.sb_sectsize); + INT_SET(agf->agf_magicnum, ARCH_CONVERT, XFS_AGF_MAGIC); + INT_SET(agf->agf_versionnum, ARCH_CONVERT, XFS_AGF_VERSION); + INT_SET(agf->agf_seqno, ARCH_CONVERT, agno); + if (agno == nagcount - 1) + agsize = + nb - + (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + else + agsize = mp->m_sb.sb_agblocks; + INT_SET(agf->agf_length, ARCH_CONVERT, agsize); + INT_SET(agf->agf_roots[XFS_BTNUM_BNOi], ARCH_CONVERT, + XFS_BNO_BLOCK(mp)); + INT_SET(agf->agf_roots[XFS_BTNUM_CNTi], ARCH_CONVERT, + XFS_CNT_BLOCK(mp)); + INT_SET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT, 1); + INT_SET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT, 1); + agf->agf_flfirst = 0; + INT_SET(agf->agf_fllast, ARCH_CONVERT, XFS_AGFL_SIZE(mp) - 1); + agf->agf_flcount = 0; + tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); + INT_SET(agf->agf_freeblks, ARCH_CONVERT, tmpsize); + INT_SET(agf->agf_longest, ARCH_CONVERT, tmpsize); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * AG inode header block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + agi = XFS_BUF_TO_AGI(bp); + memset(agi, 0, mp->m_sb.sb_sectsize); + INT_SET(agi->agi_magicnum, ARCH_CONVERT, XFS_AGI_MAGIC); + INT_SET(agi->agi_versionnum, ARCH_CONVERT, XFS_AGI_VERSION); + INT_SET(agi->agi_seqno, ARCH_CONVERT, agno); + INT_SET(agi->agi_length, ARCH_CONVERT, agsize); + agi->agi_count = 0; + INT_SET(agi->agi_root, ARCH_CONVERT, XFS_IBT_BLOCK(mp)); + INT_SET(agi->agi_level, ARCH_CONVERT, 1); + agi->agi_freecount = 0; + INT_SET(agi->agi_newino, ARCH_CONVERT, NULLAGINO); + INT_SET(agi->agi_dirino, ARCH_CONVERT, NULLAGINO); + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, + NULLAGINO); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * BNO btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); + block = XFS_BUF_TO_SBLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTB_MAGIC); + block->bb_level = 0; + INT_SET(block->bb_numrecs, ARCH_CONVERT, 1); + INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK); + INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK); + arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, + block, 1, mp->m_alloc_mxr[0]); + INT_SET(arec->ar_startblock, ARCH_CONVERT, + XFS_PREALLOC_BLOCKS(mp)); + INT_SET(arec->ar_blockcount, ARCH_CONVERT, + agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT)); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * CNT btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); + block = XFS_BUF_TO_SBLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTC_MAGIC); + block->bb_level = 0; + INT_SET(block->bb_numrecs, ARCH_CONVERT, 1); + INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK); + INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK); + arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, + block, 1, mp->m_alloc_mxr[0]); + INT_SET(arec->ar_startblock, ARCH_CONVERT, + XFS_PREALLOC_BLOCKS(mp)); + INT_SET(arec->ar_blockcount, ARCH_CONVERT, + agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT)); + nfree += INT_GET(arec->ar_blockcount, ARCH_CONVERT); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + /* + * INO btree root block + */ + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); + block = XFS_BUF_TO_SBLOCK(bp); + memset(block, 0, mp->m_sb.sb_blocksize); + INT_SET(block->bb_magic, ARCH_CONVERT, XFS_IBT_MAGIC); + block->bb_level = 0; + block->bb_numrecs = 0; + INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK); + INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK); + error = xfs_bwrite(mp, bp); + if (error) { + goto error0; + } + } + xfs_trans_agblocks_delta(tp, nfree); + /* + * There are new blocks in the old last a.g. + */ + if (new) { + /* + * Change the agi length. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agi = XFS_BUF_TO_AGI(bp); + INT_MOD(agi->agi_length, ARCH_CONVERT, new); + ASSERT(nagcount == oagcount || + INT_GET(agi->agi_length, ARCH_CONVERT) == + mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + if (error) { + goto error0; + } + ASSERT(bp); + agf = XFS_BUF_TO_AGF(bp); + INT_MOD(agf->agf_length, ARCH_CONVERT, new); + ASSERT(INT_GET(agf->agf_length, ARCH_CONVERT) == + INT_GET(agi->agi_length, ARCH_CONVERT)); + /* + * Free the new space. + */ + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, + INT_GET(agf->agf_length, ARCH_CONVERT) - new), new); + if (error) { + goto error0; + } + } + if (nagcount > oagcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); + if (nb > mp->m_sb.sb_dblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, + nb - mp->m_sb.sb_dblocks); + if (nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); + if (dpct) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + error = xfs_trans_commit(tp, 0, NULL); + if (error) { + return error; + } + /* New allocation groups fully initialized, so update mount struct */ + if (nagimax) + mp->m_maxagi = nagimax; + if (mp->m_sb.sb_imax_pct) { + __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + for (agno = 1; agno < nagcount; agno++) { + error = xfs_read_buf(mp, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) { + xfs_fs_cmn_err(CE_WARN, mp, + "error %d reading secondary superblock for ag %d", + error, agno); + break; + } + sbp = XFS_BUF_TO_SBP(bp); + xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS); + /* + * If we get an error writing out the alternate superblocks, + * just issue a warning and continue. The real work is + * already done and committed. + */ + if (!(error = xfs_bwrite(mp, bp))) { + continue; + } else { + xfs_fs_cmn_err(CE_WARN, mp, + "write error %d updating secondary superblock for ag %d", + error, agno); + break; /* no point in continuing */ + } + } + return 0; + + error0: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +static int +xfs_growfs_log_private( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_log_t *in) /* growfs log input struct */ +{ + xfs_extlen_t nb; + + nb = in->newblocks; + if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) + return XFS_ERROR(EINVAL); + if (nb == mp->m_sb.sb_logblocks && + in->isint == (mp->m_sb.sb_logstart != 0)) + return XFS_ERROR(EINVAL); + /* + * Moving the log is hard, need new interfaces to sync + * the log first, hold off all activity while moving it. + * Can have shorter or longer log in the same space, + * or transform internal to external log or vice versa. + */ + return XFS_ERROR(ENOSYS); +} + +/* + * protected versions of growfs function acquire and release locks on the mount + * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, + * XFS_IOC_FSGROWFSRT + */ + + +int +xfs_growfs_data( + xfs_mount_t *mp, + xfs_growfs_data_t *in) +{ + int error; + if (!cpsema(&mp->m_growlock)) + return XFS_ERROR(EWOULDBLOCK); + error = xfs_growfs_data_private(mp, in); + vsema(&mp->m_growlock); + return error; +} + +int +xfs_growfs_log( + xfs_mount_t *mp, + xfs_growfs_log_t *in) +{ + int error; + if (!cpsema(&mp->m_growlock)) + return XFS_ERROR(EWOULDBLOCK); + error = xfs_growfs_log_private(mp, in); + vsema(&mp->m_growlock); + return error; +} + +/* + * exported through ioctl XFS_IOC_FSCOUNTS + */ + +int +xfs_fs_counts( + xfs_mount_t *mp, + xfs_fsop_counts_t *cnt) +{ + unsigned long s; + + s = XFS_SB_LOCK(mp); + cnt->freedata = mp->m_sb.sb_fdblocks; + cnt->freertx = mp->m_sb.sb_frextents; + cnt->freeino = mp->m_sb.sb_ifree; + cnt->allocino = mp->m_sb.sb_icount; + XFS_SB_UNLOCK(mp, s); + return 0; +} + +/* + * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS + * + * xfs_reserve_blocks is called to set m_resblks + * in the in-core mount table. The number of unused reserved blocks + * is kept in m_resbls_avail. + * + * Reserve the requested number of blocks if available. Otherwise return + * as many as possible to satisfy the request. The actual number + * reserved are returned in outval + * + * A null inval pointer indicates that only the current reserved blocks + * available should be returned no settings are changed. + */ + +int +xfs_reserve_blocks( + xfs_mount_t *mp, + __uint64_t *inval, + xfs_fsop_resblks_t *outval) +{ + __int64_t lcounter, delta; + __uint64_t request; + unsigned long s; + + /* If inval is null, report current values and return */ + + if (inval == (__uint64_t *)NULL) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + return(0); + } + + request = *inval; + s = XFS_SB_LOCK(mp); + + /* + * If our previous reservation was larger than the current value, + * then move any unused blocks back to the free pool. + */ + + if (mp->m_resblks > request) { + lcounter = mp->m_resblks_avail - request; + if (lcounter > 0) { /* release unused blocks */ + mp->m_sb.sb_fdblocks += lcounter; + mp->m_resblks_avail -= lcounter; + } + mp->m_resblks = request; + } else { + delta = request - mp->m_resblks; + lcounter = mp->m_sb.sb_fdblocks - delta; + if (lcounter < 0) { + /* We can't satisfy the request, just get what we can */ + mp->m_resblks += mp->m_sb.sb_fdblocks; + mp->m_resblks_avail += mp->m_sb.sb_fdblocks; + mp->m_sb.sb_fdblocks = 0; + } else { + mp->m_sb.sb_fdblocks = lcounter; + mp->m_resblks = request; + mp->m_resblks_avail += delta; + } + } + + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + XFS_SB_UNLOCK(mp, s); + return(0); +} + +void +xfs_fs_log_dummy(xfs_mount_t *mp) +{ + xfs_trans_t *tp; + xfs_inode_t *ip; + + + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + atomic_inc(&mp->m_active_trans); + if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + xfs_trans_cancel(tp, 0); + return; + } + + ip = mp->m_rootip; + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + xfs_trans_commit(tp, 0, NULL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + +int +xfs_fs_goingdown( + xfs_mount_t *mp, + __uint32_t inflags) +{ + switch (inflags) { + case XFS_FSOP_GOING_FLAGS_DEFAULT: { + struct vfs *vfsp = XFS_MTOVFS(mp); + struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev); + + if (sb) { + xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); + thaw_bdev(sb->s_bdev, sb); + } + + break; + } + case XFS_FSOP_GOING_FLAGS_LOGFLUSH: + xfs_force_shutdown(mp, XFS_FORCE_UMOUNT); + break; + case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: + xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR); + break; + default: + return XFS_ERROR(EINVAL); + } + + return 0; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h new file mode 100644 index 00000000000..b61486173a6 --- /dev/null +++ b/fs/xfs/xfs_fsops.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_FSOPS_H__ +#define __XFS_FSOPS_H__ + +int +xfs_fs_geometry( + xfs_mount_t *mp, + xfs_fsop_geom_t *geo, + int new_version); + +int +xfs_growfs_data( + xfs_mount_t *mp, + xfs_growfs_data_t *in); + +int +xfs_growfs_log( + xfs_mount_t *mp, + xfs_growfs_log_t *in); + +int +xfs_fs_counts( + xfs_mount_t *mp, + xfs_fsop_counts_t *cnt); + +int +xfs_reserve_blocks( + xfs_mount_t *mp, + __uint64_t *inval, + xfs_fsop_resblks_t *outval); + +int +xfs_fs_goingdown( + xfs_mount_t *mp, + __uint32_t inflags); + +#endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c new file mode 100644 index 00000000000..ce5fee9eaec --- /dev/null +++ b/fs/xfs/xfs_ialloc.c @@ -0,0 +1,1401 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" + +/* + * Log specified fields for the inode given by bp and off. + */ +STATIC void +xfs_ialloc_log_di( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* inode buffer */ + int off, /* index of inode in buffer */ + int fields) /* bitmask of fields to log */ +{ + int first; /* first byte number */ + int ioffset; /* off in bytes */ + int last; /* last byte number */ + xfs_mount_t *mp; /* mount point structure */ + static const short offsets[] = { /* field offsets */ + /* keep in sync with bits */ + offsetof(xfs_dinode_core_t, di_magic), + offsetof(xfs_dinode_core_t, di_mode), + offsetof(xfs_dinode_core_t, di_version), + offsetof(xfs_dinode_core_t, di_format), + offsetof(xfs_dinode_core_t, di_onlink), + offsetof(xfs_dinode_core_t, di_uid), + offsetof(xfs_dinode_core_t, di_gid), + offsetof(xfs_dinode_core_t, di_nlink), + offsetof(xfs_dinode_core_t, di_projid), + offsetof(xfs_dinode_core_t, di_pad), + offsetof(xfs_dinode_core_t, di_atime), + offsetof(xfs_dinode_core_t, di_mtime), + offsetof(xfs_dinode_core_t, di_ctime), + offsetof(xfs_dinode_core_t, di_size), + offsetof(xfs_dinode_core_t, di_nblocks), + offsetof(xfs_dinode_core_t, di_extsize), + offsetof(xfs_dinode_core_t, di_nextents), + offsetof(xfs_dinode_core_t, di_anextents), + offsetof(xfs_dinode_core_t, di_forkoff), + offsetof(xfs_dinode_core_t, di_aformat), + offsetof(xfs_dinode_core_t, di_dmevmask), + offsetof(xfs_dinode_core_t, di_dmstate), + offsetof(xfs_dinode_core_t, di_flags), + offsetof(xfs_dinode_core_t, di_gen), + offsetof(xfs_dinode_t, di_next_unlinked), + offsetof(xfs_dinode_t, di_u), + offsetof(xfs_dinode_t, di_a), + sizeof(xfs_dinode_t) + }; + + + ASSERT(offsetof(xfs_dinode_t, di_core) == 0); + ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0); + mp = tp->t_mountp; + /* + * Get the inode-relative first and last bytes for these fields + */ + xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last); + /* + * Convert to buffer offsets and log it. + */ + ioffset = off << mp->m_sb.sb_inodelog; + first += ioffset; + last += ioffset; + xfs_trans_log_buf(tp, bp, first, last); +} + +/* + * Allocation group level functions. + */ + +/* + * Allocate new inodes in the allocation group specified by agbp. + * Return 0 for success, else error code. + */ +STATIC int /* error code or 0 */ +xfs_ialloc_ag_alloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* alloc group buffer */ + int *alloc) +{ + xfs_agi_t *agi; /* allocation group header */ + xfs_alloc_arg_t args; /* allocation argument structure */ + int blks_per_cluster; /* fs blocks per inode cluster */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + xfs_daddr_t d; /* disk addr of buffer */ + int error; + xfs_buf_t *fbuf; /* new free inodes' buffer */ + xfs_dinode_t *free; /* new free inode structure */ + int i; /* inode counter */ + int j; /* block counter */ + int nbufs; /* num bufs of new inodes */ + xfs_agino_t newino; /* new first inode's number */ + xfs_agino_t newlen; /* new number of inodes */ + int ninodes; /* num inodes per buf */ + xfs_agino_t thisino; /* current inode number, for loop */ + int version; /* inode version number to use */ + int isaligned; /* inode allocation at stripe unit */ + /* boundary */ + xfs_dinode_core_t dic; /* a dinode_core to copy to new */ + /* inodes */ + + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * Locking will ensure that we don't have two callers in here + * at one time. + */ + newlen = XFS_IALLOC_INODES(args.mp); + if (args.mp->m_maxicount && + args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + return XFS_ERROR(ENOSPC); + args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + /* + * Set the alignment for the allocation. + * If stripe alignment is turned on then align at stripe unit + * boundary. + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size pieces, + * so don't need alignment anyway. + */ + isaligned = 0; + if (args.mp->m_sinoalign) { + ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + args.alignment = args.mp->m_dalign; + isaligned = 1; + } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && + args.mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) + args.alignment = args.mp->m_sb.sb_inoalignmt; + else + args.alignment = 1; + agi = XFS_BUF_TO_AGI(agbp); + /* + * Need to figure out where to allocate the inode blocks. + * Ideally they should be spaced out through the a.g. + * For now, just allocate blocks up front. + */ + args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT); + args.fsbno = XFS_AGB_TO_FSB(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT), + args.agbno); + /* + * Allocate a fixed-size extent of inodes. + */ + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.mod = args.total = args.wasdel = args.isfl = args.userdata = + args.minalignslop = 0; + args.prod = 1; + /* + * Allow space for the inode btree to split. + */ + args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + + /* + * If stripe alignment is turned on, then try again with cluster + * alignment. + */ + if (isaligned && args.fsbno == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT); + args.fsbno = XFS_AGB_TO_FSB(args.mp, + INT_GET(agi->agi_seqno, ARCH_CONVERT), args.agbno); + if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && + args.mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) + args.alignment = args.mp->m_sb.sb_inoalignmt; + else + args.alignment = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + if (args.fsbno == NULLFSBLOCK) { + *alloc = 0; + return 0; + } + ASSERT(args.len == args.minlen); + /* + * Convert the results. + */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + /* + * Loop over the new block(s), filling in the inodes. + * For small block sizes, manipulate the inodes in buffers + * which are multiples of the blocks size. + */ + if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) { + blks_per_cluster = 1; + nbufs = (int)args.len; + ninodes = args.mp->m_sb.sb_inopblock; + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) / + args.mp->m_sb.sb_blocksize; + nbufs = (int)args.len / blks_per_cluster; + ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock; + } + /* + * Figure out what version number to use in the inodes we create. + * If the superblock version has caught up to the one that supports + * the new inode format, then use the new inode version. Otherwise + * use the old version so that old kernels will continue to be + * able to use the file system. + */ + if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb)) + version = XFS_DINODE_VERSION_2; + else + version = XFS_DINODE_VERSION_1; + + memset(&dic, 0, sizeof(xfs_dinode_core_t)); + INT_SET(dic.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC); + INT_SET(dic.di_version, ARCH_CONVERT, version); + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT), + args.agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d, + args.mp->m_bsize * blks_per_cluster, + XFS_BUF_LOCK); + ASSERT(fbuf); + ASSERT(!XFS_BUF_GETERROR(fbuf)); + /* + * Loop over the inodes in this buffer. + */ + + for (i = 0; i < ninodes; i++) { + free = XFS_MAKE_IPTR(args.mp, fbuf, i); + memcpy(&(free->di_core), &dic, sizeof(xfs_dinode_core_t)); + INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO); + xfs_ialloc_log_di(tp, fbuf, i, + XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED); + } + xfs_trans_inode_alloc_buf(tp, fbuf); + } + INT_MOD(agi->agi_count, ARCH_CONVERT, newlen); + INT_MOD(agi->agi_freecount, ARCH_CONVERT, newlen); + down_read(&args.mp->m_peraglock); + args.mp->m_perag[INT_GET(agi->agi_seqno, ARCH_CONVERT)].pagi_freecount += newlen; + up_read(&args.mp->m_peraglock); + INT_SET(agi->agi_newino, ARCH_CONVERT, newino); + /* + * Insert records describing the new inode chunk into the btree. + */ + cur = xfs_btree_init_cursor(args.mp, tp, agbp, + INT_GET(agi->agi_seqno, ARCH_CONVERT), + XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + if ((error = xfs_inobt_lookup_eq(cur, thisino, + XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + if ((error = xfs_inobt_insert(cur, &i))) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + /* + * Log allocation group header fields + */ + xfs_ialloc_log_agi(tp, agbp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); + /* + * Modify/log superblock values for inode count and inode free count. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); + *alloc = 1; + return 0; +} + +STATIC __inline xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor == mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +/* + * Select an allocation group to look for a free inode in, based on the parent + * inode and then mode. Return the allocation group buffer. + */ +STATIC xfs_buf_t * /* allocation group buffer */ +xfs_ialloc_ag_select( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent directory inode number */ + mode_t mode, /* bits set to indicate file type */ + int okalloc) /* ok to allocate more space */ +{ + xfs_buf_t *agbp; /* allocation group header buffer */ + xfs_agnumber_t agcount; /* number of ag's in the filesystem */ + xfs_agnumber_t agno; /* current ag number */ + int flags; /* alloc buffer locking flags */ + xfs_extlen_t ineed; /* blocks needed for inode allocation */ + xfs_extlen_t longest = 0; /* longest extent available */ + xfs_mount_t *mp; /* mount point structure */ + int needspace; /* file mode implies space allocated */ + xfs_perag_t *pag; /* per allocation group data */ + xfs_agnumber_t pagno; /* parent (starting) ag number */ + + /* + * Files of these types need at least one block if length > 0 + * (and they won't fit in the inode, but that's hard to figure out). + */ + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + mp = tp->t_mountp; + agcount = mp->m_maxagi; + if (S_ISDIR(mode)) + pagno = xfs_ialloc_next_ag(mp); + else { + pagno = XFS_INO_TO_AGNO(mp, parent); + if (pagno >= agcount) + pagno = 0; + } + ASSERT(pagno < agcount); + /* + * Loop through allocation groups, looking for one with a little + * free space in it. Note we don't look for free inodes, exactly. + * Instead, we include whether there is a need to allocate inodes + * to mean that blocks must be allocated for them, + * if none are currently free. + */ + agno = pagno; + flags = XFS_ALLOC_FLAG_TRYLOCK; + down_read(&mp->m_peraglock); + for (;;) { + pag = &mp->m_perag[agno]; + if (!pag->pagi_init) { + if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + } else + agbp = NULL; + + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto unlock_nextag; + } + + /* + * Is there enough free space for the file plus a block + * of inodes (if we need to allocate some)? + */ + ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); + if (ineed && !pag->pagf_init) { + if (agbp == NULL && + xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + (void)xfs_alloc_pagf_init(mp, tp, agno, flags); + } + if (!ineed || pag->pagf_init) { + if (ineed && !(longest = pag->pagf_longest)) + longest = pag->pagf_flcount > 0; + if (!ineed || + (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed && + okalloc)) { + if (agbp == NULL && + xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { + agbp = NULL; + goto nextag; + } + up_read(&mp->m_peraglock); + return agbp; + } + } +unlock_nextag: + if (agbp) + xfs_trans_brelse(tp, agbp); +nextag: + /* + * No point in iterating over the rest, if we're shutting + * down. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + up_read(&mp->m_peraglock); + return (xfs_buf_t *)0; + } + agno++; + if (agno >= agcount) + agno = 0; + if (agno == pagno) { + if (flags == 0) { + up_read(&mp->m_peraglock); + return (xfs_buf_t *)0; + } + flags = 0; + } + } +} + +/* + * Visible inode allocation functions. + */ + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * The arguments IO_agbp and alloc_done are defined to work within + * the constraint of one allocation per transaction. + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. On the first call, + * IO_agbp should be set to NULL. If an inode is available, + * i.e., xfs_dialloc() did not need to do an allocation, an inode + * number is returned. In this case, IO_agbp would be set to the + * current ag_buf and alloc_done set to false. + * If an allocation needed to be done, xfs_dialloc would return + * the current ag_buf in IO_agbp and set alloc_done to true. + * The caller should then commit the current transaction, allocate a new + * transaction, and call xfs_dialloc() again, passing in the previous + * value of IO_agbp. IO_agbp should be held across the transactions. + * Since the agbp is locked across the two calls, the second call is + * guaranteed to have a free inode available. + * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + mode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ + boolean_t *alloc_done, /* true if we needed to replenish + inode freelist */ + xfs_ino_t *inop) /* inode number allocated */ +{ + xfs_agnumber_t agcount; /* number of allocation groups */ + xfs_buf_t *agbp; /* allocation group header's buffer */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agi_t *agi; /* allocation group header structure */ + xfs_btree_cur_t *cur; /* inode allocation btree cursor */ + int error; /* error return value */ + int i; /* result code */ + int ialloced; /* inode allocation status */ + int noroom = 0; /* no space for inode blk allocation */ + xfs_ino_t ino; /* fs-relative inode to be returned */ + /* REFERENCED */ + int j; /* result code */ + xfs_mount_t *mp; /* file system mount structure */ + int offset; /* index of inode in chunk */ + xfs_agino_t pagino; /* parent's a.g. relative inode # */ + xfs_agnumber_t pagno; /* parent's allocation group number */ + xfs_inobt_rec_t rec; /* inode allocation record */ + xfs_agnumber_t tagno; /* testing allocation group number */ + xfs_btree_cur_t *tcur; /* temp cursor */ + xfs_inobt_rec_t trec; /* temp inode allocation record */ + + + if (*IO_agbp == NULL) { + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + /* + * Couldn't find an allocation group satisfying the + * criteria, give up. + */ + if (!agbp) { + *inop = NULLFSINO; + return 0; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC); + } else { + /* + * Continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC); + ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0); + } + mp = tp->t_mountp; + agcount = mp->m_sb.sb_agcount; + agno = INT_GET(agi->agi_seqno, ARCH_CONVERT); + tagno = agno; + pagno = XFS_INO_TO_AGNO(mp, parent); + pagino = XFS_INO_TO_AGINO(mp, parent); + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + + if (mp->m_maxicount && + mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + noroom = 1; + okalloc = 0; + } + + /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. + */ + *alloc_done = B_FALSE; + while (!agi->agi_freecount) { + /* + * Don't do anything if we're not supposed to allocate + * any blocks, just go on to the next ag. + */ + if (okalloc) { + /* + * Try to allocate some new inodes in the allocation + * group. + */ + if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { + xfs_trans_brelse(tp, agbp); + if (error == ENOSPC) { + *inop = NULLFSINO; + return 0; + } else + return error; + } + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0); + *alloc_done = B_TRUE; + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + } + /* + * If it failed, give up on this ag. + */ + xfs_trans_brelse(tp, agbp); + /* + * Go on to the next ag: get its ag header. + */ +nextag: + if (++tagno == agcount) + tagno = 0; + if (tagno == agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } + down_read(&mp->m_peraglock); + if (mp->m_perag[tagno].pagi_inodeok == 0) { + up_read(&mp->m_peraglock); + goto nextag; + } + error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); + up_read(&mp->m_peraglock); + if (error) + goto nextag; + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC); + } + /* + * Here with an allocation group that has a free inode. + * Reset agno since we may have chosen a new ag in the + * loop above. + */ + agno = tagno; + *IO_agbp = NULL; + cur = xfs_btree_init_cursor(mp, tp, agbp, INT_GET(agi->agi_seqno, ARCH_CONVERT), + XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = INT_GET(agi->agi_newino, ARCH_CONVERT); +#ifdef DEBUG + if (cur->bc_nlevels == 1) { + int freecount = 0; + + if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + do { + if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, + &rec.ir_freecount, &rec.ir_free, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + freecount += rec.ir_freecount; + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error0; + } while (i == 1); + + ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) || + XFS_FORCED_SHUTDOWN(mp)); + } +#endif + /* + * If in the same a.g. as the parent, try to get near the parent. + */ + if (pagno == agno) { + if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) + goto error0; + if (i != 0 && + (error = xfs_inobt_get_rec(cur, &rec.ir_startino, + &rec.ir_freecount, &rec.ir_free, &j)) == 0 && + j == 1 && + rec.ir_freecount > 0) { + /* + * Found a free inode in the same chunk + * as parent, done. + */ + } + /* + * In the same a.g. as parent, but parent's chunk is full. + */ + else { + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + + if (error) + goto error0; + ASSERT(i == 1); + ASSERT(j == 1); + /* + * Duplicate the cursor, search left & right + * simultaneously. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + goto error0; + /* + * Search left with tcur, back up 1 record. + */ + if ((error = xfs_inobt_decrement(tcur, 0, &i))) + goto error1; + doneleft = !i; + if (!doneleft) { + if ((error = xfs_inobt_get_rec(tcur, + &trec.ir_startino, + &trec.ir_freecount, + &trec.ir_free, &i))) + goto error1; + XFS_WANT_CORRUPTED_GOTO(i == 1, error1); + } + /* + * Search right with cur, go forward 1 record. + */ + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error1; + doneright = !i; + if (!doneright) { + if ((error = xfs_inobt_get_rec(cur, + &rec.ir_startino, + &rec.ir_freecount, + &rec.ir_free, &i))) + goto error1; + XFS_WANT_CORRUPTED_GOTO(i == 1, error1); + } + /* + * Loop until we find the closest inode chunk + * with a free one. + */ + while (!doneleft || !doneright) { + int useleft; /* using left inode + chunk this time */ + + /* + * Figure out which block is closer, + * if both are valid. + */ + if (!doneleft && !doneright) + useleft = + pagino - + (trec.ir_startino + + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + else + useleft = !doneleft; + /* + * If checking the left, does it have + * free inodes? + */ + if (useleft && trec.ir_freecount) { + /* + * Yes, set it up as the chunk to use. + */ + rec = trec; + xfs_btree_del_cursor(cur, + XFS_BTREE_NOERROR); + cur = tcur; + break; + } + /* + * If checking the right, does it have + * free inodes? + */ + if (!useleft && rec.ir_freecount) { + /* + * Yes, it's already set up. + */ + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + break; + } + /* + * If used the left, get another one + * further left. + */ + if (useleft) { + if ((error = xfs_inobt_decrement(tcur, 0, + &i))) + goto error1; + doneleft = !i; + if (!doneleft) { + if ((error = xfs_inobt_get_rec( + tcur, + &trec.ir_startino, + &trec.ir_freecount, + &trec.ir_free, &i))) + goto error1; + XFS_WANT_CORRUPTED_GOTO(i == 1, + error1); + } + } + /* + * If used the right, get another one + * further right. + */ + else { + if ((error = xfs_inobt_increment(cur, 0, + &i))) + goto error1; + doneright = !i; + if (!doneright) { + if ((error = xfs_inobt_get_rec( + cur, + &rec.ir_startino, + &rec.ir_freecount, + &rec.ir_free, &i))) + goto error1; + XFS_WANT_CORRUPTED_GOTO(i == 1, + error1); + } + } + } + ASSERT(!doneleft || !doneright); + } + } + /* + * In a different a.g. from the parent. + * See if the most recently allocated block has any free. + */ + else if (INT_GET(agi->agi_newino, ARCH_CONVERT) != NULLAGINO) { + if ((error = xfs_inobt_lookup_eq(cur, + INT_GET(agi->agi_newino, ARCH_CONVERT), 0, 0, &i))) + goto error0; + if (i == 1 && + (error = xfs_inobt_get_rec(cur, &rec.ir_startino, + &rec.ir_freecount, &rec.ir_free, &j)) == 0 && + j == 1 && + rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group still has + * a free inode. + */ + } + /* + * None left in the last group, search the whole a.g. + */ + else { + if (error) + goto error0; + if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) + goto error0; + ASSERT(i == 1); + for (;;) { + if ((error = xfs_inobt_get_rec(cur, + &rec.ir_startino, + &rec.ir_freecount, &rec.ir_free, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + } + offset = XFS_IALLOC_FIND_FREE(&rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + XFS_INOBT_CLR_FREE(&rec, offset); + rec.ir_freecount--; + if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, + rec.ir_free))) + goto error0; + INT_MOD(agi->agi_freecount, ARCH_CONVERT, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + down_read(&mp->m_peraglock); + mp->m_perag[tagno].pagi_freecount--; + up_read(&mp->m_peraglock); +#ifdef DEBUG + if (cur->bc_nlevels == 1) { + int freecount = 0; + + if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) + goto error0; + do { + if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, + &rec.ir_freecount, &rec.ir_free, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + freecount += rec.ir_freecount; + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error0; + } while (i == 1); + ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) || + XFS_FORCED_SHUTDOWN(mp)); + } +#endif + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + *inop = ino; + return 0; +error1: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + xfs_bmap_free_t *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino) /* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + xfs_buf_t *agbp; /* buffer containing allocation group header */ + xfs_agino_t agino; /* inode number relative to allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agi_t *agi; /* allocation group header */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + int error; /* error return value */ + int i; /* result code */ + int ilen; /* inodes in an inode cluster */ + xfs_mount_t *mp; /* mount structure for filesystem */ + int off; /* offset of inode in inode chunk */ + xfs_inobt_rec_t rec; /* btree record */ + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + cmn_err(CE_WARN, + "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", + agno, mp->m_sb.sb_agcount, mp->m_fsname); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + cmn_err(CE_WARN, + "xfs_difree: inode != XFS_AGINO_TO_INO() (%d != %d) on %s. Returning EINVAL.", + inode, XFS_AGINO_TO_INO(mp, agno, agino), mp->m_fsname); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + cmn_err(CE_WARN, + "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", + agbno, mp->m_sb.sb_agblocks, mp->m_fsname); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. + */ + down_read(&mp->m_peraglock); + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + up_read(&mp->m_peraglock); + if (error) { + cmn_err(CE_WARN, + "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + return error; + } + agi = XFS_BUF_TO_AGI(agbp); + ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC); + ASSERT(agbno < INT_GET(agi->agi_length, ARCH_CONVERT)); + /* + * Initialize the cursor. + */ + cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, + (xfs_inode_t *)0, 0); +#ifdef DEBUG + if (cur->bc_nlevels == 1) { + int freecount = 0; + + if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) + goto error0; + do { + if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, + &rec.ir_freecount, &rec.ir_free, &i))) + goto error0; + if (i) { + freecount += rec.ir_freecount; + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error0; + } + } while (i == 1); + ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) || + XFS_FORCED_SHUTDOWN(mp)); + } +#endif + /* + * Look for the entry describing this inode. + */ + if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { + cmn_err(CE_WARN, + "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", + error, mp->m_fsname); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, + &rec.ir_free, &i))) { + cmn_err(CE_WARN, + "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Get the offset in the inode chunk. + */ + off = agino - rec.ir_startino; + ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); + ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); + /* + * Mark the inode free & increment the count. + */ + XFS_INOBT_SET_FREE(&rec, off); + rec.ir_freecount++; + + /* + * When an inode cluster is free, it becomes elgible for removal + */ + if ((mp->m_flags & XFS_MOUNT_IDELETE) && + (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + + *delete = 1; + *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + + /* + * Remove the inode cluster from the AGI B+Tree, adjust the + * AGI and Superblock inode counts, and mark the disk space + * to be freed when the transaction is committed. + */ + ilen = XFS_IALLOC_INODES(mp); + INT_MOD(agi->agi_count, ARCH_CONVERT, -ilen); + INT_MOD(agi->agi_freecount, ARCH_CONVERT, -(ilen - 1)); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + down_read(&mp->m_peraglock); + mp->m_perag[agno].pagi_freecount -= ilen - 1; + up_read(&mp->m_peraglock); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); + + if ((error = xfs_inobt_delete(cur, &i))) { + cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", + error, mp->m_fsname); + goto error0; + } + + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, + agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), + XFS_IALLOC_BLOCKS(mp), flist, mp); + } else { + *delete = 0; + + if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { + cmn_err(CE_WARN, + "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + goto error0; + } + /* + * Change the inode free counts and log the ag/sb changes. + */ + INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + down_read(&mp->m_peraglock); + mp->m_perag[agno].pagi_freecount++; + up_read(&mp->m_peraglock); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); + } + +#ifdef DEBUG + if (cur->bc_nlevels == 1) { + int freecount = 0; + + if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) + goto error0; + do { + if ((error = xfs_inobt_get_rec(cur, + &rec.ir_startino, + &rec.ir_freecount, + &rec.ir_free, &i))) + goto error0; + if (i) { + freecount += rec.ir_freecount; + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error0; + } + } while (i == 1); + ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) || + XFS_FORCED_SHUTDOWN(mp)); + } +#endif + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Return the location of the inode in bno/off, for mapping it into a buffer. + */ +/*ARGSUSED*/ +int +xfs_dilocate( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + xfs_fsblock_t *bno, /* output: block containing inode */ + int *len, /* output: num blocks in inode cluster */ + int *off, /* output: index in block of inode */ + uint flags) /* flags concerning inode lookup */ +{ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_buf_t *agbp; /* agi buffer */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agnumber_t agno; /* allocation group number */ + int blks_per_cluster; /* num blocks per inode cluster */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agino_t chunk_agino; /* first agino in inode chunk */ + __int32_t chunk_cnt; /* count of free inodes in chunk */ + xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + xfs_btree_cur_t *cur; /* inode btree cursor */ + int error; /* error code */ + int i; /* temp state */ + int offset; /* index of inode in its buffer */ + int offset_agbno; /* blks from chunk start to inode */ + + ASSERT(ino != NULLFSINO); + /* + * Split up the inode number into its parts. + */ + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { +#ifdef DEBUG + if (agno >= mp->m_sb.sb_agcount) { + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_dilocate: agno (%d) >= " + "mp->m_sb.sb_agcount (%d)", + agno, mp->m_sb.sb_agcount); + } + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_dilocate: agbno (0x%llx) >= " + "mp->m_sb.sb_agblocks (0x%lx)", + (unsigned long long) agbno, + (unsigned long) mp->m_sb.sb_agblocks); + } + if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_dilocate: ino (0x%llx) != " + "XFS_AGINO_TO_INO(mp, agno, agino) " + "(0x%llx)", + ino, XFS_AGINO_TO_INO(mp, agno, agino)); + } +#endif /* DEBUG */ + return XFS_ERROR(EINVAL); + } + if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || + !(flags & XFS_IMAP_LOOKUP)) { + offset = XFS_INO_TO_OFFSET(mp, ino); + ASSERT(offset < mp->m_sb.sb_inopblock); + *bno = XFS_AGB_TO_FSB(mp, agno, agbno); + *off = offset; + *len = 1; + return 0; + } + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + if (*bno != NULLFSBLOCK) { + offset = XFS_INO_TO_OFFSET(mp, ino); + ASSERT(offset < mp->m_sb.sb_inopblock); + cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); + *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + + offset; + *len = blks_per_cluster; + return 0; + } + if (mp->m_inoalign_mask) { + offset_agbno = agbno & mp->m_inoalign_mask; + chunk_agbno = agbno - offset_agbno; + } else { + down_read(&mp->m_peraglock); + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + up_read(&mp->m_peraglock); + if (error) { +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + "xfs_ialloc_read_agi() returned " + "error %d, agno %d", + error, agno); +#endif /* DEBUG */ + return error; + } + cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, + (xfs_inode_t *)0, 0); + if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + "xfs_inobt_lookup_le() failed"); +#endif /* DEBUG */ + goto error0; + } + if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, + &chunk_free, &i))) { +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + "xfs_inobt_get_rec() failed"); +#endif /* DEBUG */ + goto error0; + } + if (i == 0) { +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + "xfs_inobt_get_rec() failed"); +#endif /* DEBUG */ + error = XFS_ERROR(EINVAL); + } + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); + offset_agbno = agbno - chunk_agbno; + } + ASSERT(agbno >= chunk_agbno); + cluster_agbno = chunk_agbno + + ((offset_agbno / blks_per_cluster) * blks_per_cluster); + offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + + XFS_INO_TO_OFFSET(mp, ino); + *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); + *off = offset; + *len = blks_per_cluster; + return 0; +error0: + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Compute and fill in value of m_in_maxlevels. + */ +void +xfs_ialloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> + XFS_INODES_PER_CHUNK_LOG; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_in_maxlevels = level; +} + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* allocation group header buffer */ + int fields) /* bitmask of fields to log */ +{ + int first; /* first byte number */ + int last; /* last byte number */ + static const short offsets[] = { /* field starting offsets */ + /* keep in sync with bit definitions */ + offsetof(xfs_agi_t, agi_magicnum), + offsetof(xfs_agi_t, agi_versionnum), + offsetof(xfs_agi_t, agi_seqno), + offsetof(xfs_agi_t, agi_length), + offsetof(xfs_agi_t, agi_count), + offsetof(xfs_agi_t, agi_root), + offsetof(xfs_agi_t, agi_level), + offsetof(xfs_agi_t, agi_freecount), + offsetof(xfs_agi_t, agi_newino), + offsetof(xfs_agi_t, agi_dirino), + offsetof(xfs_agi_t, agi_unlinked), + sizeof(xfs_agi_t) + }; +#ifdef DEBUG + xfs_agi_t *agi; /* allocation group header */ + + agi = XFS_BUF_TO_AGI(bp); + ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC); +#endif + /* + * Compute byte offsets for the first and last fields. + */ + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + /* + * Log the allocation group inode header buffer. + */ + xfs_trans_log_buf(tp, bp, first, last); +} + +/* + * Read in the allocation group header (inode allocation section) + */ +int +xfs_ialloc_read_agi( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_buf_t **bpp) /* allocation group hdr buf */ +{ + xfs_agi_t *agi; /* allocation group header */ + int agi_ok; /* agi is consistent */ + xfs_buf_t *bp; /* allocation group hdr buf */ + xfs_perag_t *pag; /* per allocation group data */ + int error; + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (error) + return error; + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + + /* + * Validate the magic number of the agi block. + */ + agi = XFS_BUF_TO_AGI(bp); + agi_ok = + INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION( + INT_GET(agi->agi_versionnum, ARCH_CONVERT)); + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, + mp, agi); + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + pag = &mp->m_perag[agno]; + if (!pag->pagi_init) { + pag->pagi_freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT); + pag->pagi_init = 1; + } else { + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == + INT_GET(agi->agi_freecount, ARCH_CONVERT) + || XFS_FORCED_SHUTDOWN(mp)); + } + +#ifdef DEBUG + { + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); + } +#endif + + XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF); + *bpp = bp; + return 0; +} diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h new file mode 100644 index 00000000000..db6d0015cec --- /dev/null +++ b/fs/xfs/xfs_ialloc.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_IALLOC_H__ +#define __XFS_IALLOC_H__ + +struct xfs_buf; +struct xfs_dinode; +struct xfs_mount; +struct xfs_trans; + +/* + * Allocation parameters for inode allocation. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_INODES) +int xfs_ialloc_inodes(struct xfs_mount *mp); +#define XFS_IALLOC_INODES(mp) xfs_ialloc_inodes(mp) +#else +#define XFS_IALLOC_INODES(mp) ((mp)->m_ialloc_inos) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_BLOCKS) +xfs_extlen_t xfs_ialloc_blocks(struct xfs_mount *mp); +#define XFS_IALLOC_BLOCKS(mp) xfs_ialloc_blocks(mp) +#else +#define XFS_IALLOC_BLOCKS(mp) ((mp)->m_ialloc_blks) +#endif + +/* + * For small block file systems, move inodes in clusters of this size. + * When we don't have a lot of memory, however, we go a bit smaller + * to reduce the number of AGI and ialloc btree blocks we need to keep + * around for xfs_dilocate(). We choose which one to use in + * xfs_mount_int(). + */ +#define XFS_INODE_BIG_CLUSTER_SIZE 8192 +#define XFS_INODE_SMALL_CLUSTER_SIZE 4096 +#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* + * Make an inode pointer out of the buffer/offset. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MAKE_IPTR) +struct xfs_dinode *xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o); +#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o) +#else +#define XFS_MAKE_IPTR(mp,b,o) \ + ((xfs_dinode_t *)(xfs_buf_offset(b, (o) << (mp)->m_sb.sb_inodelog))) +#endif + +/* + * Find a free (set) bit in the inode bitmask. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_FIND_FREE) +int xfs_ialloc_find_free(xfs_inofree_t *fp); +#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp) +#else +#define XFS_IALLOC_FIND_FREE(fp) xfs_lowbit64(*(fp)) +#endif + + +#ifdef __KERNEL__ + +/* + * Prototypes for visible xfs_ialloc.c routines. + */ + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * To work within the constraint of one allocation per transaction, + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. If an inode is + * available without an allocation, agbp would be set to the current + * agbp and alloc_done set to false. + * If an allocation needed to be done, agbp would be set to the + * inode header of the allocation group and alloc_done set to true. + * The caller should then commit the current transaction and allocate a new + * transaction. xfs_dialloc() should then be called again with + * the agbp value returned from the previous call. + * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. + * + * *agbp should be set to NULL on the first call, *alloc_done set to FALSE. + */ +int /* error */ +xfs_dialloc( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + mode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + struct xfs_buf **agbp, /* buf for a.g. inode header */ + boolean_t *alloc_done, /* an allocation was done to replenish + the free inodes */ + xfs_ino_t *inop); /* inode number allocated */ + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int /* error */ +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino); /* first inode in deleted cluster */ + +/* + * Return the location of the inode in bno/len/off, + * for mapping it into a buffer. + */ +int +xfs_dilocate( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + xfs_fsblock_t *bno, /* output: block containing inode */ + int *len, /* output: num blocks in cluster*/ + int *off, /* output: index in block of inode */ + uint flags); /* flags for inode btree lookup */ + +/* + * Compute and fill in value of m_in_maxlevels. + */ +void +xfs_ialloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* allocation group header buffer */ + int fields); /* bitmask of fields to log */ + +/* + * Read in the allocation group header (inode allocation section) + */ +int /* error */ +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp); /* allocation group hdr buf */ + +#endif /* __KERNEL__ */ + +#endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c new file mode 100644 index 00000000000..2d4daecec99 --- /dev/null +++ b/fs/xfs/xfs_ialloc_btree.c @@ -0,0 +1,2094 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" + +/* + * Inode allocation management for XFS. + */ + +/* + * Prototypes for internal functions. + */ + +STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int); +STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); +STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *); +STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *); +STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *, + xfs_inobt_key_t *, xfs_btree_cur_t **, int *); +STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int); + +/* + * Internal functions. + */ + +/* + * Single level of the xfs_inobt_delete record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_inobt_delrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + xfs_buf_t *agbp; /* buffer for a.g. inode header */ + xfs_mount_t *mp; /* mount structure */ + xfs_agi_t *agi; /* allocation group inode header */ + xfs_inobt_block_t *block; /* btree block record/key lives in */ + xfs_agblock_t bno; /* btree block number */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop index */ + xfs_inobt_key_t key; /* kp points here if block is level 0 */ + xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */ + xfs_agblock_t lbno; /* left block's block number */ + xfs_buf_t *lbp; /* left block's buffer pointer */ + xfs_inobt_block_t *left; /* left btree block */ + xfs_inobt_key_t *lkp; /* left block key pointer */ + xfs_inobt_ptr_t *lpp; /* left block address pointer */ + int lrecs = 0; /* number of records in left block */ + xfs_inobt_rec_t *lrp; /* left block record pointer */ + xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */ + int ptr; /* index in btree block for this rec */ + xfs_agblock_t rbno; /* right block's block number */ + xfs_buf_t *rbp; /* right block's buffer pointer */ + xfs_inobt_block_t *right; /* right btree block */ + xfs_inobt_key_t *rkp; /* right block key pointer */ + xfs_inobt_rec_t *rp; /* pointer to btree records */ + xfs_inobt_ptr_t *rpp; /* right block address pointer */ + int rrecs = 0; /* number of records in right block */ + int numrecs; + xfs_inobt_rec_t *rrp; /* right block record pointer */ + xfs_btree_cur_t *tcur; /* temporary btree cursor */ + + mp = cur->bc_mp; + + /* + * Get the index of the entry being deleted, check for nothing there. + */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + *stat = 0; + return 0; + } + + /* + * Get the buffer & block containing the record or key/ptr. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + /* + * Fail if we're off the end of the block. + */ + + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + if (ptr > numrecs) { + *stat = 0; + return 0; + } + /* + * It's a nonleaf. Excise the key and ptr being deleted, by + * sliding the entries past them down one. + * Log the changed areas of the block. + */ + if (level > 0) { + kp = XFS_INOBT_KEY_ADDR(block, 1, cur); + pp = XFS_INOBT_PTR_ADDR(block, 1, cur); +#ifdef DEBUG + for (i = ptr; i < numrecs; i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) + return error; + } +#endif + if (ptr < numrecs) { + memmove(&kp[ptr - 1], &kp[ptr], + (numrecs - ptr) * sizeof(*kp)); + memmove(&pp[ptr - 1], &pp[ptr], + (numrecs - ptr) * sizeof(*kp)); + xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1); + xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1); + } + } + /* + * It's a leaf. Excise the record being deleted, by sliding the + * entries past it down one. Log the changed areas of the block. + */ + else { + rp = XFS_INOBT_REC_ADDR(block, 1, cur); + if (ptr < numrecs) { + memmove(&rp[ptr - 1], &rp[ptr], + (numrecs - ptr) * sizeof(*rp)); + xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1); + } + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + key.ir_startino = rp->ir_startino; + kp = &key; + } + } + /* + * Decrement and log the number of entries in the block. + */ + numrecs--; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); + /* + * Is this the root level? If so, we're almost done. + */ + if (level == cur->bc_nlevels - 1) { + /* + * If this is the root level, + * and there's only one entry left, + * and it's NOT the leaf level, + * then we can get rid of this level. + */ + if (numrecs == 1 && level > 0) { + agbp = cur->bc_private.i.agbp; + agi = XFS_BUF_TO_AGI(agbp); + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + bno = INT_GET(agi->agi_root, ARCH_CONVERT); + agi->agi_root = *pp; + INT_MOD(agi->agi_level, ARCH_CONVERT, -1); + /* + * Free the block. + */ + if ((error = xfs_free_extent(cur->bc_tp, + XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) + return error; + xfs_trans_binval(cur->bc_tp, bp); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_ROOT | XFS_AGI_LEVEL); + /* + * Update the cursor so there's one fewer level. + */ + cur->bc_bufs[level] = NULL; + cur->bc_nlevels--; + } else if (level > 0 && + (error = xfs_inobt_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1))) + return error; + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { + if (level > 0 && + (error = xfs_inobt_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT); + lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT); + bno = NULLAGBLOCK; + ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + return error; + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (rbno != NULLAGBLOCK) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if ((error = xfs_inobt_increment(tcur, level, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + /* + * Grab a pointer to the block. + */ + rbp = tcur->bc_bufs[level]; + right = XFS_BUF_TO_INOBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + goto error0; +#endif + /* + * Grab the current block number, for future use. + */ + bno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_INOBT_BLOCK_MINRECS(level, cur)) { + if ((error = xfs_inobt_lshift(tcur, level, &i))) + goto error0; + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_INOBT_BLOCK_MINRECS(level, cur)); + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + if (level > 0 && + (error = xfs_inobt_decrement(cur, level, + &i))) + return error; + *stat = 1; + return 0; + } + } + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if (lbno != NULLAGBLOCK) { + xfs_btree_firstrec(tcur, level); + if ((error = xfs_inobt_decrement(tcur, level, &i))) + goto error0; + } + } + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (lbno != NULLAGBLOCK) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + xfs_btree_firstrec(tcur, level); + if ((error = xfs_inobt_decrement(tcur, level, &i))) + goto error0; + xfs_btree_firstrec(tcur, level); + /* + * Grab a pointer to the block. + */ + lbp = tcur->bc_bufs[level]; + left = XFS_BUF_TO_INOBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + goto error0; +#endif + /* + * Grab the current block number, for future use. + */ + bno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >= + XFS_INOBT_BLOCK_MINRECS(level, cur)) { + if ((error = xfs_inobt_rshift(tcur, level, &i))) + goto error0; + if (i) { + ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= + XFS_INOBT_BLOCK_MINRECS(level, cur)); + xfs_btree_del_cursor(tcur, + XFS_BTREE_NOERROR); + if (level == 0) + cur->bc_ptrs[0]++; + *stat = 1; + return 0; + } + } + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * Delete the temp cursor, we're done with it. + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + /* + * If here, we need to do a join to keep the tree balanced. + */ + ASSERT(bno != NULLAGBLOCK); + /* + * See if we can join with the left neighbor block. + */ + if (lbno != NULLAGBLOCK && + lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rbno = bno; + right = block; + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + rbp = bp; + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.i.agno, lbno, 0, &lbp, + XFS_INO_BTREE_REF))) + return error; + left = XFS_BUF_TO_INOBT_BLOCK(lbp); + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; + } + /* + * If that won't work, see if we can join with the right neighbor block. + */ + else if (rbno != NULLAGBLOCK && + rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lbno = bno; + left = block; + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + lbp = bp; + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.i.agno, rbno, 0, &rbp, + XFS_INO_BTREE_REF))) + return error; + right = XFS_BUF_TO_INOBT_BLOCK(rbp); + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; + } + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + else { + if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i))) + return error; + *stat = 1; + return 0; + } + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + if (level > 0) { + /* + * It's a non-leaf. Move keys and pointers. + */ + lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur); + lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur); + rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); + rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < rrecs; i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + memcpy(lkp, rkp, rrecs * sizeof(*lkp)); + memcpy(lpp, rpp, rrecs * sizeof(*lpp)); + xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* + * It's a leaf. Move records. + */ + lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur); + rrp = XFS_INOBT_REC_ADDR(right, 1, cur); + memcpy(lrp, rrp, rrecs * sizeof(*lrp)); + xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + xfs_btree_setbuf(cur, level, lbp); + cur->bc_ptrs[level] += lrecs; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if (level + 1 < cur->bc_nlevels && + (error = xfs_alloc_increment(cur, level + 1, &i))) + return error; + /* + * Fix up the number of records in the surviving block. + */ + lrecs += rrecs; + INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs); + /* + * Fix up the right block pointer in the surviving block, and log it. + */ + left->bb_rightsib = right->bb_rightsib; + xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + /* + * If there is a right sibling now, make it point to the + * remaining block. + */ + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_inobt_block_t *rrblock; + xfs_buf_t *rrbp; + + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, + &rrbp, XFS_INO_BTREE_REF))) + return error; + rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) + return error; + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno); + xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); + } + /* + * Free the deleting block. + */ + if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, + cur->bc_private.i.agno, rbno), 1))) + return error; + xfs_trans_binval(cur->bc_tp, rbp); + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + /* + * Return value means the next level up has something to do. + */ + *stat = 2; + return 0; + +error0: + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int /* error */ +xfs_inobt_insrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to insert record at */ + xfs_agblock_t *bnop, /* i/o: block number inserted */ + xfs_inobt_rec_t *recp, /* i/o: record data inserted */ + xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + xfs_inobt_block_t *block; /* btree block record/key lives in */ + xfs_buf_t *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop index */ + xfs_inobt_key_t key; /* key value being inserted */ + xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */ + xfs_agblock_t nbno; /* block number of allocated block */ + xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ + xfs_inobt_key_t nkey; /* new key value, from split */ + xfs_inobt_rec_t nrec; /* new record value, for caller */ + int numrecs; + int optr; /* old ptr value */ + xfs_inobt_ptr_t *pp; /* pointer to btree addresses */ + int ptr; /* index in btree block for this rec */ + xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */ + + /* + * If we made it to the root level, allocate a new root block + * and we're done. + */ + if (level >= cur->bc_nlevels) { + error = xfs_inobt_newroot(cur, &i); + *bnop = NULLAGBLOCK; + *stat = i; + return error; + } + /* + * Make a key out of the record data to be inserted, and save it. + */ + key.ir_startino = recp->ir_startino; /* INT_: direct copy */ + optr = ptr = cur->bc_ptrs[level]; + /* + * If we're off the left edge, return failure. + */ + if (ptr == 0) { + *stat = 0; + return 0; + } + /* + * Get pointers to the btree buffer and block. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; + /* + * Check that the new entry is being inserted in the right place. + */ + if (ptr <= numrecs) { + if (level == 0) { + rp = XFS_INOBT_REC_ADDR(block, ptr, cur); + xfs_btree_check_rec(cur->bc_btnum, recp, rp); + } else { + kp = XFS_INOBT_KEY_ADDR(block, ptr, cur); + xfs_btree_check_key(cur->bc_btnum, &key, kp); + } + } +#endif + nbno = NULLAGBLOCK; + ncur = (xfs_btree_cur_t *)0; + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + /* + * First, try shifting an entry to the right neighbor. + */ + if ((error = xfs_inobt_rshift(cur, level, &i))) + return error; + if (i) { + /* nothing */ + } + /* + * Next, try shifting an entry to the left neighbor. + */ + else { + if ((error = xfs_inobt_lshift(cur, level, &i))) + return error; + if (i) { + optr = ptr = cur->bc_ptrs[level]; + } else { + /* + * Next, try splitting the current block + * in half. If this works we have to + * re-set our variables because + * we could be in a different block now. + */ + if ((error = xfs_inobt_split(cur, level, &nbno, + &nkey, &ncur, &i))) + return error; + if (i) { + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, + block, level, bp))) + return error; +#endif + ptr = cur->bc_ptrs[level]; + nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */ + } else { + /* + * Otherwise the insert fails. + */ + *stat = 0; + return 0; + } + } + } + } + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + if (level > 0) { + /* + * It's a non-leaf entry. Make a hole for the new data + * in the key and ptr regions of the block. + */ + kp = XFS_INOBT_KEY_ADDR(block, 1, cur); + pp = XFS_INOBT_PTR_ADDR(block, 1, cur); +#ifdef DEBUG + for (i = numrecs; i >= ptr; i--) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level))) + return error; + } +#endif + memmove(&kp[ptr], &kp[ptr - 1], + (numrecs - ptr + 1) * sizeof(*kp)); + memmove(&pp[ptr], &pp[ptr - 1], + (numrecs - ptr + 1) * sizeof(*pp)); + /* + * Now stuff the new data in, bump numrecs and log the new data. + */ +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, *bnop, level))) + return error; +#endif + kp[ptr - 1] = key; /* INT_: struct copy */ + INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_inobt_log_keys(cur, bp, ptr, numrecs); + xfs_inobt_log_ptrs(cur, bp, ptr, numrecs); + } else { + /* + * It's a leaf entry. Make a hole for the new record. + */ + rp = XFS_INOBT_REC_ADDR(block, 1, cur); + memmove(&rp[ptr], &rp[ptr - 1], + (numrecs - ptr + 1) * sizeof(*rp)); + /* + * Now stuff the new record in, bump numrecs + * and log the new data. + */ + rp[ptr - 1] = *recp; /* INT_: struct copy */ + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_inobt_log_recs(cur, bp, ptr, numrecs); + } + /* + * Log the new number of records in the btree header. + */ + xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); +#ifdef DEBUG + /* + * Check that the key/record is in the right place, now. + */ + if (ptr < numrecs) { + if (level == 0) + xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, + rp + ptr); + else + xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, + kp + ptr); + } +#endif + /* + * If we inserted at the start of a block, update the parents' keys. + */ + if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1))) + return error; + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *bnop = nbno; + if (nbno != NULLAGBLOCK) { + *recp = nrec; /* INT_: struct copy */ + *curp = ncur; + } + *stat = 1; + return 0; +} + +/* + * Log header fields from a btree block. + */ +STATIC void +xfs_inobt_log_block( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short offsets[] = { /* table of offsets */ + offsetof(xfs_inobt_block_t, bb_magic), + offsetof(xfs_inobt_block_t, bb_level), + offsetof(xfs_inobt_block_t, bb_numrecs), + offsetof(xfs_inobt_block_t, bb_leftsib), + offsetof(xfs_inobt_block_t, bb_rightsib), + sizeof(xfs_inobt_block_t) + }; + + xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, first, last); +} + +/* + * Log keys from a btree block (nonleaf). + */ +STATIC void +xfs_inobt_log_keys( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int kfirst, /* index of first key to log */ + int klast) /* index of last key to log */ +{ + xfs_inobt_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + xfs_inobt_key_t *kp; /* key pointer in btree block */ + int last; /* last byte offset logged */ + + block = XFS_BUF_TO_INOBT_BLOCK(bp); + kp = XFS_INOBT_KEY_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Log block pointer fields from a btree block (nonleaf). + */ +STATIC void +xfs_inobt_log_ptrs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int pfirst, /* index of first pointer to log */ + int plast) /* index of last pointer to log */ +{ + xfs_inobt_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */ + + block = XFS_BUF_TO_INOBT_BLOCK(bp); + pp = XFS_INOBT_PTR_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Log records from a btree block (leaf). + */ +STATIC void +xfs_inobt_log_recs( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_buf_t *bp, /* buffer containing btree block */ + int rfirst, /* index of first record to log */ + int rlast) /* index of last record to log */ +{ + xfs_inobt_block_t *block; /* btree block to log from */ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + xfs_inobt_rec_t *rp; /* record pointer for btree block */ + + block = XFS_BUF_TO_INOBT_BLOCK(bp); + rp = XFS_INOBT_REC_ADDR(block, 1, cur); + first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); + last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * Return 0 if can't find any such record, 1 for success. + */ +STATIC int /* error */ +xfs_inobt_lookup( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + xfs_agblock_t agbno; /* a.g. relative btree block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_inobt_block_t *block=NULL; /* current btree block */ + __int64_t diff; /* difference for the current key */ + int error; /* error return value */ + int keyno=0; /* current key number */ + int level; /* level in the btree */ + xfs_mount_t *mp; /* file system mount point */ + + /* + * Get the allocation group header, and the root block number. + */ + mp = cur->bc_mp; + { + xfs_agi_t *agi; /* a.g. inode header */ + + agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); + agno = INT_GET(agi->agi_seqno, ARCH_CONVERT); + agbno = INT_GET(agi->agi_root, ARCH_CONVERT); + } + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + xfs_buf_t *bp; /* buffer pointer for btree block */ + xfs_daddr_t d; /* disk address of btree block */ + + /* + * Get the disk address we're looking for. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + /* + * If the old buffer at this level is for a different block, + * throw it away, otherwise just use it. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) != d) + bp = (xfs_buf_t *)0; + if (!bp) { + /* + * Need to get a new buffer. Read it, then + * set it in the cursor, releasing the old one. + */ + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, + agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) + return error; + xfs_btree_setbuf(cur, level, bp); + /* + * Point to the btree block, now that we have the buffer + */ + block = XFS_BUF_TO_INOBT_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, level, + bp))) + return error; + } else + block = XFS_BUF_TO_INOBT_BLOCK(bp); + /* + * If we already had a key match at a higher level, we know + * we need to use the first entry in this block. + */ + if (diff == 0) + keyno = 1; + /* + * Otherwise we need to search this block. Do a binary search. + */ + else { + int high; /* high entry number */ + xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */ + xfs_inobt_rec_t *krbase=NULL;/* base of records in block */ + int low; /* low entry number */ + + /* + * Get a pointer to keys or records. + */ + if (level > 0) + kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur); + else + krbase = XFS_INOBT_REC_ADDR(block, 1, cur); + /* + * Set low and high entry numbers, 1-based. + */ + low = 1; + if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) { + /* + * If the block is empty, the tree must + * be an empty leaf. + */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + *stat = 0; + return 0; + } + /* + * Binary search the block. + */ + while (low <= high) { + xfs_agino_t startino; /* key value */ + + /* + * keyno is average of low and high. + */ + keyno = (low + high) >> 1; + /* + * Get startino. + */ + if (level > 0) { + xfs_inobt_key_t *kkp; + + kkp = kkbase + keyno - 1; + startino = INT_GET(kkp->ir_startino, ARCH_CONVERT); + } else { + xfs_inobt_rec_t *krp; + + krp = krbase + keyno - 1; + startino = INT_GET(krp->ir_startino, ARCH_CONVERT); + } + /* + * Compute difference to get next direction. + */ + diff = (__int64_t) + startino - cur->bc_rec.i.ir_startino; + /* + * Less than, move right. + */ + if (diff < 0) + low = keyno + 1; + /* + * Greater than, move left. + */ + else if (diff > 0) + high = keyno - 1; + /* + * Equal, we're done. + */ + else + break; + } + } + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, keyno, cur), ARCH_CONVERT); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, agbno, level))) + return error; +#endif + cur->bc_ptrs[level] = keyno; + } + } + /* + * Done with the search. + * See if we need to adjust the results. + */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + if (dir == XFS_LOOKUP_GE && + keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) && + INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + int i; + + cur->bc_ptrs[0] = keyno; + if ((error = xfs_inobt_increment(cur, 0, &i))) + return error; + ASSERT(i == 1); + *stat = 1; + return 0; + } + } + else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + /* + * Return if we succeeded or not. + */ + if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) + *stat = 0; + else + *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); + return 0; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_inobt_lshift( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to shift record on */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ +#ifdef DEBUG + int i; /* loop index */ +#endif + xfs_inobt_key_t key; /* key value for leaf level upward */ + xfs_buf_t *lbp; /* buffer for left neighbor block */ + xfs_inobt_block_t *left; /* left neighbor btree block */ + xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */ + xfs_inobt_ptr_t *lpp; /* address pointer for left block */ + xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */ + int nrec; /* new number of left block entries */ + xfs_buf_t *rbp; /* buffer for right (current) block */ + xfs_inobt_block_t *right; /* right (current) btree block */ + xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */ + xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */ + xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */ + + /* + * Set up variables for this block as "right". + */ + rbp = cur->bc_bufs[level]; + right = XFS_BUF_TO_INOBT_BLOCK(rbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; +#endif + /* + * If we've got no left sibling then we can't shift an entry left. + */ + if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) { + *stat = 0; + return 0; + } + /* + * Set up the left neighbor as "left". + */ + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.i.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp, + XFS_INO_BTREE_REF))) + return error; + left = XFS_BUF_TO_INOBT_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; + /* + * If it's full, it can't take another entry. + */ + if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + *stat = 0; + return 0; + } + nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1; + /* + * If non-leaf, copy a key and a ptr to the left block. + */ + if (level > 0) { + lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur); + rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); + *lkp = *rkp; + xfs_inobt_log_keys(cur, lbp, nrec, nrec); + lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur); + rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) + return error; +#endif + *lpp = *rpp; /* INT_: no-change copy */ + xfs_inobt_log_ptrs(cur, lbp, nrec, nrec); + } + /* + * If leaf, copy a record to the left block. + */ + else { + lrp = XFS_INOBT_REC_ADDR(left, nrec, cur); + rrp = XFS_INOBT_REC_ADDR(right, 1, cur); + *lrp = *rrp; + xfs_inobt_log_recs(cur, lbp, nrec, nrec); + } + /* + * Bump and log left's numrecs, decrement and log right's numrecs. + */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1); + xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); +#ifdef DEBUG + if (level > 0) + xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); + else + xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); +#endif + INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1); + xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); + /* + * Slide the contents of right down one entry. + */ + if (level > 0) { +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + level))) + return error; + } +#endif + memmove(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memmove(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); + xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + } else { + memmove(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ + rkp = &key; + } + /* + * Update the parent key values of right. + */ + if ((error = xfs_inobt_updkey(cur, rkp, level + 1))) + return error; + /* + * Slide the cursor value left one. + */ + cur->bc_ptrs[level]--; + *stat = 1; + return 0; +} + +/* + * Allocate a new root block, fill it in. + */ +STATIC int /* error */ +xfs_inobt_newroot( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + xfs_agi_t *agi; /* a.g. inode header */ + xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_inobt_block_t *block; /* one half of the old root block */ + xfs_buf_t *bp; /* buffer containing block */ + int error; /* error return value */ + xfs_inobt_key_t *kp; /* btree key pointer */ + xfs_agblock_t lbno; /* left block number */ + xfs_buf_t *lbp; /* left buffer pointer */ + xfs_inobt_block_t *left; /* left btree block */ + xfs_buf_t *nbp; /* new (root) buffer */ + xfs_inobt_block_t *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + xfs_inobt_ptr_t *pp; /* btree address pointer */ + xfs_agblock_t rbno; /* right block number */ + xfs_buf_t *rbp; /* right buffer pointer */ + xfs_inobt_block_t *right; /* right btree block */ + xfs_inobt_rec_t *rp; /* btree record pointer */ + + ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); + + /* + * Get a block & a buffer. + */ + agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, + INT_GET(agi->agi_root, ARCH_CONVERT)); + args.mod = args.minleft = args.alignment = args.total = args.wasdel = + args.isfl = args.userdata = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + if ((error = xfs_alloc_vextent(&args))) + return error; + /* + * None available, we fail. + */ + if (args.fsbno == NULLFSBLOCK) { + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0); + new = XFS_BUF_TO_INOBT_BLOCK(nbp); + /* + * Set the root data in the a.g. inode structure. + */ + INT_SET(agi->agi_root, ARCH_CONVERT, args.agbno); + INT_MOD(agi->agi_level, ARCH_CONVERT, 1); + xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, + XFS_AGI_ROOT | XFS_AGI_LEVEL); + /* + * At the previous root level there are now two blocks: the old + * root, and the new block generated when it was split. + * We don't know which one the cursor is pointing at, so we + * set up variables "left" and "right" for each case. + */ + bp = cur->bc_bufs[cur->bc_nlevels - 1]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp))) + return error; +#endif + if (INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + /* + * Our block is left, pick up the right block. + */ + lbp = bp; + lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp)); + left = block; + rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, + rbno, 0, &rbp, XFS_INO_BTREE_REF))) + return error; + bp = rbp; + right = XFS_BUF_TO_INOBT_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, + cur->bc_nlevels - 1, rbp))) + return error; + nptr = 1; + } else { + /* + * Our block is right, pick up the left block. + */ + rbp = bp; + rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp)); + right = block; + lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, + lbno, 0, &lbp, XFS_INO_BTREE_REF))) + return error; + bp = lbp; + left = XFS_BUF_TO_INOBT_BLOCK(lbp); + if ((error = xfs_btree_check_sblock(cur, left, + cur->bc_nlevels - 1, lbp))) + return error; + nptr = 2; + } + /* + * Fill in the new block's btree header and log it. + */ + INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]); + INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels); + INT_SET(new->bb_numrecs, ARCH_CONVERT, 2); + INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK); + INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK); + xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS); + ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); + /* + * Fill in the key data in the new root. + */ + kp = XFS_INOBT_KEY_ADDR(new, 1, cur); + if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) { + kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */ + kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */ + } else { + rp = XFS_INOBT_REC_ADDR(left, 1, cur); + INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT); + rp = XFS_INOBT_REC_ADDR(right, 1, cur); + INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT); + } + xfs_inobt_log_keys(cur, nbp, 1, 2); + /* + * Fill in the pointer data in the new root. + */ + pp = XFS_INOBT_PTR_ADDR(new, 1, cur); + INT_SET(pp[0], ARCH_CONVERT, lbno); + INT_SET(pp[1], ARCH_CONVERT, rbno); + xfs_inobt_log_ptrs(cur, nbp, 1, 2); + /* + * Fix up the cursor. + */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + *stat = 1; + return 0; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_inobt_rshift( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to shift record on */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* loop index */ + xfs_inobt_key_t key; /* key value for leaf level upward */ + xfs_buf_t *lbp; /* buffer for left (current) block */ + xfs_inobt_block_t *left; /* left (current) btree block */ + xfs_inobt_key_t *lkp; /* key pointer for left block */ + xfs_inobt_ptr_t *lpp; /* address pointer for left block */ + xfs_inobt_rec_t *lrp; /* record pointer for left block */ + xfs_buf_t *rbp; /* buffer for right neighbor block */ + xfs_inobt_block_t *right; /* right neighbor btree block */ + xfs_inobt_key_t *rkp; /* key pointer for right block */ + xfs_inobt_ptr_t *rpp; /* address pointer for right block */ + xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */ + xfs_btree_cur_t *tcur; /* temporary cursor */ + + /* + * Set up variables for this block as "left". + */ + lbp = cur->bc_bufs[level]; + left = XFS_BUF_TO_INOBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; +#endif + /* + * If we've got no right sibling then we can't shift an entry right. + */ + if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) { + *stat = 0; + return 0; + } + /* + * Set up the right neighbor as "right". + */ + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp, + XFS_INO_BTREE_REF))) + return error; + right = XFS_BUF_TO_INOBT_BLOCK(rbp); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) + return error; + /* + * If it's full, it can't take another entry. + */ + if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + *stat = 0; + return 0; + } + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); + rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); +#ifdef DEBUG + if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) + return error; +#endif + *rkp = *lkp; /* INT_: no change copy */ + *rpp = *lpp; /* INT_: no change copy */ + xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + } else { + lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur); + rrp = XFS_INOBT_REC_ADDR(right, 1, cur); + memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + *rrp = *lrp; + xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1); + key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ + rkp = &key; + } + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1); + xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); +#ifdef DEBUG + if (level > 0) + xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); + else + xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); +#endif + xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + if ((error = xfs_btree_dup_cursor(cur, &tcur))) + return error; + xfs_btree_lastrec(tcur, level); + if ((error = xfs_inobt_increment(tcur, level, &i)) || + (error = xfs_inobt_updkey(tcur, rkp, level + 1))) { + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; + } + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + *stat = 1; + return 0; +} + +/* + * Split cur/level block in half. + * Return new block number and its first record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_inobt_split( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level to split */ + xfs_agblock_t *bnop, /* output: block number allocated */ + xfs_inobt_key_t *keyp, /* output: first key of new block */ + xfs_btree_cur_t **curp, /* output: new cursor */ + int *stat) /* success/failure */ +{ + xfs_alloc_arg_t args; /* allocation argument structure */ + int error; /* error return value */ + int i; /* loop index/record number */ + xfs_agblock_t lbno; /* left (current) block number */ + xfs_buf_t *lbp; /* buffer for left block */ + xfs_inobt_block_t *left; /* left (current) btree block */ + xfs_inobt_key_t *lkp; /* left btree key pointer */ + xfs_inobt_ptr_t *lpp; /* left btree address pointer */ + xfs_inobt_rec_t *lrp; /* left btree record pointer */ + xfs_buf_t *rbp; /* buffer for right block */ + xfs_inobt_block_t *right; /* right (new) btree block */ + xfs_inobt_key_t *rkp; /* right btree key pointer */ + xfs_inobt_ptr_t *rpp; /* right btree address pointer */ + xfs_inobt_rec_t *rrp; /* right btree record pointer */ + + /* + * Set up left block (current one). + */ + lbp = cur->bc_bufs[level]; + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp)); + /* + * Allocate the new block. + * If we can't do it, we're toast. Give up. + */ + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); + args.mod = args.minleft = args.alignment = args.total = args.wasdel = + args.isfl = args.userdata = args.minalignslop = 0; + args.minlen = args.maxlen = args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (args.fsbno == NULLFSBLOCK) { + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0); + /* + * Set up the new block as "right". + */ + right = XFS_BUF_TO_INOBT_BLOCK(rbp); + /* + * "Left" is the current (according to the cursor) block. + */ + left = XFS_BUF_TO_INOBT_BLOCK(lbp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) + return error; +#endif + /* + * Fill in the btree header for the new block. + */ + INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]); + right->bb_level = left->bb_level; /* INT_: direct copy */ + INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2)); + /* + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. + */ + if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) && + cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1) + INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1); + i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1; + /* + * For non-leaf blocks, copy keys and addresses over to the new block. + */ + if (level > 0) { + lkp = XFS_INOBT_KEY_ADDR(left, i, cur); + lpp = XFS_INOBT_PTR_ADDR(left, i, cur); + rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); + rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); +#ifdef DEBUG + for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) + return error; + } +#endif + memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); + memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); + xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + *keyp = *rkp; + } + /* + * For leaf blocks, copy records over to the new block. + */ + else { + lrp = XFS_INOBT_REC_ADDR(left, i, cur); + rrp = XFS_INOBT_REC_ADDR(right, 1, cur); + memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp)); + xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */ + } + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT))); + right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */ + INT_SET(left->bb_rightsib, ARCH_CONVERT, args.agbno); + INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno); + xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS); + xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) { + xfs_inobt_block_t *rrblock; /* rr btree block */ + xfs_buf_t *rrbp; /* buffer for rrblock */ + + if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, + INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp, + XFS_INO_BTREE_REF))) + return error; + rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) + return error; + INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.agbno); + xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT); + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. + */ + if (level + 1 < cur->bc_nlevels) { + if ((error = xfs_btree_dup_cursor(cur, curp))) + return error; + (*curp)->bc_ptrs[level + 1]++; + } + *bnop = args.agbno; + *stat = 1; + return 0; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int /* error */ +xfs_inobt_updkey( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_inobt_key_t *keyp, /* new key value to update to */ + int level) /* starting level for update */ +{ + int ptr; /* index of key in block */ + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. + */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { + xfs_buf_t *bp; /* buffer for block */ + xfs_inobt_block_t *block; /* btree block */ +#ifdef DEBUG + int error; /* error return value */ +#endif + xfs_inobt_key_t *kp; /* ptr to btree block keys */ + + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + ptr = cur->bc_ptrs[level]; + kp = XFS_INOBT_KEY_ADDR(block, ptr, cur); + *kp = *keyp; + xfs_inobt_log_keys(cur, bp, ptr, ptr); + } + return 0; +} + +/* + * Externally visible routines. + */ + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_inobt_decrement( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat) /* success/failure */ +{ + xfs_inobt_block_t *block; /* btree block */ + int error; + int lev; /* btree level */ + + ASSERT(level < cur->bc_nlevels); + /* + * Read-ahead to the left at this level. + */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + /* + * Decrement the ptr at this level. If we're still in the block + * then we're done. + */ + if (--cur->bc_ptrs[level] > 0) { + *stat = 1; + return 0; + } + /* + * Get a pointer to the btree block. + */ + block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, + cur->bc_bufs[level]))) + return error; +#endif + /* + * If we just went off the left edge of the tree, return failure. + */ + if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* + * Read-ahead the left block, we're going to read it + * in the next loop. + */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + /* + * If we went off the root then we are seriously confused. + */ + ASSERT(lev < cur->bc_nlevels); + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) { + xfs_agblock_t agbno; /* block number of btree block */ + xfs_buf_t *bp; /* buffer containing btree block */ + + agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.i.agno, agbno, 0, &bp, + XFS_INO_BTREE_REF))) + return error; + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_INOBT_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; + cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT); + } + *stat = 1; + return 0; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_inobt_delete( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + int error; + int i; /* result code */ + int level; /* btree level */ + + /* + * Go up the tree, starting at leaf level. + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + if ((error = xfs_inobt_delrec(cur, level, &i))) + return error; + } + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + if ((error = xfs_inobt_decrement(cur, level, &i))) + return error; + break; + } + } + } + *stat = i; + return 0; +} + + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agino_t *ino, /* output: starting inode of chunk */ + __int32_t *fcnt, /* output: number of free inodes */ + xfs_inofree_t *free, /* output: free inode mask */ + int *stat) /* output: success/failure */ +{ + xfs_inobt_block_t *block; /* btree block */ + xfs_buf_t *bp; /* buffer containing btree block */ +#ifdef DEBUG + int error; /* error return value */ +#endif + int ptr; /* record number */ + xfs_inobt_rec_t *rec; /* record data */ + + bp = cur->bc_bufs[0]; + ptr = cur->bc_ptrs[0]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, 0, bp))) + return error; +#endif + /* + * Off the right end or left end, return failure. + */ + if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) { + *stat = 0; + return 0; + } + /* + * Point to the record and extract its data. + */ + rec = XFS_INOBT_REC_ADDR(block, ptr, cur); + *ino = INT_GET(rec->ir_startino, ARCH_CONVERT); + *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT); + *free = INT_GET(rec->ir_free, ARCH_CONVERT); + *stat = 1; + return 0; +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_inobt_increment( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat) /* success/failure */ +{ + xfs_inobt_block_t *block; /* btree block */ + xfs_buf_t *bp; /* buffer containing btree block */ + int error; /* error return value */ + int lev; /* btree level */ + + ASSERT(level < cur->bc_nlevels); + /* + * Read-ahead to the right at this level. + */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + /* + * Get a pointer to the btree block. + */ + bp = cur->bc_bufs[level]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + return error; +#endif + /* + * Increment the ptr at this level. If we're still in the block + * then we're done. + */ + if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + *stat = 1; + return 0; + } + /* + * If we just went off the right edge of the tree, return failure. + */ + if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) { + *stat = 0; + return 0; + } + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + bp = cur->bc_bufs[lev]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; +#endif + if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) + break; + /* + * Read-ahead the right block, we're going to read it + * in the next loop. + */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + /* + * If we went off the root then we are seriously confused. + */ + ASSERT(lev < cur->bc_nlevels); + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp); + lev > level; ) { + xfs_agblock_t agbno; /* block number of btree block */ + + agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + cur->bc_private.i.agno, agbno, 0, &bp, + XFS_INO_BTREE_REF))) + return error; + lev--; + xfs_btree_setbuf(cur, lev, bp); + block = XFS_BUF_TO_INOBT_BLOCK(bp); + if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) + return error; + cur->bc_ptrs[lev] = 1; + } + *stat = 1; + return 0; +} + +/* + * Insert the current record at the point referenced by cur. + * The cursor may be inconsistent on return if splits have been done. + */ +int /* error */ +xfs_inobt_insert( + xfs_btree_cur_t *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + xfs_agblock_t nbno; /* new block number (split result) */ + xfs_btree_cur_t *ncur; /* new cursor (split result) */ + xfs_inobt_rec_t nrec; /* record being inserted this level */ + xfs_btree_cur_t *pcur; /* previous level's cursor */ + + level = 0; + nbno = NULLAGBLOCK; + INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino); + INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount); + INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free); + ncur = (xfs_btree_cur_t *)0; + pcur = cur; + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nbno into this level of the tree. + * Note if we fail, nbno will be null. + */ + if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur, + &i))) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + return error; + } + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* + * If we got a new cursor, switch to it. + */ + if (ncur) { + pcur = ncur; + ncur = (xfs_btree_cur_t *)0; + } + } while (nbno != NULLAGBLOCK); + *stat = i; + return 0; +} + +/* + * Lookup the record equal to ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_eq( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = fcnt; + cur->bc_rec.i.ir_free = free; + return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to ino + * in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_ge( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = fcnt; + cur->bc_rec.i.ir_free = free; + return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to ino + * in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_le( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = fcnt; + cur->bc_rec.i.ir_free = free; + return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur, to the value given + * by [ino, fcnt, free]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +int /* error */ +xfs_inobt_update( + xfs_btree_cur_t *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free) /* free inode mask */ +{ + xfs_inobt_block_t *block; /* btree block to update */ + xfs_buf_t *bp; /* buffer containing btree block */ + int error; /* error return value */ + int ptr; /* current record number (updating) */ + xfs_inobt_rec_t *rp; /* pointer to updated record */ + + /* + * Pick up the current block. + */ + bp = cur->bc_bufs[0]; + block = XFS_BUF_TO_INOBT_BLOCK(bp); +#ifdef DEBUG + if ((error = xfs_btree_check_sblock(cur, block, 0, bp))) + return error; +#endif + /* + * Get the address of the rec to be updated. + */ + ptr = cur->bc_ptrs[0]; + rp = XFS_INOBT_REC_ADDR(block, ptr, cur); + /* + * Fill in the new contents and log them. + */ + INT_SET(rp->ir_startino, ARCH_CONVERT, ino); + INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt); + INT_SET(rp->ir_free, ARCH_CONVERT, free); + xfs_inobt_log_recs(cur, bp, ptr, ptr); + /* + * Updating first record in leaf. Pass new key value up to our parent. + */ + if (ptr == 1) { + xfs_inobt_key_t key; /* key containing [ino] */ + + INT_SET(key.ir_startino, ARCH_CONVERT, ino); + if ((error = xfs_inobt_updkey(cur, &key, 1))) + return error; + } + return 0; +} diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h new file mode 100644 index 00000000000..803c4d17a05 --- /dev/null +++ b/fs/xfs/xfs_ialloc_btree.h @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_IALLOC_BTREE_H__ +#define __XFS_IALLOC_BTREE_H__ + +/* + * Inode map on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_btree_sblock; +struct xfs_mount; + +/* + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASKN) +xfs_inofree_t xfs_inobt_maskn(int i, int n); +#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n) +#else +#define XFS_INOBT_MASKN(i,n) \ + ((((n) >= XFS_INODES_PER_CHUNK ? \ + (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i)) +#endif + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec +{ + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +/* + * Key structure + */ +typedef struct xfs_inobt_key +{ + xfs_agino_t ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +typedef xfs_agblock_t xfs_inobt_ptr_t; /* btree pointer type */ + /* btree block header type */ +typedef struct xfs_btree_sblock xfs_inobt_block_t; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_INOBT_BLOCK) +xfs_inobt_block_t *xfs_buf_to_inobt_block(struct xfs_buf *bp); +#define XFS_BUF_TO_INOBT_BLOCK(bp) xfs_buf_to_inobt_block(bp) +#else +#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)(XFS_BUF_PTR(bp))) +#endif + +/* + * Bit manipulations for ir_free. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASK) +xfs_inofree_t xfs_inobt_mask(int i); +#define XFS_INOBT_MASK(i) xfs_inobt_mask(i) +#else +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_FREE) +int xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i); +#define XFS_INOBT_IS_FREE(rp,i) xfs_inobt_is_free(rp,i) +#else +#define XFS_INOBT_IS_FREE(rp,i) (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_SET_FREE) +void xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i); +#define XFS_INOBT_SET_FREE(rp,i) xfs_inobt_set_free(rp,i) +#else +#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_CLR_FREE) +void xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i); +#define XFS_INOBT_CLR_FREE(rp,i) xfs_inobt_clr_free(rp,i) +#else +#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) +#endif + +/* + * Real block structures have a size equal to the disk block size. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_SIZE) +int xfs_inobt_block_size(int lev, struct xfs_btree_cur *cur); +#define XFS_INOBT_BLOCK_SIZE(lev,cur) xfs_inobt_block_size(lev,cur) +#else +#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MAXRECS) +int xfs_inobt_block_maxrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) xfs_inobt_block_maxrecs(lev,cur) +#else +#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) \ + ((cur)->bc_mp->m_inobt_mxr[lev != 0]) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MINRECS) +int xfs_inobt_block_minrecs(int lev, struct xfs_btree_cur *cur); +#define XFS_INOBT_BLOCK_MINRECS(lev,cur) xfs_inobt_block_minrecs(lev,cur) +#else +#define XFS_INOBT_BLOCK_MINRECS(lev,cur) \ + ((cur)->bc_mp->m_inobt_mnr[lev != 0]) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_LAST_REC) +int xfs_inobt_is_last_rec(struct xfs_btree_cur *cur); +#define XFS_INOBT_IS_LAST_REC(cur) xfs_inobt_is_last_rec(cur) +#else +#define XFS_INOBT_IS_LAST_REC(cur) \ + ((cur)->bc_ptrs[0] == \ + INT_GET(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs, ARCH_CONVERT)) +#endif + +/* + * Maximum number of inode btree levels. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IN_MAXLEVELS) +int xfs_in_maxlevels(struct xfs_mount *mp); +#define XFS_IN_MAXLEVELS(mp) xfs_in_maxlevels(mp) +#else +#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) +#endif + +/* + * block numbers in the AG. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IBT_BLOCK) +xfs_agblock_t xfs_ibt_block(struct xfs_mount *mp); +#define XFS_IBT_BLOCK(mp) xfs_ibt_block(mp) +#else +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_PREALLOC_BLOCKS) +xfs_agblock_t xfs_prealloc_blocks(struct xfs_mount *mp); +#define XFS_PREALLOC_BLOCKS(mp) xfs_prealloc_blocks(mp) +#else +#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) +#endif + +/* + * Record, key, and pointer address macros for btree blocks. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_REC_ADDR) +xfs_inobt_rec_t * +xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_INOBT_REC_ADDR(bb,i,cur) xfs_inobt_rec_addr(bb,i,cur) +#else +#define XFS_INOBT_REC_ADDR(bb,i,cur) \ + XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, i, \ + XFS_INOBT_BLOCK_MAXRECS(0, cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_KEY_ADDR) +xfs_inobt_key_t * +xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_INOBT_KEY_ADDR(bb,i,cur) xfs_inobt_key_addr(bb,i,cur) +#else +#define XFS_INOBT_KEY_ADDR(bb,i,cur) \ + XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \ + XFS_INOBT_BLOCK_MAXRECS(1, cur)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_PTR_ADDR) +xfs_inobt_ptr_t * +xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur); +#define XFS_INOBT_PTR_ADDR(bb,i,cur) xfs_inobt_ptr_addr(bb,i,cur) +#else +#define XFS_INOBT_PTR_ADDR(bb,i,cur) \ + XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \ + XFS_INOBT_BLOCK_MAXRECS(1, cur)) +#endif + +/* + * Prototypes for externally visible routines. + */ + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_inobt_decrement( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat); /* success/failure */ + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_inobt_delete( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat); /* success/failure */ + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t *ino, /* output: starting inode of chunk */ + __int32_t *fcnt, /* output: number of free inodes */ + xfs_inofree_t *free, /* output: free inode mask */ + int *stat); /* output: success/failure */ + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_inobt_increment( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree, 0 is leaf */ + int *stat); /* success/failure */ + +/* + * Insert the current record at the point referenced by cur. + * The cursor may be inconsistent on return if splits have been done. + */ +int /* error */ +xfs_inobt_insert( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat); /* success/failure */ + +/* + * Lookup the record equal to ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat); /* success/failure */ + +/* + * Lookup the first record greater than or equal to ino + * in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat); /* success/failure */ + +/* + * Lookup the first record less than or equal to ino + * in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat); /* success/failure */ + +/* + * Update the record referred to by cur, to the value given + * by [ino, fcnt, free]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free); /* free inode mask */ + +#endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c new file mode 100644 index 00000000000..3a0ba1dfd0e --- /dev/null +++ b/fs/xfs/xfs_iget.c @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_bit.h" + +/* + * Initialize the inode hash table for the newly mounted file system. + * Choose an initial table size based on user specified value, else + * use a simple algorithm using the maximum number of inodes as an + * indicator for table size, and clamp it between one and some large + * number of pages. + */ +void +xfs_ihash_init(xfs_mount_t *mp) +{ + __uint64_t icount; + uint i, flags = KM_SLEEP | KM_MAYFAIL; + + if (!mp->m_ihsize) { + icount = mp->m_maxicount ? mp->m_maxicount : + (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog); + mp->m_ihsize = 1 << max_t(uint, 8, + (xfs_highbit64(icount) + 1) / 2); + mp->m_ihsize = min_t(uint, mp->m_ihsize, + (64 * NBPP) / sizeof(xfs_ihash_t)); + } + + while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize * + sizeof(xfs_ihash_t), flags))) { + if ((mp->m_ihsize >>= 1) <= NBPP) + flags = KM_SLEEP; + } + for (i = 0; i < mp->m_ihsize; i++) { + rwlock_init(&(mp->m_ihash[i].ih_lock)); + } +} + +/* + * Free up structures allocated by xfs_ihash_init, at unmount time. + */ +void +xfs_ihash_free(xfs_mount_t *mp) +{ + kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t)); + mp->m_ihash = NULL; +} + +/* + * Initialize the inode cluster hash table for the newly mounted file system. + * Its size is derived from the ihash table size. + */ +void +xfs_chash_init(xfs_mount_t *mp) +{ + uint i; + + mp->m_chsize = max_t(uint, 1, mp->m_ihsize / + (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)); + mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize); + mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize + * sizeof(xfs_chash_t), + KM_SLEEP); + for (i = 0; i < mp->m_chsize; i++) { + spinlock_init(&mp->m_chash[i].ch_lock,"xfshash"); + } +} + +/* + * Free up structures allocated by xfs_chash_init, at unmount time. + */ +void +xfs_chash_free(xfs_mount_t *mp) +{ + int i; + + for (i = 0; i < mp->m_chsize; i++) { + spinlock_destroy(&mp->m_chash[i].ch_lock); + } + + kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t)); + mp->m_chash = NULL; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the hash table for the file system + * represented by the mount point parameter mp. Each bucket of + * the hash table is guarded by an individual semaphore. + * + * If the inode is found in the hash table, its corresponding vnode + * is obtained with a call to vn_get(). This call takes care of + * coordination with the reclamation of the inode and vnode. Note + * that the vmap structure is filled in while holding the hash lock. + * This gives us the state of the inode/vnode when we found it and + * is used for coordination in vn_get(). + * + * If it is not in core, read it in from the file system's device and + * add the inode into the hash table. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + * bno -- the block number starting the buffer containing the inode, + * if known (as by bulkstat), else 0. + */ +STATIC int +xfs_iget_core( + vnode_t *vp, + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp, + xfs_daddr_t bno) +{ + xfs_ihash_t *ih; + xfs_inode_t *ip; + xfs_inode_t *iq; + vnode_t *inode_vp; + ulong version; + int error; + /* REFERENCED */ + xfs_chash_t *ch; + xfs_chashlist_t *chl, *chlnew; + SPLDECL(s); + + + ih = XFS_IHASH(mp, ino); + +again: + read_lock(&ih->ih_lock); + + for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { + if (ip->i_ino == ino) { + /* + * If INEW is set this inode is being set up + * we need to pause and try again. + */ + if (ip->i_flags & XFS_INEW) { + read_unlock(&ih->ih_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); + + goto again; + } + + inode_vp = XFS_ITOV_NULL(ip); + if (inode_vp == NULL) { + /* + * If IRECLAIM is set this inode is + * on its way out of the system, + * we need to pause and try again. + */ + if (ip->i_flags & XFS_IRECLAIM) { + read_unlock(&ih->ih_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); + + goto again; + } + + vn_trace_exit(vp, "xfs_iget.alloc", + (inst_t *)__return_address); + + XFS_STATS_INC(xs_ig_found); + + ip->i_flags &= ~XFS_IRECLAIMABLE; + read_unlock(&ih->ih_lock); + + XFS_MOUNT_ILOCK(mp); + list_del_init(&ip->i_reclaim); + XFS_MOUNT_IUNLOCK(mp); + + goto finish_inode; + + } else if (vp != inode_vp) { + struct inode *inode = LINVFS_GET_IP(inode_vp); + + /* The inode is being torn down, pause and + * try again. + */ + if (inode->i_state & (I_FREEING | I_CLEAR)) { + read_unlock(&ih->ih_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); + + goto again; + } +/* Chances are the other vnode (the one in the inode) is being torn + * down right now, and we landed on top of it. Question is, what do + * we do? Unhook the old inode and hook up the new one? + */ + cmn_err(CE_PANIC, + "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", + inode_vp, vp); + } + + read_unlock(&ih->ih_lock); + + XFS_STATS_INC(xs_ig_found); + +finish_inode: + if (ip->i_d.di_mode == 0) { + if (!(flags & IGET_CREATE)) + return ENOENT; + xfs_iocore_inode_reinit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + ip->i_flags &= ~XFS_ISTALE; + + vn_trace_exit(vp, "xfs_iget.found", + (inst_t *)__return_address); + goto return_ip; + } + } + + /* + * Inode cache miss: save the hash chain version stamp and unlock + * the chain, so we don't deadlock in vn_alloc. + */ + XFS_STATS_INC(xs_ig_missed); + + version = ih->ih_version; + + read_unlock(&ih->ih_lock); + + /* + * Read the disk inode attributes into a new inode structure and get + * a new vnode for it. This should also initialize i_ino and i_mount. + */ + error = xfs_iread(mp, tp, ino, &ip, bno); + if (error) { + return error; + } + + vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); + + xfs_inode_lock_init(ip, vp); + xfs_iocore_inode_init(ip); + + if (lock_flags != 0) { + xfs_ilock(ip, lock_flags); + } + + if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) { + xfs_idestroy(ip); + return ENOENT; + } + + /* + * Put ip on its hash chain, unless someone else hashed a duplicate + * after we released the hash lock. + */ + write_lock(&ih->ih_lock); + + if (ih->ih_version != version) { + for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { + if (iq->i_ino == ino) { + write_unlock(&ih->ih_lock); + xfs_idestroy(ip); + + XFS_STATS_INC(xs_ig_dup); + goto again; + } + } + } + + /* + * These values _must_ be set before releasing ihlock! + */ + ip->i_hash = ih; + if ((iq = ih->ih_next)) { + iq->i_prevp = &ip->i_next; + } + ip->i_next = iq; + ip->i_prevp = &ih->ih_next; + ih->ih_next = ip; + ip->i_udquot = ip->i_gdquot = NULL; + ih->ih_version++; + ip->i_flags |= XFS_INEW; + + write_unlock(&ih->ih_lock); + + /* + * put ip on its cluster's hash chain + */ + ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL && + ip->i_cnext == NULL); + + chlnew = NULL; + ch = XFS_CHASH(mp, ip->i_blkno); + chlredo: + s = mutex_spinlock(&ch->ch_lock); + for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { + if (chl->chl_blkno == ip->i_blkno) { + + /* insert this inode into the doubly-linked list + * where chl points */ + if ((iq = chl->chl_ip)) { + ip->i_cprev = iq->i_cprev; + iq->i_cprev->i_cnext = ip; + iq->i_cprev = ip; + ip->i_cnext = iq; + } else { + ip->i_cnext = ip; + ip->i_cprev = ip; + } + chl->chl_ip = ip; + ip->i_chash = chl; + break; + } + } + + /* no hash list found for this block; add a new hash list */ + if (chl == NULL) { + if (chlnew == NULL) { + mutex_spinunlock(&ch->ch_lock, s); + ASSERT(xfs_chashlist_zone != NULL); + chlnew = (xfs_chashlist_t *) + kmem_zone_alloc(xfs_chashlist_zone, + KM_SLEEP); + ASSERT(chlnew != NULL); + goto chlredo; + } else { + ip->i_cnext = ip; + ip->i_cprev = ip; + ip->i_chash = chlnew; + chlnew->chl_ip = ip; + chlnew->chl_blkno = ip->i_blkno; + chlnew->chl_next = ch->ch_list; + ch->ch_list = chlnew; + chlnew = NULL; + } + } else { + if (chlnew != NULL) { + kmem_zone_free(xfs_chashlist_zone, chlnew); + } + } + + mutex_spinunlock(&ch->ch_lock, s); + + + /* + * Link ip to its mount and thread it on the mount's inode list. + */ + XFS_MOUNT_ILOCK(mp); + if ((iq = mp->m_inodes)) { + ASSERT(iq->i_mprev->i_mnext == iq); + ip->i_mprev = iq->i_mprev; + iq->i_mprev->i_mnext = ip; + iq->i_mprev = ip; + ip->i_mnext = iq; + } else { + ip->i_mnext = ip; + ip->i_mprev = ip; + } + mp->m_inodes = ip; + + XFS_MOUNT_IUNLOCK(mp); + + return_ip: + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); + + ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == + ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1); + + return 0; +} + + +/* + * The 'normal' internal xfs_iget, if needed it will + * 'allocate', or 'get', the vnode. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp, + xfs_daddr_t bno) +{ + struct inode *inode; + vnode_t *vp = NULL; + int error; + +retry: + XFS_STATS_INC(xs_ig_attempts); + + if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) { + bhv_desc_t *bdp; + xfs_inode_t *ip; + int newnode; + + vp = LINVFS_GET_VP(inode); + if (inode->i_state & I_NEW) { +inode_allocate: + vn_initialize(inode); + error = xfs_iget_core(vp, mp, tp, ino, flags, + lock_flags, ipp, bno); + if (error) { + vn_mark_bad(vp); + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + iput(inode); + } + } else { + /* These are true if the inode is in inactive or + * reclaim. The linux inode is about to go away, + * wait for that path to finish, and try again. + */ + if (vp->v_flag & (VINACT | VRECLM)) { + vn_wait(vp); + iput(inode); + goto retry; + } + + if (is_bad_inode(inode)) { + iput(inode); + return EIO; + } + + bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); + if (bdp == NULL) { + XFS_STATS_INC(xs_ig_dup); + goto inode_allocate; + } + ip = XFS_BHVTOI(bdp); + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + newnode = (ip->i_d.di_mode == 0); + if (newnode) + xfs_iocore_inode_reinit(ip); + XFS_STATS_INC(xs_ig_found); + *ipp = ip; + error = 0; + } + } else + error = ENOMEM; /* If we got no inode we are out of memory */ + + return error; +} + +/* + * Do the setup for the various locks within the incore inode. + */ +void +xfs_inode_lock_init( + xfs_inode_t *ip, + vnode_t *vp) +{ + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", (long)vp->v_number); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number); + init_waitqueue_head(&ip->i_ipin_wait); + atomic_set(&ip->i_pincount, 0); + init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number); +} + +/* + * Look for the inode corresponding to the given ino in the hash table. + * If it is there and its i_transp pointer matches tp, return it. + * Otherwise, return NULL. + */ +xfs_inode_t * +xfs_inode_incore(xfs_mount_t *mp, + xfs_ino_t ino, + xfs_trans_t *tp) +{ + xfs_ihash_t *ih; + xfs_inode_t *ip; + + ih = XFS_IHASH(mp, ino); + read_lock(&ih->ih_lock); + for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { + if (ip->i_ino == ino) { + /* + * If we find it and tp matches, return it. + * Otherwise break from the loop and return + * NULL. + */ + if (ip->i_transp == tp) { + read_unlock(&ih->ih_lock); + return (ip); + } + break; + } + } + read_unlock(&ih->ih_lock); + return (NULL); +} + +/* + * Decrement reference count of an inode structure and unlock it. + * + * ip -- the inode being released + * lock_flags -- this parameter indicates the inode's locks to be + * to be released. See the comment on xfs_iunlock() for a list + * of valid values. + */ +void +xfs_iput(xfs_inode_t *ip, + uint lock_flags) +{ + vnode_t *vp = XFS_ITOV(ip); + + vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address); + + xfs_iunlock(ip, lock_flags); + + VN_RELE(vp); +} + +/* + * Special iput for brand-new inodes that are still locked + */ +void +xfs_iput_new(xfs_inode_t *ip, + uint lock_flags) +{ + vnode_t *vp = XFS_ITOV(ip); + struct inode *inode = LINVFS_GET_IP(vp); + + vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address); + + if ((ip->i_d.di_mode == 0)) { + ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE)); + vn_mark_bad(vp); + } + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + if (lock_flags) + xfs_iunlock(ip, lock_flags); + VN_RELE(vp); +} + + +/* + * This routine embodies the part of the reclaim code that pulls + * the inode from the inode hash table and the mount structure's + * inode list. + * This should only be called from xfs_reclaim(). + */ +void +xfs_ireclaim(xfs_inode_t *ip) +{ + vnode_t *vp; + + /* + * Remove from old hash list and mount list. + */ + XFS_STATS_INC(xs_ig_reclaims); + + xfs_iextract(ip); + + /* + * Here we do a spurious inode lock in order to coordinate with + * xfs_sync(). This is because xfs_sync() references the inodes + * in the mount list without taking references on the corresponding + * vnodes. We make that OK here by ensuring that we wait until + * the inode is unlocked in xfs_sync() before we go ahead and + * free it. We get both the regular lock and the io lock because + * the xfs_sync() code may need to drop the regular one but will + * still hold the io lock. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Release dquots (and their references) if any. An inode may escape + * xfs_inactive and get here via vn_alloc->vn_reclaim path. + */ + XFS_QM_DQDETACH(ip->i_mount, ip); + + /* + * Pull our behavior descriptor from the vnode chain. + */ + vp = XFS_ITOV_NULL(ip); + if (vp) { + vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); + } + + /* + * Free all memory associated with the inode. + */ + xfs_idestroy(ip); +} + +/* + * This routine removes an about-to-be-destroyed inode from + * all of the lists in which it is located with the exception + * of the behavior chain. + */ +void +xfs_iextract( + xfs_inode_t *ip) +{ + xfs_ihash_t *ih; + xfs_inode_t *iq; + xfs_mount_t *mp; + xfs_chash_t *ch; + xfs_chashlist_t *chl, *chm; + SPLDECL(s); + + ih = ip->i_hash; + write_lock(&ih->ih_lock); + if ((iq = ip->i_next)) { + iq->i_prevp = ip->i_prevp; + } + *ip->i_prevp = iq; + write_unlock(&ih->ih_lock); + + /* + * Remove from cluster hash list + * 1) delete the chashlist if this is the last inode on the chashlist + * 2) unchain from list of inodes + * 3) point chashlist->chl_ip to 'chl_next' if to this inode. + */ + mp = ip->i_mount; + ch = XFS_CHASH(mp, ip->i_blkno); + s = mutex_spinlock(&ch->ch_lock); + + if (ip->i_cnext == ip) { + /* Last inode on chashlist */ + ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); + ASSERT(ip->i_chash != NULL); + chm=NULL; + for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { + if (chl->chl_blkno == ip->i_blkno) { + if (chm == NULL) { + /* first item on the list */ + ch->ch_list = chl->chl_next; + } else { + chm->chl_next = chl->chl_next; + } + kmem_zone_free(xfs_chashlist_zone, chl); + break; + } else { + ASSERT(chl->chl_ip != ip); + chm = chl; + } + } + ASSERT_ALWAYS(chl != NULL); + } else { + /* delete one inode from a non-empty list */ + iq = ip->i_cnext; + iq->i_cprev = ip->i_cprev; + ip->i_cprev->i_cnext = iq; + if (ip->i_chash->chl_ip == ip) { + ip->i_chash->chl_ip = iq; + } + ip->i_chash = __return_address; + ip->i_cprev = __return_address; + ip->i_cnext = __return_address; + } + mutex_spinunlock(&ch->ch_lock, s); + + /* + * Remove from mount's inode list. + */ + XFS_MOUNT_ILOCK(mp); + ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL)); + iq = ip->i_mnext; + iq->i_mprev = ip->i_mprev; + ip->i_mprev->i_mnext = iq; + + /* + * Fix up the head pointer if it points to the inode being deleted. + */ + if (mp->m_inodes == ip) { + if (ip == iq) { + mp->m_inodes = NULL; + } else { + mp->m_inodes = iq; + } + } + + /* Deal with the deleted inodes list */ + list_del_init(&ip->i_reclaim); + + mp->m_ireclaims++; + XFS_MOUNT_IUNLOCK(mp); +} + +/* + * This is a wrapper routine around the xfs_ilock() routine + * used to centralize some grungy code. It is used in places + * that wish to lock the inode solely for reading the extents. + * The reason these places can't just call xfs_ilock(SHARED) + * is that the inode lock also guards to bringing in of the + * extents from disk for a file in b-tree format. If the inode + * is in b-tree format, then we need to lock the inode exclusively + * until the extents are read in. Locking it exclusively all + * the time would limit our parallelism unnecessarily, though. + * What we do instead is check to see if the extents have been + * read in yet, and only lock the inode exclusively if they + * have not. + * + * The function returns a value which should be given to the + * corresponding xfs_iunlock_map_shared(). This value is + * the mode in which the lock was actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock(xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + mrupdate(&ip->i_iolock); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + mraccess(&ip->i_iolock); + } + if (lock_flags & XFS_ILOCK_EXCL) { + mrupdate(&ip->i_lock); + } else if (lock_flags & XFS_ILOCK_SHARED) { + mraccess(&ip->i_lock); + } + xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + * + */ +int +xfs_ilock_nowait(xfs_inode_t *ip, + uint lock_flags) +{ + int iolocked; + int ilocked; + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); + + iolocked = 0; + if (lock_flags & XFS_IOLOCK_EXCL) { + iolocked = mrtryupdate(&ip->i_iolock); + if (!iolocked) { + return 0; + } + } else if (lock_flags & XFS_IOLOCK_SHARED) { + iolocked = mrtryaccess(&ip->i_iolock); + if (!iolocked) { + return 0; + } + } + if (lock_flags & XFS_ILOCK_EXCL) { + ilocked = mrtryupdate(&ip->i_lock); + if (!ilocked) { + if (iolocked) { + mrunlock(&ip->i_iolock); + } + return 0; + } + } else if (lock_flags & XFS_ILOCK_SHARED) { + ilocked = mrtryaccess(&ip->i_lock); + if (!ilocked) { + if (iolocked) { + mrunlock(&ip->i_iolock); + } + return 0; + } + } + xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); + return 1; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock(xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) || + (ismrlocked(&ip->i_iolock, MR_ACCESS))); + ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) || + (ismrlocked(&ip->i_iolock, MR_UPDATE))); + mrunlock(&ip->i_iolock); + } + + if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) { + ASSERT(!(lock_flags & XFS_ILOCK_SHARED) || + (ismrlocked(&ip->i_lock, MR_ACCESS))); + ASSERT(!(lock_flags & XFS_ILOCK_EXCL) || + (ismrlocked(&ip->i_lock, MR_UPDATE))); + mrunlock(&ip->i_lock); + + /* + * Let the AIL know that this item has been unlocked in case + * it is in the AIL and anyone is waiting on it. Don't do + * this if the caller has asked us not to. + */ + if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) && + ip->i_itemp != NULL) { + xfs_trans_unlocked_item(ip->i_mount, + (xfs_log_item_t*)(ip->i_itemp)); + } + } + xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote(xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) { + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + mrdemote(&ip->i_lock); + } + if (lock_flags & XFS_IOLOCK_EXCL) { + ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + mrdemote(&ip->i_iolock); + } +} + +/* + * The following three routines simply manage the i_flock + * semaphore embedded in the inode. This semaphore synchronizes + * processes attempting to flush the in-core inode back to disk. + */ +void +xfs_iflock(xfs_inode_t *ip) +{ + psema(&(ip->i_flock), PINOD|PLTWAIT); +} + +int +xfs_iflock_nowait(xfs_inode_t *ip) +{ + return (cpsema(&(ip->i_flock))); +} + +void +xfs_ifunlock(xfs_inode_t *ip) +{ + ASSERT(valusema(&(ip->i_flock)) <= 0); + vsema(&(ip->i_flock)); +} diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h new file mode 100644 index 00000000000..e385064a066 --- /dev/null +++ b/fs/xfs/xfs_imap.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_IMAP_H__ +#define __XFS_IMAP_H__ + +/* + * This is the structure passed to xfs_imap() to map + * an inode number to its on disk location. + */ +typedef struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + uint im_len; /* length in BBs of inode chunk */ + xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */ + ushort im_ioffset; /* inode offset in block in "inodes" */ + ushort im_boffset; /* inode offset in block in bytes */ +} xfs_imap_t; + +#ifdef __KERNEL__ +struct xfs_mount; +struct xfs_trans; +int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, + xfs_imap_t *, uint); +#endif + +#endif /* __XFS_IMAP_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c new file mode 100644 index 00000000000..43c632ab86a --- /dev/null +++ b/fs/xfs/xfs_inode.c @@ -0,0 +1,3876 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_imap.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" +#include "xfs_rw.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_utils.h" +#include "xfs_dir2_trace.h" +#include "xfs_quota.h" +#include "xfs_mac.h" +#include "xfs_acl.h" + + +kmem_zone_t *xfs_ifork_zone; +kmem_zone_t *xfs_inode_zone; +kmem_zone_t *xfs_chashlist_zone; + +/* + * Used in xfs_itruncate(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +STATIC void +xfs_validate_extents( + xfs_bmbt_rec_t *ep, + int nrecs, + int disk, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + rec.l0 = get_unaligned((__uint64_t*)&ep->l0); + rec.l1 = get_unaligned((__uint64_t*)&ep->l1); + if (disk) + xfs_bmbt_disk_get_all(&rec, &irec); + else + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + ep++; + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ep, nrecs, disk, fmt) +#endif /* DEBUG */ + +/* + * Check that none of the inode's in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { + xfs_fs_cmn_err(CE_ALERT, mp, + "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", + bp); + ASSERT(dip->di_next_unlinked); + } + } +} +#endif + +/* + * called from bwrite on xfs inode buffers + */ +void +xfs_inobp_bwcheck(xfs_buf_t *bp) +{ + xfs_mount_t *mp; + int i; + int j; + xfs_dinode_t *dip; + + ASSERT(XFS_BUF_FSPRIVATE3(bp, void *) != NULL); + + mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); + + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *) xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { + cmn_err(CE_WARN, +"Bad magic # 0x%x in XFS inode buffer 0x%Lx, starting blockno %Ld, offset 0x%x", + INT_GET(dip->di_core.di_magic, ARCH_CONVERT), + (__uint64_t)(__psunsigned_t) bp, + (__int64_t) XFS_BUF_ADDR(bp), + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + xfs_fs_cmn_err(CE_WARN, mp, + "corrupt, unmount and run xfs_repair"); + } + if (!dip->di_next_unlinked) { + cmn_err(CE_WARN, +"Bad next_unlinked field (0) in XFS inode buffer 0x%p, starting blockno %Ld, offset 0x%x", + (__uint64_t)(__psunsigned_t) bp, + (__int64_t) XFS_BUF_ADDR(bp), + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + xfs_fs_cmn_err(CE_WARN, mp, + "corrupt, unmount and run xfs_repair"); + } + } + + return; +} + +/* + * This routine is called to map an inode number within a file + * system to the buffer containing the on-disk version of the + * inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dip parameter + * it returns a pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and + * dipp are undefined. + * + * Use xfs_imap() to determine the size and location of the + * buffer to read from disk. + */ +int +xfs_inotobp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + xfs_dinode_t **dipp, + xfs_buf_t **bpp, + int *offset) +{ + int di_ok; + xfs_imap_t imap; + xfs_buf_t *bp; + int error; + xfs_dinode_t *dip; + + /* + * Call the space managment code to find the location of the + * inode on disk. + */ + imap.im_blkno = 0; + error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); + if (error != 0) { + cmn_err(CE_WARN, + "xfs_inotobp: xfs_imap() returned an " + "error %d on %s. Returning error.", error, mp->m_fsname); + return error; + } + + /* + * If the inode number maps to a block outside the bounds of the + * file system then return NULL rather than calling read_buf + * and panicing when we get an error from the driver. + */ + if ((imap.im_blkno + imap.im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + cmn_err(CE_WARN, + "xfs_inotobp: inode number (%d + %d) maps to a block outside the bounds " + "of the file system %s. Returning EINVAL.", + imap.im_blkno, imap.im_len,mp->m_fsname); + return XFS_ERROR(EINVAL); + } + + /* + * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will + * default to just a read_buf() call. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, + (int)imap.im_len, XFS_BUF_LOCK, &bp); + + if (error) { + cmn_err(CE_WARN, + "xfs_inotobp: xfs_trans_read_buf() returned an " + "error %d on %s. Returning error.", error, mp->m_fsname); + return error; + } + dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); + di_ok = + INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); + xfs_trans_brelse(tp, bp); + cmn_err(CE_WARN, + "xfs_inotobp: XFS_TEST_ERROR() returned an " + "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); + return XFS_ERROR(EFSCORRUPTED); + } + + xfs_inobp_check(mp, bp); + + /* + * Set *dipp to point to the on-disk inode in the buffer. + */ + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + *bpp = bp; + *offset = imap.im_boffset; + return 0; +} + + +/* + * This routine is called to map an inode to the buffer containing + * the on-disk version of the inode. It returns a pointer to the + * buffer containing the on-disk inode in the bpp parameter, and in + * the dip parameter it returns a pointer to the on-disk inode within + * that buffer. + * + * If a non-zero error is returned, then the contents of bpp and + * dipp are undefined. + * + * If the inode is new and has not yet been initialized, use xfs_imap() + * to determine the size and location of the buffer to read from disk. + * If the inode has already been mapped to its buffer and read in once, + * then use the mapping information stored in the inode rather than + * calling xfs_imap(). This allows us to avoid the overhead of looking + * at the inode btree for small block file systems (see xfs_dilocate()). + * We can tell whether the inode has been mapped in before by comparing + * its disk block address to 0. Only uninitialized inodes will have + * 0 for the disk block address. + */ +int +xfs_itobp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dinode_t **dipp, + xfs_buf_t **bpp, + xfs_daddr_t bno) +{ + xfs_buf_t *bp; + int error; + xfs_imap_t imap; +#ifdef __KERNEL__ + int i; + int ni; +#endif + + if (ip->i_blkno == (xfs_daddr_t)0) { + /* + * Call the space management code to find the location of the + * inode on disk. + */ + imap.im_blkno = bno; + error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP); + if (error != 0) { + return error; + } + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap.im_blkno + imap.im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " + "(imap.im_blkno (0x%llx) " + "+ imap.im_len (0x%llx)) > " + " XFS_FSB_TO_BB(mp, " + "mp->m_sb.sb_dblocks) (0x%llx)", + (unsigned long long) imap.im_blkno, + (unsigned long long) imap.im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); +#endif /* DEBUG */ + return XFS_ERROR(EINVAL); + } + + /* + * Fill in the fields in the inode that will be used to + * map the inode to its buffer from now on. + */ + ip->i_blkno = imap.im_blkno; + ip->i_len = imap.im_len; + ip->i_boffset = imap.im_boffset; + } else { + /* + * We've already mapped the inode once, so just use the + * mapping that we saved the first time. + */ + imap.im_blkno = ip->i_blkno; + imap.im_len = ip->i_len; + imap.im_boffset = ip->i_boffset; + } + ASSERT(bno == 0 || bno == imap.im_blkno); + + /* + * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will + * default to just a read_buf() call. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, + (int)imap.im_len, XFS_BUF_LOCK, &bp); + + if (error) { +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " + "xfs_trans_read_buf() returned error %d, " + "imap.im_blkno 0x%llx, imap.im_len 0x%llx", + error, (unsigned long long) imap.im_blkno, + (unsigned long long) imap.im_len); +#endif /* DEBUG */ + return error; + } +#ifdef __KERNEL__ + /* + * Validate the magic number and version of every inode in the buffer + * (if DEBUG kernel) or the first inode in the buffer, otherwise. + */ +#ifdef DEBUG + ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; +#else + ni = 1; +#endif + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { +#ifdef DEBUG + prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)", + mp->m_ddev_targp, + (unsigned long long)imap.im_blkno, i, + INT_GET(dip->di_core.di_magic, ARCH_CONVERT)); +#endif + XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, + mp, dip); + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + } +#endif /* __KERNEL__ */ + + xfs_inobp_check(mp, bp); + + /* + * Mark the buffer as an inode buffer now that it looks good + */ + XFS_BUF_SET_VTYPE(bp, B_FS_INO); + + /* + * Set *dipp to point to the on-disk inode in the buffer. + */ + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + *bpp = bp; + return 0; +} + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +STATIC int +xfs_iformat( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error; + xfs_fsize_t di_size; + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + error = 0; + + if (unlikely( + INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > + INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu." + " Unmount and run xfs_repair.", + (unsigned long long)ip->i_ino, + (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), + (unsigned long long) + INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, forkoff = 0x%x." + " Unmount and run xfs_repair.", + (unsigned long long)ip->i_ino, + (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ip->i_d.di_size = 0; + ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); + if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_ext_max = + XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT); + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = XFS_ERROR(EFSCORRUPTED); + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *ep, *dp; + xfs_ifork_t *ifp; + int nex; + int real_size; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu ((a)extents = %d). Unmount and run xfs_repair.", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + real_size = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else { + ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP); + ASSERT(ifp->if_u1.if_extents != NULL); + real_size = size; + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip)); + ep = ifp->if_u1.if_extents; + for (i = 0; i < nex; i++, ep++, dp++) { + ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0), + ARCH_CONVERT); + ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), + ARCH_CONVERT); + } + xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, + whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp->if_u1.if_extents, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(dfp); + nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max + || XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) + || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_fs_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu (btree). Unmount and run xfs_repair.", + (unsigned long long) ip->i_ino); + XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. + */ + xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +/* + * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk + * and native format + * + * buf = on-disk representation + * dip = native representation + * dir = direction - +ve -> disk to native + * -ve -> native to disk + */ +void +xfs_xlate_dinode_core( + xfs_caddr_t buf, + xfs_dinode_core_t *dip, + int dir) +{ + xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf; + xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip; + xfs_arch_t arch = ARCH_CONVERT; + + ASSERT(dir); + + INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch); + INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch); + INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch); + INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch); + INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch); + INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch); + INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch); + INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch); + INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch); + + if (dir > 0) { + memcpy(mem_core->di_pad, buf_core->di_pad, + sizeof(buf_core->di_pad)); + } else { + memcpy(buf_core->di_pad, mem_core->di_pad, + sizeof(buf_core->di_pad)); + } + + INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch); + + INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec, + dir, arch); + INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec, + dir, arch); + INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec, + dir, arch); + INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec, + dir, arch); + INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec, + dir, arch); + INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec, + dir, arch); + INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch); + INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch); + INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch); + INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch); + INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch); + INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch); + INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch); + INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch); + INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch); + INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch); + INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch); +} + +STATIC uint +_xfs_dic2xflags( + xfs_dinode_core_t *dic, + __uint16_t di_flags) +{ + uint flags = 0; + + if (di_flags & XFS_DIFLAG_ANY) { + if (di_flags & XFS_DIFLAG_REALTIME) + flags |= XFS_XFLAG_REALTIME; + if (di_flags & XFS_DIFLAG_PREALLOC) + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= XFS_XFLAG_SYNC; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= XFS_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= XFS_XFLAG_NODUMP; + if (di_flags & XFS_DIFLAG_RTINHERIT) + flags |= XFS_XFLAG_RTINHERIT; + if (di_flags & XFS_DIFLAG_PROJINHERIT) + flags |= XFS_XFLAG_PROJINHERIT; + if (di_flags & XFS_DIFLAG_NOSYMLINKS) + flags |= XFS_XFLAG_NOSYMLINKS; + } + + return flags; +} + +uint +xfs_ip2xflags( + xfs_inode_t *ip) +{ + xfs_dinode_core_t *dic = &ip->i_d; + + return _xfs_dic2xflags(dic, dic->di_flags) | + (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0); +} + +uint +xfs_dic2xflags( + xfs_dinode_core_t *dic) +{ + return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) | + (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0); +} + +/* + * Given a mount structure and an inode number, return a pointer + * to a newly allocated in-core inode coresponding to the given + * inode number. + * + * Initialize the inode's attributes and extent pointers if it + * already has them (it will not if the inode has no links). + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + xfs_inode_t **ipp, + xfs_daddr_t bno) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + xfs_inode_t *ip; + int error; + + ASSERT(xfs_inode_zone != NULL); + + ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); + ip->i_ino = ino; + ip->i_mount = mp; + + /* + * Get pointer's to the on-disk inode and the buffer containing it. + * If the inode number refers to a block outside the file system + * then xfs_itobp() will return NULL. In this case we should + * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will + * know that this is a new incore inode. + */ + error = xfs_itobp(mp, tp, ip, &dip, &bp, bno); + + if (error != 0) { + kmem_zone_free(xfs_inode_zone, ip); + return error; + } + + /* + * Initialize inode's trace buffers. + * Do this before xfs_iformat in case it adds entries. + */ +#ifdef XFS_BMAP_TRACE + ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_BMBT_TRACE + ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_RW_TRACE + ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_ILOCK_TRACE + ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_DIR2_TRACE + ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); +#endif + + /* + * If we got something that isn't an inode it means someone + * (nfs or dmi) has a stale handle. + */ + if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { + kmem_zone_free(xfs_inode_zone, ip); + xfs_trans_brelse(tp, bp); +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " + "dip->di_core.di_magic (0x%x) != " + "XFS_DINODE_MAGIC (0x%x)", + INT_GET(dip->di_core.di_magic, ARCH_CONVERT), + XFS_DINODE_MAGIC); +#endif /* DEBUG */ + return XFS_ERROR(EINVAL); + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_core.di_mode) { + xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, + &(ip->i_d), 1); + error = xfs_iformat(ip, dip); + if (error) { + kmem_zone_free(xfs_inode_zone, ip); + xfs_trans_brelse(tp, bp); +#ifdef DEBUG + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " + "xfs_iformat() returned error %d", + error); +#endif /* DEBUG */ + return error; + } + } else { + ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT); + ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT); + ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT); + ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT); + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + /* + * Initialize the per-fork minima and maxima for a new + * inode here. xfs_iformat will do it for old inodes. + */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + } + + INIT_LIST_HEAD(&ip->i_reclaim); + + /* + * The inode format changed when we moved the link count and + * made it 32 bits long. If this is an old format inode, + * convert it in memory to look like a new one. If it gets + * flushed to disk we will convert back before flushing or + * logging it. We zero out the new projid field and the old link + * count field. We'll handle clearing the pad field (the remains + * of the old uuid field) when we actually convert the inode to + * the new format. We don't change the version number so that we + * can distinguish this from a real new format inode. + */ + if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { + ip->i_d.di_nlink = ip->i_d.di_onlink; + ip->i_d.di_onlink = 0; + ip->i_d.di_projid = 0; + } + + ip->i_delayed_blks = 0; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + XFS_BUF_SET_REF(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the + * on-disk inode, because it was acquired with xfs_trans_read_buf() + * in xfs_itobp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be + * locking the new in-core inode before putting it in the hash + * table where other processes can find it. Thus we don't have + * to worry about the inode being changed just because we released + * the buffer. + */ + xfs_trans_brelse(tp, bp); + *ipp = ip; + return 0; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. + */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + size_t size; + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t); + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP); + ASSERT(ifp->if_u1.if_extents != NULL); + ifp->if_lastex = NULLEXTNUM; + ifp->if_bytes = ifp->if_real_bytes = (int)size; + ifp->if_flags |= XFS_IFEXTENTS; + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + kmem_free(ifp->if_u1.if_extents, size); + ifp->if_u1.if_extents = NULL; + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents, + XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip)); + return 0; +} + +/* + * Allocate an inode on disk and return a copy of its in-core version. + * The in-core inode is locked exclusively. Set mode, nlink, and rdev + * appropriately within the inode. The uid and gid for the inode are + * set according to the contents of the given cred structure. + * + * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() + * has a free inode available, call xfs_iget() + * to obtain the in-core version of the allocated inode. Finally, + * fill in the inode and log its initial contents. In this case, + * ialloc_context would be set to NULL and call_again set to false. + * + * If xfs_dialloc() does not have an available inode, + * it will replenish its supply by doing an allocation. Since we can + * only do one allocation within a transaction without deadlocks, we + * must commit the current transaction before returning the inode itself. + * In this case, therefore, we will set call_again to true and return. + * The caller should then commit the current transaction, start a new + * transaction, and call xfs_ialloc() again to actually get the inode. + * + * To ensure that some other process does not grab the inode that + * was allocated during the first call to xfs_ialloc(), this routine + * also returns the [locked] bp pointing to the head of the freelist + * as ialloc_context. The caller should hold this buffer across + * the commit and pass it back into this routine on the second call. + */ +int +xfs_ialloc( + xfs_trans_t *tp, + xfs_inode_t *pip, + mode_t mode, + nlink_t nlink, + xfs_dev_t rdev, + cred_t *cr, + xfs_prid_t prid, + int okalloc, + xfs_buf_t **ialloc_context, + boolean_t *call_again, + xfs_inode_t **ipp) +{ + xfs_ino_t ino; + xfs_inode_t *ip; + vnode_t *vp; + uint flags; + int error; + + /* + * Call the space management code to pick + * the on-disk inode to be allocated. + */ + error = xfs_dialloc(tp, pip->i_ino, mode, okalloc, + ialloc_context, call_again, &ino); + if (error != 0) { + return error; + } + if (*call_again || ino == NULLFSINO) { + *ipp = NULL; + return 0; + } + ASSERT(*ialloc_context == NULL); + + /* + * Get the in-core inode with the lock held exclusively. + * This is because we're setting fields here we need + * to prevent others from looking at until we're done. + */ + error = xfs_trans_iget(tp->t_mountp, tp, ino, + IGET_CREATE, XFS_ILOCK_EXCL, &ip); + if (error != 0) { + return error; + } + ASSERT(ip != NULL); + + vp = XFS_ITOV(ip); + vp->v_type = IFTOVT(mode); + ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_onlink = 0; + ip->i_d.di_nlink = nlink; + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = current_fsuid(cr); + ip->i_d.di_gid = current_fsgid(cr); + ip->i_d.di_projid = prid; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + + /* + * If the superblock version is up to where we support new format + * inodes and this is currently an old format inode, then change + * the inode version number now. This way we only do the conversion + * here rather than here and in the flush/logging code. + */ + if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) && + ip->i_d.di_version == XFS_DINODE_VERSION_1) { + ip->i_d.di_version = XFS_DINODE_VERSION_2; + /* + * We've already zeroed the old link count, the projid field, + * and the pad field. + */ + } + + /* + * Project ids won't be stored on disk if we are using a version 1 inode. + */ + if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) + xfs_bump_ino_vers2(tp, ip); + + if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { + ip->i_d.di_gid = pip->i_d.di_gid; + if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + ip->i_d.di_mode |= S_ISGID; + } + } + + /* + * If the group ID of the new file does not match the effective group + * ID or one of the supplementary group IDs, the S_ISGID bit is cleared + * (and only if the irix_sgid_inherit compatibility variable is set). + */ + if ((irix_sgid_inherit) && + (ip->i_d.di_mode & S_ISGID) && + (!in_group_p((gid_t)ip->i_d.di_gid))) { + ip->i_d.di_mode &= ~S_ISGID; + } + + ip->i_d.di_size = 0; + ip->i_d.di_nextents = 0; + ASSERT(ip->i_d.di_nblocks == 0); + xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); + /* + * di_gen will have been taken care of in xfs_iread. + */ + ip->i_d.di_extsize = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_u2.if_rdev = rdev; + ip->i_df.if_flags = 0; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + case S_IFDIR: + if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { + if ((mode & S_IFMT) == S_IFDIR) { + ip->i_d.di_flags |= XFS_DIFLAG_RTINHERIT; + } else { + ip->i_d.di_flags |= XFS_DIFLAG_REALTIME; + ip->i_iocore.io_flags |= XFS_IOCORE_RT; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + ip->i_d.di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + ip->i_d.di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + ip->i_d.di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + ip->i_d.di_flags |= XFS_DIFLAG_NOSYMLINKS; + } + /* FALLTHROUGH */ + case S_IFLNK: + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_flags = XFS_IFEXTENTS; + ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; + ip->i_df.if_u1.if_extents = NULL; + break; + default: + ASSERT(0); + } + /* + * Attribute fork settings for new inode. + */ + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_anextents = 0; + + /* + * Log the new values stuffed into the inode. + */ + xfs_trans_log_inode(tp, ip, flags); + + /* now that we have a v_type we can set Linux inode ops (& unlock) */ + VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1); + + *ipp = ip; + return 0; +} + +/* + * Check to make sure that there are no blocks allocated to the + * file beyond the size of the file. We don't check this for + * files with fixed size extents or real time extents, but we + * at least do it for regular files. + */ +#ifdef DEBUG +void +xfs_isize_check( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_fsize_t isize) +{ + xfs_fileoff_t map_first; + int nimaps; + xfs_bmbt_irec_t imaps[2]; + + if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + return; + + if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME ) + return; + + nimaps = 2; + map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + /* + * The filesystem could be shutting down, so bmapi may return + * an error. + */ + if (xfs_bmapi(NULL, ip, map_first, + (XFS_B_TO_FSB(mp, + (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - + map_first), + XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, + NULL)) + return; + ASSERT(nimaps == 1); + ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); +} +#endif /* DEBUG */ + +/* + * Calculate the last possible buffered byte in a file. This must + * include data that was buffered beyond the EOF by the write code. + * This also needs to deal with overflowing the xfs_fsize_t type + * which can happen for sizes near the limit. + * + * We also need to take into account any blocks beyond the EOF. It + * may be the case that they were buffered by a write which failed. + * In that case the pages will still be in memory, but the inode size + * will never have been updated. + */ +xfs_fsize_t +xfs_file_last_byte( + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_fsize_t last_byte; + xfs_fileoff_t last_block; + xfs_fileoff_t size_last_block; + int error; + + ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); + + mp = ip->i_mount; + /* + * Only check for blocks beyond the EOF if the extents have + * been read in. This eliminates the need for the inode lock, + * and it also saves us from looking when it really isn't + * necessary. + */ + if (ip->i_df.if_flags & XFS_IFEXTENTS) { + error = xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + if (error) { + last_block = 0; + } + } else { + last_block = 0; + } + size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size); + last_block = XFS_FILEOFF_MAX(last_block, size_last_block); + + last_byte = XFS_FSB_TO_B(mp, last_block); + if (last_byte < 0) { + return XFS_MAXIOFFSET(mp); + } + last_byte += (1 << mp->m_writeio_log); + if (last_byte < 0) { + return XFS_MAXIOFFSET(mp); + } + return last_byte; +} + +#if defined(XFS_RW_TRACE) +STATIC void +xfs_itrunc_trace( + int tag, + xfs_inode_t *ip, + int flag, + xfs_fsize_t new_size, + xfs_off_t toss_start, + xfs_off_t toss_finish) +{ + if (ip->i_rwtrace == NULL) { + return; + } + + ktrace_enter(ip->i_rwtrace, + (void*)((long)tag), + (void*)ip, + (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff), + (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff), + (void*)((long)flag), + (void*)(unsigned long)((new_size >> 32) & 0xffffffff), + (void*)(unsigned long)(new_size & 0xffffffff), + (void*)(unsigned long)((toss_start >> 32) & 0xffffffff), + (void*)(unsigned long)(toss_start & 0xffffffff), + (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff), + (void*)(unsigned long)(toss_finish & 0xffffffff), + (void*)(unsigned long)current_cpu(), + (void*)0, + (void*)0, + (void*)0, + (void*)0); +} +#else +#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish) +#endif + +/* + * Start the truncation of the file to new_size. The new size + * must be smaller than the current size. This routine will + * clear the buffer and page caches of file data in the removed + * range, and xfs_itruncate_finish() will remove the underlying + * disk blocks. + * + * The inode must have its I/O lock locked EXCLUSIVELY, and it + * must NOT have the inode lock held at all. This is because we're + * calling into the buffer/page cache code and we can't hold the + * inode lock when we do so. + * + * The flags parameter can have either the value XFS_ITRUNC_DEFINITE + * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used + * in the case that the caller is locking things out of order and + * may not be able to call xfs_itruncate_finish() with the inode lock + * held without dropping the I/O lock. If the caller must drop the + * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() + * must be called again with all the same restrictions as the initial + * call. + */ +void +xfs_itruncate_start( + xfs_inode_t *ip, + uint flags, + xfs_fsize_t new_size) +{ + xfs_fsize_t last_byte; + xfs_off_t toss_start; + xfs_mount_t *mp; + vnode_t *vp; + + ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); + ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); + ASSERT((flags == XFS_ITRUNC_DEFINITE) || + (flags == XFS_ITRUNC_MAYBE)); + + mp = ip->i_mount; + vp = XFS_ITOV(ip); + /* + * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers + * overlapping the region being removed. We have to use + * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the + * caller may not be able to finish the truncate without + * dropping the inode's I/O lock. Make sure + * to catch any pages brought in by buffers overlapping + * the EOF by searching out beyond the isize by our + * block size. We round new_size up to a block boundary + * so that we don't toss things on the same block as + * new_size but before it. + * + * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to + * call remapf() over the same region if the file is mapped. + * This frees up mapped file references to the pages in the + * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures + * that we get the latest mapped changes flushed out. + */ + toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + toss_start = XFS_FSB_TO_B(mp, toss_start); + if (toss_start < 0) { + /* + * The place to start tossing is beyond our maximum + * file size, so there is no way that the data extended + * out there. + */ + return; + } + last_byte = xfs_file_last_byte(ip); + xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, + last_byte); + if (last_byte > toss_start) { + if (flags & XFS_ITRUNC_DEFINITE) { + VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED); + } else { + VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED); + } + } + +#ifdef DEBUG + if (new_size == 0) { + ASSERT(VN_CACHED(vp) == 0); + } +#endif +} + +/* + * Shrink the file to the given new_size. The new + * size must be smaller than the current size. + * This will free up the underlying blocks + * in the removed range after a call to xfs_itruncate_start() + * or xfs_atruncate_start(). + * + * The transaction passed to this routine must have made + * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. + * This routine may commit the given transaction and + * start new ones, so make sure everything involved in + * the transaction is tidy before calling here. + * Some transaction will be returned to the caller to be + * committed. The incoming transaction must already include + * the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On + * return the inode will be "held" within the returned transaction. + * This routine does NOT require any disk space to be reserved + * for it within the transaction. + * + * The fork parameter must be either xfs_attr_fork or xfs_data_fork, + * and it indicates the fork which is to be truncated. For the + * attribute fork we only support truncation to size 0. + * + * We use the sync parameter to indicate whether or not the first + * transaction we perform might have to be synchronous. For the attr fork, + * it needs to be so if the unlink of the inode is not yet known to be + * permanent in the log. This keeps us from freeing and reusing the + * blocks of the attribute fork before the unlink of the inode becomes + * permanent. + * + * For the data fork, we normally have to run synchronously if we're + * being called out of the inactive path or we're being called + * out of the create path where we're truncating an existing file. + * Either way, the truncate needs to be sync so blocks don't reappear + * in the file with altered data in case of a crash. wsync filesystems + * can run the first case async because anything that shrinks the inode + * has to run sync so by the time we're called here from inactive, the + * inode size is permanently set to 0. + * + * Calls from the truncate path always need to be sync unless we're + * in a wsync filesystem and the file has already been unlinked. + * + * The caller is responsible for correctly setting the sync parameter. + * It gets too hard for us to guess here which path we're being called + * out of just based on inode state. + */ +int +xfs_itruncate_finish( + xfs_trans_t **tp, + xfs_inode_t *ip, + xfs_fsize_t new_size, + int fork, + int sync) +{ + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len=0; + xfs_mount_t *mp; + xfs_trans_t *ntp; + int done; + int committed; + xfs_bmap_free_t free_list; + int error; + + ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); + ASSERT(*tp != NULL); + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(ip->i_transp == *tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); + + + ntp = *tp; + mp = (ntp)->t_mountp; + ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); + + /* + * We only support truncating the entire attribute fork. + */ + if (fork == XFS_ATTR_FORK) { + new_size = 0LL; + } + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); + /* + * The first thing we do is set the size to new_size permanently + * on disk. This way we don't have to worry about anyone ever + * being able to look at the data being freed even in the face + * of a crash. What we're getting around here is the case where + * we free a block, it is allocated to another file, it is written + * to, and then we crash. If the new data gets written to the + * file but the log buffers containing the free and reallocation + * don't, then we'd end up with garbage in the blocks being freed. + * As long as we make the new_size permanent before actually + * freeing any blocks it doesn't matter if they get writtten to. + * + * The callers must signal into us whether or not the size + * setting here must be synchronous. There are a few cases + * where it doesn't have to be synchronous. Those cases + * occur if the file is unlinked and we know the unlink is + * permanent or if the blocks being truncated are guaranteed + * to be beyond the inode eof (regardless of the link count) + * and the eof value is permanent. Both of these cases occur + * only on wsync-mounted filesystems. In those cases, we're + * guaranteed that no user will ever see the data in the blocks + * that are being truncated so the truncate can run async. + * In the free beyond eof case, the file may wind up with + * more blocks allocated to it than it needs if we crash + * and that won't get fixed until the next time the file + * is re-opened and closed but that's ok as that shouldn't + * be too many blocks. + * + * However, we can't just make all wsync xactions run async + * because there's one call out of the create path that needs + * to run sync where it's truncating an existing file to size + * 0 whose size is > 0. + * + * It's probably possible to come up with a test in this + * routine that would correctly distinguish all the above + * cases from the values of the function parameters and the + * inode state but for sanity's sake, I've decided to let the + * layers above just tell us. It's simpler to correctly figure + * out in the layer above exactly under what conditions we + * can run async and I think it's easier for others read and + * follow the logic in case something has to be changed. + * cscope is your friend -- rcc. + * + * The attribute fork is much simpler. + * + * For the attribute fork we allow the caller to tell us whether + * the unlink of the inode that led to this call is yet permanent + * in the on disk log. If it is not and we will be freeing extents + * in this inode then we make the first transaction synchronous + * to make sure that the unlink is permanent by the time we free + * the blocks. + */ + if (fork == XFS_DATA_FORK) { + if (ip->i_d.di_nextents > 0) { + ip->i_d.di_size = new_size; + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } + } else if (sync) { + ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); + if (ip->i_d.di_anextents > 0) + xfs_trans_set_sync(ntp); + } + ASSERT(fork == XFS_DATA_FORK || + (fork == XFS_ATTR_FORK && + ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || + (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); + + /* + * Since it is possible for space to become allocated beyond + * the end of the file (in a crash where the space is allocated + * but the inode size is not yet updated), simply remove any + * blocks which show up between the new EOF and the maximum + * possible file size. If the first block to be removed is + * beyond the maximum file size (ie it is the same as last_block), + * then there is nothing to do. + */ + last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + ASSERT(first_unmap_block <= last_block); + done = 0; + if (last_block == first_unmap_block) { + done = 1; + } else { + unmap_len = last_block - first_unmap_block + 1; + } + while (!done) { + /* + * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() + * will tell us whether it freed the entire range or + * not. If this is a synchronous mount (wsync), + * then we can tell bunmapi to keep all the + * transactions asynchronous since the unlink + * transaction that made this inode inactive has + * already hit the disk. There's no danger of + * the freed blocks being reused, there being a + * crash, and the reused blocks suddenly reappearing + * in this file with garbage in them once recovery + * runs. + */ + XFS_BMAP_INIT(&free_list, &first_block); + error = xfs_bunmapi(ntp, ip, first_unmap_block, + unmap_len, + XFS_BMAPI_AFLAG(fork) | + (sync ? 0 : XFS_BMAPI_ASYNC), + XFS_ITRUNC_MAX_EXTENTS, + &first_block, &free_list, &done); + if (error) { + /* + * If the bunmapi call encounters an error, + * return to the caller where the transaction + * can be properly aborted. We just need to + * make sure we're not holding any resources + * that we were not when we came in. + */ + xfs_bmap_cancel(&free_list); + return error; + } + + /* + * Duplicate the transaction that has the permanent + * reservation and commit the old transaction. + */ + error = xfs_bmap_finish(tp, &free_list, first_block, + &committed); + ntp = *tp; + if (error) { + /* + * If the bmap finish call encounters an error, + * return to the caller where the transaction + * can be properly aborted. We just need to + * make sure we're not holding any resources + * that we were not when we came in. + * + * Aborting from this point might lose some + * blocks in the file system, but oh well. + */ + xfs_bmap_cancel(&free_list); + if (committed) { + /* + * If the passed in transaction committed + * in xfs_bmap_finish(), then we want to + * add the inode to this one before returning. + * This keeps things simple for the higher + * level code, because it always knows that + * the inode is locked and held in the + * transaction that returns to it whether + * errors occur or not. We don't mark the + * inode dirty so that this transaction can + * be easily aborted if possible. + */ + xfs_trans_ijoin(ntp, ip, + XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(ntp, ip); + } + return error; + } + + if (committed) { + /* + * The first xact was committed, + * so add the inode to the new one. + * Mark it dirty so it will be logged + * and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(ntp, ip, + XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(ntp, ip); + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + } + ntp = xfs_trans_dup(ntp); + (void) xfs_trans_commit(*tp, 0, NULL); + *tp = ntp; + error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + /* + * Add the inode being truncated to the next chained + * transaction. + */ + xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(ntp, ip); + if (error) + return (error); + } + /* + * Only update the size in the case of the data fork, but + * always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + if (fork == XFS_DATA_FORK) { + xfs_isize_check(mp, ip, new_size); + ip->i_d.di_size = new_size; + } + xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + ASSERT((new_size != 0) || + (fork == XFS_ATTR_FORK) || + (ip->i_delayed_blks == 0)); + ASSERT((new_size != 0) || + (fork == XFS_ATTR_FORK) || + (ip->i_d.di_nextents == 0)); + xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); + return 0; +} + + +/* + * xfs_igrow_start + * + * Do the first part of growing a file: zero any data in the last + * block that is beyond the old EOF. We need to do this before + * the inode is joined to the transaction to modify the i_size. + * That way we can drop the inode lock and call into the buffer + * cache to get the buffer mapping the EOF. + */ +int +xfs_igrow_start( + xfs_inode_t *ip, + xfs_fsize_t new_size, + cred_t *credp) +{ + xfs_fsize_t isize; + int error; + + ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); + ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); + ASSERT(new_size > ip->i_d.di_size); + + error = 0; + isize = ip->i_d.di_size; + /* + * Zero any pages that may have been created by + * xfs_write_file() beyond the end of the file + * and any blocks between the old and new file sizes. + */ + error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, + new_size); + return error; +} + +/* + * xfs_igrow_finish + * + * This routine is called to extend the size of a file. + * The inode must have both the iolock and the ilock locked + * for update and it must be a part of the current transaction. + * The xfs_igrow_start() function must have been called previously. + * If the change_flag is not zero, the inode change timestamp will + * be updated. + */ +void +xfs_igrow_finish( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_fsize_t new_size, + int change_flag) +{ + ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); + ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); + ASSERT(ip->i_transp == tp); + ASSERT(new_size > ip->i_d.di_size); + + /* + * Update the file size. Update the inode change timestamp + * if change_flag set. + */ + ip->i_d.di_size = new_size; + if (change_flag) + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + +} + + +/* + * This is called when the inode's link count goes to 0. + * We place the on-disk inode on a list in the AGI. It + * will be pulled from this list when the inode is freed. + */ +int +xfs_iunlink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agnumber_t agno; + xfs_daddr_t agdaddr; + xfs_agino_t agino; + short bucket_index; + int offset; + int error; + int agi_ok; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + ASSERT(ip->i_transp == tp); + + mp = tp->t_mountp; + + agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, + XFS_FSS_TO_BB(mp, 1), 0, &agibp); + if (error) { + return error; + } + /* + * Validate the magic number of the agi block. + */ + agi = XFS_BUF_TO_AGI(agibp); + agi_ok = + INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT)); + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, + XFS_RANDOM_IUNLINK))) { + XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); + xfs_trans_brelse(tp, agibp); + return XFS_ERROR(EFSCORRUPTED); + } + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index]); + ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino); + + if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) { + /* + * There is already another inode in the bucket we need + * to add ourselves to. Add us at the front of the list. + * Here we put the head pointer into our next pointer, + * and then we fall through to point the head at us. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + if (error) { + return error; + } + ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO); + ASSERT(dip->di_next_unlinked); + /* both on-disk, don't endian flip twice */ + dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; + offset = ip->i_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } + + /* + * Point the bucket head pointer at the inode being inserted. + */ + ASSERT(agino != 0); + INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + return 0; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_ino_t next_ino; + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agnumber_t agno; + xfs_daddr_t agdaddr; + xfs_agino_t agino; + xfs_agino_t next_agino; + xfs_buf_t *last_ibp; + xfs_dinode_t *last_dip; + short bucket_index; + int offset, last_offset; + int error; + int agi_ok; + + /* + * First pull the on-disk inode from the AGI unlinked list. + */ + mp = tp->t_mountp; + + agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, + XFS_FSS_TO_BB(mp, 1), 0, &agibp); + if (error) { + cmn_err(CE_WARN, + "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + return error; + } + /* + * Validate the magic number of the agi block. + */ + agi = XFS_BUF_TO_AGI(agibp); + agi_ok = + INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT)); + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, + XFS_RANDOM_IUNLINK_REMOVE))) { + XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, + mp, agi); + xfs_trans_brelse(tp, agibp); + cmn_err(CE_WARN, + "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.", + mp->m_fsname); + return XFS_ERROR(EFSCORRUPTED); + } + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO); + ASSERT(agi->agi_unlinked[bucket_index]); + + if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) { + /* + * We're at the head of the list. Get the inode's + * on-disk buffer to see if there is anyone after us + * on the list. Only modify our next pointer if it + * is not already NULLAGINO. This saves us the overhead + * of dealing with the buffer when there is no need to + * change it. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + if (error) { + cmn_err(CE_WARN, + "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + return error; + } + next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); + ASSERT(next_agino != 0); + if (next_agino != NULLAGINO) { + INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); + offset = ip->i_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the bucket head pointer at the next inode. + */ + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + } else { + /* + * We need to search the list for the inode being freed. + */ + next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT); + last_ibp = NULL; + while (next_agino != agino) { + /* + * If the last inode wasn't the one pointing to + * us, then release its buffer since we're not + * going to do anything with it. + */ + if (last_ibp != NULL) { + xfs_trans_brelse(tp, last_ibp); + } + next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + error = xfs_inotobp(mp, tp, next_ino, &last_dip, + &last_ibp, &last_offset); + if (error) { + cmn_err(CE_WARN, + "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + return error; + } + next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT); + ASSERT(next_agino != NULLAGINO); + ASSERT(next_agino != 0); + } + /* + * Now last_ibp points to the buffer previous to us on + * the unlinked list. Pull us from the list. + */ + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); + if (error) { + cmn_err(CE_WARN, + "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + return error; + } + next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + if (next_agino != NULLAGINO) { + INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); + offset = ip->i_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the previous inode on the list to the next inode. + */ + INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino); + ASSERT(next_agino != 0); + offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + xfs_trans_inode_buf(tp, last_ibp); + xfs_trans_log_buf(tp, last_ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, last_ibp); + } + return 0; +} + +static __inline__ int xfs_inode_clean(xfs_inode_t *ip) +{ + return (((ip->i_itemp == NULL) || + !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && + (ip->i_update_core == 0)); +} + +void +xfs_ifree_cluster( + xfs_inode_t *free_ip, + xfs_trans_t *tp, + xfs_ino_t inum) +{ + xfs_mount_t *mp = free_ip->i_mount; + int blks_per_cluster; + int nbufs; + int ninodes; + int i, j, found, pre_flushed; + xfs_daddr_t blkno; + xfs_buf_t *bp; + xfs_ihash_t *ih; + xfs_inode_t *ip, **ip_found; + xfs_inode_log_item_t *iip; + xfs_log_item_t *lip; + SPLDECL(s); + + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + ninodes = mp->m_sb.sb_inopblock; + nbufs = XFS_IALLOC_BLOCKS(mp); + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; + } + + ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS); + + for (j = 0; j < nbufs; j++, inum += ninodes) { + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), + XFS_INO_TO_AGBNO(mp, inum)); + + + /* + * Look for each inode in memory and attempt to lock it, + * we can be racing with flush and tail pushing here. + * any inode we get the locks on, add to an array of + * inode items to process later. + * + * The get the buffer lock, we could beat a flush + * or tail pushing thread to the lock here, in which + * case they will go looking for the inode buffer + * and fail, we need some other form of interlock + * here. + */ + found = 0; + for (i = 0; i < ninodes; i++) { + ih = XFS_IHASH(mp, inum + i); + read_lock(&ih->ih_lock); + for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { + if (ip->i_ino == inum + i) + break; + } + + /* Inode not in memory or we found it already, + * nothing to do + */ + if (!ip || (ip->i_flags & XFS_ISTALE)) { + read_unlock(&ih->ih_lock); + continue; + } + + if (xfs_inode_clean(ip)) { + read_unlock(&ih->ih_lock); + continue; + } + + /* If we can get the locks then add it to the + * list, otherwise by the time we get the bp lock + * below it will already be attached to the + * inode buffer. + */ + + /* This inode will already be locked - by us, lets + * keep it that way. + */ + + if (ip == free_ip) { + if (xfs_iflock_nowait(ip)) { + ip->i_flags |= XFS_ISTALE; + + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + } else { + ip_found[found++] = ip; + } + } + read_unlock(&ih->ih_lock); + continue; + } + + if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + if (xfs_iflock_nowait(ip)) { + ip->i_flags |= XFS_ISTALE; + + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + ip_found[found++] = ip; + } + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + } + + read_unlock(&ih->ih_lock); + } + + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XFS_BUF_LOCK); + + pre_flushed = 0; + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; + AIL_LOCK(mp,s); + iip->ili_flush_lsn = iip->ili_item.li_lsn; + AIL_UNLOCK(mp, s); + iip->ili_inode->i_flags |= XFS_ISTALE; + pre_flushed++; + } + lip = lip->li_bio_list; + } + + for (i = 0; i < found; i++) { + ip = ip_found[i]; + iip = ip->i_itemp; + + if (!iip) { + ip->i_update_core = 0; + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + AIL_LOCK(mp,s); + iip->ili_flush_lsn = iip->ili_item.li_lsn; + AIL_UNLOCK(mp, s); + + xfs_buf_attach_iodone(bp, + (void(*)(xfs_buf_t*,xfs_log_item_t*)) + xfs_istale_done, (xfs_log_item_t *)iip); + if (ip != free_ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + } + + if (found || pre_flushed) + xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_binval(tp, bp); + } + + kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); +} + +/* + * This is called to return an inode to the inode free list. + * The inode should already be truncated to 0 length and have + * no pages associated with it. This routine also assumes that + * the inode is already a part of the transaction. + * + * The on-disk copy of the inode will have been added to the list + * of unlinked inodes in the AGI. We need to remove the inode from + * that list atomically with respect to freeing it here. + */ +int +xfs_ifree( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_bmap_free_t *flist) +{ + int error; + int delete; + xfs_ino_t first_ino; + + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT((ip->i_d.di_size == 0) || + ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + ASSERT(ip->i_d.di_nblocks == 0); + + /* + * Pull the on-disk inode from the AGI unlinked list. + */ + error = xfs_iunlink_remove(tp, ip); + if (error != 0) { + return error; + } + + error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + if (error != 0) { + return error; + } + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + ip->i_d.di_gen++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (delete) { + xfs_ifree_cluster(ip, tp, first_ino); + } + + return 0; +} + +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we adding records one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + int cur_max; + xfs_ifork_t *ifp; + xfs_bmbt_block_t *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, + KM_SLEEP); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); + new_max = cur_max + rec_diff; + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + ifp->if_broot = (xfs_bmbt_block_t *) + kmem_realloc(ifp->if_broot, + new_size, + (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + KM_SLEEP); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(ifp->if_broot_bytes <= + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. + */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. + */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + } + kmem_free(ifp->if_broot, ifp->if_broot_bytes); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + ASSERT(ifp->if_broot_bytes <= + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + return; +} + + +/* + * This is called when the amount of space needed for if_extents + * is increased or decreased. The change in size is indicated by + * the number of extents that need to be added or deleted in the + * ext_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_extents area is changing + * ext_diff -- the change in the number of extents, positive or negative, + * requested for the if_extents array. + */ +void +xfs_iext_realloc( + xfs_inode_t *ip, + int ext_diff, + int whichfork) +{ + int byte_diff; + xfs_ifork_t *ifp; + int new_size; + uint rnew_size; + + if (ext_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); + } + ifp->if_u1.if_extents = NULL; + rnew_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) { + /* + * If the valid extents can fit in if_inline_ext, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) { + /* + * For now, empty files are format EXTENTS, + * so the if_extents pointer is null. + */ + if (ifp->if_u1.if_extents) { + memcpy(ifp->if_u2.if_inline_ext, + ifp->if_u1.if_extents, new_size); + kmem_free(ifp->if_u1.if_extents, + ifp->if_real_bytes); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + } + rnew_size = 0; + } else { + rnew_size = new_size; + if ((rnew_size & (rnew_size - 1)) != 0) + rnew_size = xfs_iroundup(rnew_size); + /* + * Stuck with malloc/realloc. + */ + if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) { + ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) + kmem_alloc(rnew_size, KM_SLEEP); + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + sizeof(ifp->if_u2.if_inline_ext)); + } else if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, + KM_NOFS); + } + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. + */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + + + + +/* + * Map inode to disk block and offset. + * + * mp -- the mount point structure for the current file system + * tp -- the current transaction + * ino -- the inode number of the inode to be located + * imap -- this structure is filled in with the information necessary + * to retrieve the given inode from disk + * flags -- flags to pass to xfs_dilocate indicating whether or not + * lookups in the inode btree were OK or not + */ +int +xfs_imap( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + xfs_imap_t *imap, + uint flags) +{ + xfs_fsblock_t fsbno; + int len; + int off; + int error; + + fsbno = imap->im_blkno ? + XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; + error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); + if (error != 0) { + return error; + } + imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); + imap->im_len = XFS_FSB_TO_BB(mp, len); + imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); + imap->im_ioffset = (ushort)off; + imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); + return 0; +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot, ifp->if_broot_bytes); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + (ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * This is called free all the memory associated with an inode. + * It must free the inode itself and any buffers allocated for + * if_extents/if_data and if_broot. It must also free the lock + * associated with the inode. + */ +void +xfs_idestroy( + xfs_inode_t *ip) +{ + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + mrfree(&ip->i_lock); + mrfree(&ip->i_iolock); + freesema(&ip->i_flock); +#ifdef XFS_BMAP_TRACE + ktrace_free(ip->i_xtrace); +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(ip->i_btrace); +#endif +#ifdef XFS_RW_TRACE + ktrace_free(ip->i_rwtrace); +#endif +#ifdef XFS_ILOCK_TRACE + ktrace_free(ip->i_lock_trace); +#endif +#ifdef XFS_DIR2_TRACE + ktrace_free(ip->i_dir_trace); +#endif + if (ip->i_itemp) { + /* XXXdpd should be able to assert this but shutdown + * is leaving the AIL behind. */ + ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + xfs_inode_item_destroy(ip); + } + kmem_zone_free(xfs_inode_zone, ip); +} + + +/* + * Increment the pin count of the given buffer. + * This value is protected by ipinlock spinlock in the mount structure. + */ +void +xfs_ipin( + xfs_inode_t *ip) +{ + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + + atomic_inc(&ip->i_pincount); +} + +/* + * Decrement the pin count of the given inode, and wake up + * anyone in xfs_iwait_unpin() if the count goes to 0. The + * inode must have been previoulsy pinned with a call to xfs_ipin(). + */ +void +xfs_iunpin( + xfs_inode_t *ip) +{ + ASSERT(atomic_read(&ip->i_pincount) > 0); + + if (atomic_dec_and_test(&ip->i_pincount)) { + vnode_t *vp = XFS_ITOV_NULL(ip); + + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = LINVFS_GET_IP(vp); + + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } + + wake_up(&ip->i_ipin_wait); + } +} + +/* + * This is called to wait for the given inode to be unpinned. + * It will sleep until this happens. The caller must have the + * inode locked in at least shared mode so that the buffer cannot + * be subsequently pinned once someone is waiting for it to be + * unpinned. + */ +void +xfs_iunpin_wait( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip; + xfs_lsn_t lsn; + + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); + + if (atomic_read(&ip->i_pincount) == 0) { + return; + } + + iip = ip->i_itemp; + if (iip && iip->ili_last_lsn) { + lsn = iip->ili_last_lsn; + } else { + lsn = (xfs_lsn_t)0; + } + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); + + wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +} + + +/* + * xfs_iextents_copy() + * + * This is called to copy the REAL extents (as opposed to the delayed + * allocation extents) from the inode into the given buffer. It + * returns the number of bytes copied into the buffer. + * + * If there are no delayed allocation extents, then we can just + * memcpy() the extents into the buffer. Otherwise, we need to + * examine each extent in turn and skip those which are delayed. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *buffer, + int whichfork) +{ + int copied; + xfs_bmbt_rec_t *dest_ep; + xfs_bmbt_rec_t *ep; +#ifdef XFS_BMAP_TRACE + static char fname[] = "xfs_iextents_copy"; +#endif + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + ep = ifp->if_u1.if_extents; + dest_ep = buffer; + copied = 0; + for (i = 0; i < nrecs; i++) { + start_block = xfs_bmbt_get_startblock(ep); + if (ISNULLSTARTBLOCK(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + ep++; + continue; + } + + /* Translate to on disk format */ + put_unaligned(INT_GET(ep->l0, ARCH_CONVERT), + (__uint64_t*)&dest_ep->l0); + put_unaligned(INT_GET(ep->l1, ARCH_CONVERT), + (__uint64_t*)&dest_ep->l1); + dest_ep++; + ep++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +/*ARGSUSED*/ +STATIC int +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork, + xfs_buf_t *bp) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; +#ifdef XFS_TRANS_DEBUG + int first; +#endif + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (iip == NULL) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (ifp == NULL) { + ASSERT(whichfork == XFS_ATTR_FORK); + return 0; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + if (whichfork == XFS_DATA_FORK) { + if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) { + XFS_ERROR_REPORT("xfs_iflush_fork", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_format.ilf_fields & extflag[whichfork])); + ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0)); + ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0)); + if ((iip->ili_format.ilf_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(ifp->if_broot_bytes <= + (XFS_IFORK_SIZE(ip, whichfork) + + XFS_BROOT_SIZE_ADJ)); + xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } + + return 0; +} + +/* + * xfs_iflush() will write a modified inode's changes out to the + * inode's on disk home. The caller must have the inode lock held + * in at least shared mode and the inode flush semaphore must be + * held as well. The inode lock will still be held upon return from + * the call and the caller is free to unlock it. + * The inode flush lock will be unlocked when the inode reaches the disk. + * The flags indicate how the inode's buffer should be written out. + */ +int +xfs_iflush( + xfs_inode_t *ip, + uint flags) +{ + xfs_inode_log_item_t *iip; + xfs_buf_t *bp; + xfs_dinode_t *dip; + xfs_mount_t *mp; + int error; + /* REFERENCED */ + xfs_chash_t *ch; + xfs_inode_t *iq; + int clcount; /* count of inodes clustered */ + int bufwasdelwri; + enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; + SPLDECL(s); + + XFS_STATS_INC(xs_iflush_count); + + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(valusema(&ip->i_flock) <= 0); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > ip->i_df.if_ext_max); + + iip = ip->i_itemp; + mp = ip->i_mount; + + /* + * If the inode isn't dirty, then just release the inode + * flush lock and do nothing. + */ + if ((ip->i_update_core == 0) && + ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + ASSERT((iip != NULL) ? + !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); + xfs_ifunlock(ip); + return 0; + } + + /* + * We can't flush the inode until it is unpinned, so + * wait for it. We know noone new can pin it, because + * we are holding the inode lock shared and you need + * to hold it exclusively to pin the inode. + */ + xfs_iunpin_wait(ip); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk! + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + ip->i_update_core = 0; + if (iip) + iip->ili_format.ilf_fields = 0; + xfs_ifunlock(ip); + return XFS_ERROR(EIO); + } + + /* + * Get the buffer containing the on-disk inode. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0); + if (error != 0) { + xfs_ifunlock(ip); + return error; + } + + /* + * Decide how buffer will be flushed out. This is done before + * the call to xfs_iflush_int because this field is zeroed by it. + */ + if (iip != NULL && iip->ili_format.ilf_fields != 0) { + /* + * Flush out the inode buffer according to the directions + * of the caller. In the cases where the caller has given + * us a choice choose the non-delwri case. This is because + * the inode is in the AIL and we need to get it out soon. + */ + switch (flags) { + case XFS_IFLUSH_SYNC: + case XFS_IFLUSH_DELWRI_ELSE_SYNC: + flags = 0; + break; + case XFS_IFLUSH_ASYNC: + case XFS_IFLUSH_DELWRI_ELSE_ASYNC: + flags = INT_ASYNC; + break; + case XFS_IFLUSH_DELWRI: + flags = INT_DELWRI; + break; + default: + ASSERT(0); + flags = 0; + break; + } + } else { + switch (flags) { + case XFS_IFLUSH_DELWRI_ELSE_SYNC: + case XFS_IFLUSH_DELWRI_ELSE_ASYNC: + case XFS_IFLUSH_DELWRI: + flags = INT_DELWRI; + break; + case XFS_IFLUSH_ASYNC: + flags = INT_ASYNC; + break; + case XFS_IFLUSH_SYNC: + flags = 0; + break; + default: + ASSERT(0); + flags = 0; + break; + } + } + + /* + * First flush out the inode that xfs_iflush was called with. + */ + error = xfs_iflush_int(ip, bp); + if (error) { + goto corrupt_out; + } + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + + ip->i_chash->chl_buf = bp; + + ch = XFS_CHASH(mp, ip->i_blkno); + s = mutex_spinlock(&ch->ch_lock); + + clcount = 0; + for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) { + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + iip = iq->i_itemp; + if ((iq->i_update_core == 0) && + ((iip == NULL) || + !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && + xfs_ipincount(iq) == 0) { + continue; + } + + /* + * Try to get locks. If any are unavailable, + * then this inode cannot be flushed and is skipped. + */ + + /* get inode locks (just i_lock) */ + if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { + /* get inode flush lock */ + if (xfs_iflock_nowait(iq)) { + /* check if pinned */ + if (xfs_ipincount(iq) == 0) { + /* arriving here means that + * this inode can be flushed. + * first re-check that it's + * dirty + */ + iip = iq->i_itemp; + if ((iq->i_update_core != 0)|| + ((iip != NULL) && + (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + clcount++; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, + XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + } else { + xfs_ifunlock(iq); + } + } else { + xfs_ifunlock(iq); + } + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + } + mutex_spinunlock(&ch->ch_lock, s); + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)){ + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + } + + if (flags & INT_DELWRI) { + xfs_bdwrite(mp, bp); + } else if (flags & INT_ASYNC) { + xfs_bawrite(mp, bp); + } else { + error = xfs_bwrite(mp, bp); + } + return error; + +corrupt_out: + xfs_buf_relse(bp); + xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); + xfs_iflush_abort(ip); + /* + * Unlocks the flush lock + */ + return XFS_ERROR(EFSCORRUPTED); + +cluster_corrupt_out: + /* Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + mutex_spinunlock(&ch->ch_lock, s); + + /* + * Clean up the buffer. If it was B_DELWRI, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { + xfs_buf_relse(bp); + } + + xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); + + if(!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (XFS_BUF_IODONE_FUNC(bp)) { + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_SHUT(bp); + XFS_BUF_ERROR(bp,EIO); + xfs_biodone(bp); + } else { + XFS_BUF_STALE(bp); + xfs_buf_relse(bp); + } + } + + xfs_iflush_abort(iq); + /* + * Unlocks the flush lock + */ + return XFS_ERROR(EFSCORRUPTED); +} + + +STATIC int +xfs_iflush_int( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_inode_log_item_t *iip; + xfs_dinode_t *dip; + xfs_mount_t *mp; +#ifdef XFS_TRANS_DEBUG + int first; +#endif + SPLDECL(s); + + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(valusema(&ip->i_flock) <= 0); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > ip->i_df.if_ext_max); + + iip = ip->i_itemp; + mp = ip->i_mount; + + + /* + * If the inode isn't dirty, then just release the inode + * flush lock and do nothing. + */ + if ((ip->i_update_core == 0) && + ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + xfs_ifunlock(ip); + return 0; + } + + /* set *dip = inode's place in the buffer */ + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); + + /* + * Clear i_update_core before copying out the data. + * This is for coordination with our timestamp updates + * that don't hold the inode lock. They will always + * update the timestamps BEFORE setting i_update_core, + * so if we clear i_update_core after they set it we + * are guaranteed to see their updates to the timestamps. + * I believe that this depends on strongly ordered memory + * semantics, but we have that. We use the SYNCHRONIZE + * macro to make sure that the compiler does not reorder + * the i_update_core access below the data copy below. + */ + ip->i_update_core = 0; + SYNCHRONIZE(); + + if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, + "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", + ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { + xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, + "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", + ip->i_ino, ip, ip->i_d.di_magic); + goto corrupt_out; + } + if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, + "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", + ip->i_ino, ip); + goto corrupt_out; + } + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, + "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", + ip->i_ino, ip); + goto corrupt_out; + } + } + if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, + XFS_RANDOM_IFLUSH_5)) { + xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, + "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", + ip->i_ino, + ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_d.di_nblocks, + ip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, + mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, + "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + ip->i_ino, ip->i_d.di_forkoff, ip); + goto corrupt_out; + } + /* + * bump the flush iteration count, used to detect flushes which + * postdate a log record during recovery. + */ + + ip->i_d.di_flushiter++; + + /* + * Copy the dirty parts of the inode into the on-disk + * inode. We always copy out the core of the inode, + * because if the inode is dirty at all the core must + * be. + */ + xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + + /* + * If this is really an old format inode and the superblock version + * has not been updated to support only new format inodes, then + * convert back to the old inode format. If the superblock version + * has been updated, then make the conversion permanent. + */ + ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || + XFS_SB_VERSION_HASNLINK(&mp->m_sb)); + if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { + if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink); + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = XFS_DINODE_VERSION_2; + INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2); + ip->i_d.di_onlink = 0; + dip->di_core.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + memset(&(dip->di_core.di_pad[0]), 0, + sizeof(dip->di_core.di_pad)); + ASSERT(ip->i_d.di_projid == 0); + } + } + + if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { + goto corrupt_out; + } + + if (XFS_IFORK_Q(ip)) { + /* + * The only error from xfs_iflush_fork is on the data fork. + */ + (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + } + xfs_inobp_check(mp, bp); + + /* + * We've recorded everything logged in the inode, so we'd + * like to clear the ilf_fields bits so we don't log and + * flush things unnecessarily. However, we can't stop + * logging all this information until the data we've copied + * into the disk buffer is written to disk. If we did we might + * overwrite the copy of the inode in the log with all the + * data after re-logging only part of it, and in the face of + * a crash we wouldn't have all the data we need to recover. + * + * What we do is move the bits to the ili_last_fields field. + * When logging the inode, these bits are moved back to the + * ilf_fields field. In the xfs_iflush_done() routine we + * clear ili_last_fields, since we know that the information + * those bits represent is permanently on disk. As long as + * the flush completes before the inode is logged again, then + * both ilf_fields and ili_last_fields will be cleared. + * + * We can play with the ilf_fields bits here, because the inode + * lock must be held exclusively in order to set bits there + * and the flush lock protects the ili_last_fields bits. + * Set ili_logged so the flush done + * routine can tell whether or not to look in the AIL. + * Also, store the current LSN of the inode so that we can tell + * whether the item has moved in the AIL from xfs_iflush_done(). + * In order to read the lsn we need the AIL lock, because + * it is a 64 bit value that cannot be read atomically. + */ + if (iip != NULL && iip->ili_format.ilf_fields != 0) { + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + AIL_LOCK(mp,s); + iip->ili_flush_lsn = iip->ili_item.li_lsn; + AIL_UNLOCK(mp, s); + + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) + xfs_iflush_done, (xfs_log_item_t *)iip); + + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); + } else { + /* + * We're flushing an inode which is not in the AIL and has + * not been logged but has i_update_core set. For this + * case we can use a B_DELWRI flush and immediately drop + * the inode flush lock because we can avoid the whole + * AIL state thing. It's OK to drop the flush lock now, + * because we've already locked the buffer and to do anything + * you really need both. + */ + if (iip != NULL) { + ASSERT(iip->ili_logged == 0); + ASSERT(iip->ili_last_fields == 0); + ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); + } + xfs_ifunlock(ip); + } + + return 0; + +corrupt_out: + return XFS_ERROR(EFSCORRUPTED); +} + + +/* + * Flush all inactive inodes in mp. Return true if no user references + * were found, false otherwise. + */ +int +xfs_iflush_all( + xfs_mount_t *mp, + int flag) +{ + int busy; + int done; + int purged; + xfs_inode_t *ip; + vmap_t vmap; + vnode_t *vp; + + busy = done = 0; + while (!done) { + purged = 0; + XFS_MOUNT_ILOCK(mp); + ip = mp->m_inodes; + if (ip == NULL) { + break; + } + do { + /* Make sure we skip markers inserted by sync */ + if (ip->i_mount == NULL) { + ip = ip->i_mnext; + continue; + } + + /* + * It's up to our caller to purge the root + * and quota vnodes later. + */ + vp = XFS_ITOV_NULL(ip); + + if (!vp) { + XFS_MOUNT_IUNLOCK(mp); + xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); + purged = 1; + break; + } + + if (vn_count(vp) != 0) { + if (vn_count(vp) == 1 && + (ip == mp->m_rootip || + (mp->m_quotainfo && + (ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino)))) { + + ip = ip->i_mnext; + continue; + } + if (!(flag & XFS_FLUSH_ALL)) { + busy = 1; + done = 1; + break; + } + /* + * Ignore busy inodes but continue flushing + * others. + */ + ip = ip->i_mnext; + continue; + } + /* + * Sample vp mapping while holding mp locked on MP + * systems, so we don't purge a reclaimed or + * nonexistent vnode. We break from the loop + * since we know that we modify + * it by pulling ourselves from it in xfs_reclaim() + * called via vn_purge() below. Set ip to the next + * entry in the list anyway so we'll know below + * whether we reached the end or not. + */ + VMAP(vp, vmap); + XFS_MOUNT_IUNLOCK(mp); + + vn_purge(vp, &vmap); + + purged = 1; + break; + } while (ip != mp->m_inodes); + /* + * We need to distinguish between when we exit the loop + * after a purge and when we simply hit the end of the + * list. We can't use the (ip == mp->m_inodes) test, + * because when we purge an inode at the start of the list + * the next inode on the list becomes mp->m_inodes. That + * would cause such a test to bail out early. The purged + * variable tells us how we got out of the loop. + */ + if (!purged) { + done = 1; + } + } + XFS_MOUNT_IUNLOCK(mp); + return !busy; +} + + +/* + * xfs_iaccess: check accessibility of inode for mode. + */ +int +xfs_iaccess( + xfs_inode_t *ip, + mode_t mode, + cred_t *cr) +{ + int error; + mode_t orgmode = mode; + struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); + + if (mode & S_IWUSR) { + umode_t imode = inode->i_mode; + + if (IS_RDONLY(inode) && + (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode))) + return XFS_ERROR(EROFS); + + if (IS_IMMUTABLE(inode)) + return XFS_ERROR(EACCES); + } + + /* + * If there's an Access Control List it's used instead of + * the mode bits. + */ + if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1) + return error ? XFS_ERROR(error) : 0; + + if (current_fsuid(cr) != ip->i_d.di_uid) { + mode >>= 3; + if (!in_group_p((gid_t)ip->i_d.di_gid)) + mode >>= 3; + } + + /* + * If the DACs are ok we don't need any capability check. + */ + if ((ip->i_d.di_mode & mode) == mode) + return 0; + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + if (!(orgmode & S_IXUSR) || + (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) + if (capable_cred(cr, CAP_DAC_OVERRIDE)) + return 0; + + if ((orgmode == S_IRUSR) || + (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) { + if (capable_cred(cr, CAP_DAC_READ_SEARCH)) + return 0; +#ifdef NOISE + cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode); +#endif /* NOISE */ + return XFS_ERROR(EACCES); + } + return XFS_ERROR(EACCES); +} + +/* + * xfs_iroundup: round up argument to next power of two + */ +uint +xfs_iroundup( + uint v) +{ + int i; + uint m; + + if ((v & (v - 1)) == 0) + return v; + ASSERT((v & 0x80000000) == 0); + if ((v & (v + 1)) == 0) + return v + 1; + for (i = 0, m = 1; i < 31; i++, m <<= 1) { + if (v & m) + continue; + v |= m; + if ((v & (v + 1)) == 0) + return v + 1; + } + ASSERT(0); + return( 0 ); +} + +/* + * Change the requested timestamp in the given inode. + * We don't lock across timestamp updates, and we don't log them but + * we do record the fact that there is dirty information in core. + * + * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG + * with XFS_ICHGTIME_ACC to be sure that access time + * update will take. Calling first with XFS_ICHGTIME_ACC + * and then XFS_ICHGTIME_MOD may fail to modify the access + * timestamp if the filesystem is mounted noacctm. + */ +void +xfs_ichgtime(xfs_inode_t *ip, + int flags) +{ + timespec_t tv; + vnode_t *vp = XFS_ITOV(ip); + struct inode *inode = LINVFS_GET_IP(vp); + + /* + * We're not supposed to change timestamps in readonly-mounted + * filesystems. Throw it away if anyone asks us. + */ + if (unlikely(vp->v_vfsp->vfs_flag & VFS_RDONLY)) + return; + + /* + * Don't update access timestamps on reads if mounted "noatime" + * Throw it away if anyone asks us. + */ + if ((ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) && + ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) + == XFS_ICHGTIME_ACC)) + return; + + nanotime(&tv); + if (flags & XFS_ICHGTIME_MOD) { + VN_MTIMESET(vp, &tv); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + } + if (flags & XFS_ICHGTIME_ACC) { + VN_ATIMESET(vp, &tv); + ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; + } + if (flags & XFS_ICHGTIME_CHG) { + VN_CTIMESET(vp, &tv); + ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; + } + + /* + * We update the i_update_core field _after_ changing + * the timestamps in order to coordinate properly with + * xfs_iflush() so that we don't lose timestamp updates. + * This keeps us from having to hold the inode lock + * while doing this. We use the SYNCHRONIZE macro to + * ensure that the compiler does not reorder the update + * of i_update_core above the timestamp updates above. + */ + SYNCHRONIZE(); + ip->i_update_core = 1; + if (!(inode->i_state & I_LOCK)) + mark_inode_dirty_sync(inode); +} + +#ifdef XFS_ILOCK_TRACE +ktrace_t *xfs_ilock_trace_buf; + +void +xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) +{ + ktrace_enter(ip->i_lock_trace, + (void *)ip, + (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */ + (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */ + (void *)ra, /* caller of ilock */ + (void *)(unsigned long)current_cpu(), + (void *)(unsigned long)current_pid(), + NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); +} +#endif diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h new file mode 100644 index 00000000000..a53b1ccf607 --- /dev/null +++ b/fs/xfs/xfs_inode.h @@ -0,0 +1,554 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_INODE_H__ +#define __XFS_INODE_H__ + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + xfs_bmbt_block_t *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + unsigned char if_ext_max; /* max # of extent records */ + xfs_extnum_t if_lastex; /* last if_extents used */ + union { + xfs_bmbt_rec_t *if_extents; /* linear map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Flags for xfs_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ +#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */ + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x0001 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x0002 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x0004 /* i_broot points to the bmap b-tree root */ + +/* + * Flags for xfs_imap() and xfs_dilocate(). + */ +#define XFS_IMAP_LOOKUP 0x1 + +/* + * Maximum number of extent pointers in if_u1.if_extents. + */ +#define XFS_MAX_INCORE_EXTENTS 32768 + + +#ifdef __KERNEL__ +struct bhv_desc; +struct cred; +struct ktrace; +struct vnode; +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_bmbt_irec; +struct xfs_bmbt_block; +struct xfs_inode; +struct xfs_inode_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot; + +#if defined(XFS_ILOCK_TRACE) +#define XFS_ILOCK_KTRACE_SIZE 32 +extern ktrace_t *xfs_ilock_trace_buf; +extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *); +#else +#define xfs_ilock_trace(i,n,f,ra) +#endif + +/* + * This structure is used to communicate which extents of a file + * were holes when a write started from xfs_write_file() to + * xfs_strat_read(). This is necessary so that we can know which + * blocks need to be zeroed when they are read in in xfs_strat_read() + * if they weren\'t allocated when the buffer given to xfs_strat_read() + * was mapped. + * + * We keep a list of these attached to the inode. The list is + * protected by the inode lock and the fact that the io lock is + * held exclusively by writers. + */ +typedef struct xfs_gap { + struct xfs_gap *xg_next; + xfs_fileoff_t xg_offset_fsb; + xfs_extlen_t xg_count_fsb; +} xfs_gap_t; + +typedef struct dm_attrs_s { + __uint32_t da_dmevmask; /* DMIG event mask */ + __uint16_t da_dmstate; /* DMIG state info */ + __uint16_t da_pad; /* DMIG extra padding */ +} dm_attrs_t; + +typedef struct xfs_iocore { + void *io_obj; /* pointer to container + * inode or dcxvn structure */ + struct xfs_mount *io_mount; /* fs mount struct ptr */ +#ifdef DEBUG + mrlock_t *io_lock; /* inode IO lock */ + mrlock_t *io_iolock; /* inode IO lock */ +#endif + + /* I/O state */ + xfs_fsize_t io_new_size; /* sz when write completes */ + + /* Miscellaneous state. */ + unsigned int io_flags; /* IO related flags */ + + /* DMAPI state */ + dm_attrs_t io_dmattrs; + +} xfs_iocore_t; + +#define io_dmevmask io_dmattrs.da_dmevmask +#define io_dmstate io_dmattrs.da_dmstate + +#define XFS_IO_INODE(io) ((xfs_inode_t *) ((io)->io_obj)) +#define XFS_IO_DCXVN(io) ((dcxvn_t *) ((io)->io_obj)) + +/* + * Flags in the flags field + */ + +#define XFS_IOCORE_RT 0x1 + +/* + * xfs_iocore prototypes + */ + +extern void xfs_iocore_inode_init(struct xfs_inode *); +extern void xfs_iocore_inode_reinit(struct xfs_inode *); + + +/* + * This is the type used in the xfs inode hash table. + * An array of these is allocated for each mounted + * file system to hash the inodes for that file system. + */ +typedef struct xfs_ihash { + struct xfs_inode *ih_next; + rwlock_t ih_lock; + uint ih_version; +} xfs_ihash_t; + +#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize)) + +/* + * This is the xfs inode cluster hash. This hash is used by xfs_iflush to + * find inodes that share a cluster and can be flushed to disk at the same + * time. + */ +typedef struct xfs_chashlist { + struct xfs_chashlist *chl_next; + struct xfs_inode *chl_ip; + xfs_daddr_t chl_blkno; /* starting block number of + * the cluster */ + struct xfs_buf *chl_buf; /* the inode buffer */ +} xfs_chashlist_t; + +typedef struct xfs_chash { + xfs_chashlist_t *ch_list; + lock_t ch_lock; +} xfs_chash_t; + +#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize)) + + +/* + * This is the xfs in-core inode structure. + * Most of the on-disk inode is embedded in the i_d field. + * + * The extent pointers/inline file space, however, are managed + * separately. The memory for this information is pointed to by + * the if_u1 unions depending on the type of the data. + * This is used to linearize the array of extents for fast in-core + * access. This is used until the file's number of extents + * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers + * are accessed through the buffer cache. + * + * Other state kept in the in-core inode is used for identification, + * locking, transactional updating, etc of the inode. + * + * Generally, we do not want to hold the i_rlock while holding the + * i_ilock. Hierarchy is i_iolock followed by i_rlock. + * + * xfs_iptr_t contains all the inode fields upto and including the + * i_mnext and i_mprev fields, it is used as a marker in the inode + * chain off the mount structure by xfs_sync calls. + */ + +typedef struct { + struct xfs_ihash *ip_hash; /* pointer to hash header */ + struct xfs_inode *ip_next; /* inode hash link forw */ + struct xfs_inode *ip_mnext; /* next inode in mount list */ + struct xfs_inode *ip_mprev; /* ptr to prev inode */ + struct xfs_inode **ip_prevp; /* ptr to prev i_next */ + struct xfs_mount *ip_mount; /* fs mount struct ptr */ +} xfs_iptr_t; + +typedef struct xfs_inode { + /* Inode linking and identification information. */ + struct xfs_ihash *i_hash; /* pointer to hash header */ + struct xfs_inode *i_next; /* inode hash link forw */ + struct xfs_inode *i_mnext; /* next inode in mount list */ + struct xfs_inode *i_mprev; /* ptr to prev inode */ + struct xfs_inode **i_prevp; /* ptr to prev i_next */ + struct xfs_mount *i_mount; /* fs mount struct ptr */ + struct list_head i_reclaim; /* reclaim list */ + struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/ + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + + /* Inode location stuff */ + xfs_ino_t i_ino; /* inode number (agno/agino)*/ + xfs_daddr_t i_blkno; /* blkno of inode buffer */ + ushort i_len; /* len of inode buffer */ + ushort i_boffset; /* off of inode in buffer */ + + /* Extent information. */ + xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t i_df; /* data fork */ + + /* Transaction and locking information. */ + struct xfs_trans *i_transp; /* ptr to owning transaction*/ + struct xfs_inode_log_item *i_itemp; /* logging information */ + mrlock_t i_lock; /* inode lock */ + mrlock_t i_iolock; /* inode IO lock */ + sema_t i_flock; /* inode flush lock */ + atomic_t i_pincount; /* inode pin count */ + wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ +#ifdef HAVE_REFCACHE + struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ + struct xfs_inode *i_release; /* inode to unref */ +#endif + /* I/O state */ + xfs_iocore_t i_iocore; /* I/O core */ + + /* Miscellaneous state. */ + unsigned short i_flags; /* see defined flags below */ + unsigned char i_update_core; /* timestamps/size is dirty */ + unsigned char i_update_size; /* di_size field is dirty */ + unsigned int i_gen; /* generation count */ + unsigned int i_delayed_blks; /* count of delay alloc blks */ + + xfs_dinode_core_t i_d; /* most of ondisk inode */ + xfs_chashlist_t *i_chash; /* cluster hash list header */ + struct xfs_inode *i_cnext; /* cluster hash link forward */ + struct xfs_inode *i_cprev; /* cluster hash link backward */ + + /* Trace buffers per inode. */ +#ifdef XFS_BMAP_TRACE + struct ktrace *i_xtrace; /* inode extent list trace */ +#endif +#ifdef XFS_BMBT_TRACE + struct ktrace *i_btrace; /* inode bmap btree trace */ +#endif +#ifdef XFS_RW_TRACE + struct ktrace *i_rwtrace; /* inode read/write trace */ +#endif +#ifdef XFS_ILOCK_TRACE + struct ktrace *i_lock_trace; /* inode lock/unlock trace */ +#endif +#ifdef XFS_DIR2_TRACE + struct ktrace *i_dir_trace; /* inode directory trace */ +#endif +} xfs_inode_t; + +#endif /* __KERNEL__ */ + + +/* + * Fork handling. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_PTR) +xfs_ifork_t *xfs_ifork_ptr(xfs_inode_t *ip, int w); +#define XFS_IFORK_PTR(ip,w) xfs_ifork_ptr(ip,w) +#else +#define XFS_IFORK_PTR(ip,w) ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_Q) +int xfs_ifork_q(xfs_inode_t *ip); +#define XFS_IFORK_Q(ip) xfs_ifork_q(ip) +#else +#define XFS_IFORK_Q(ip) XFS_CFORK_Q(&(ip)->i_d) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_DSIZE) +int xfs_ifork_dsize(xfs_inode_t *ip); +#define XFS_IFORK_DSIZE(ip) xfs_ifork_dsize(ip) +#else +#define XFS_IFORK_DSIZE(ip) XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_ASIZE) +int xfs_ifork_asize(xfs_inode_t *ip); +#define XFS_IFORK_ASIZE(ip) xfs_ifork_asize(ip) +#else +#define XFS_IFORK_ASIZE(ip) XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_SIZE) +int xfs_ifork_size(xfs_inode_t *ip, int w); +#define XFS_IFORK_SIZE(ip,w) xfs_ifork_size(ip,w) +#else +#define XFS_IFORK_SIZE(ip,w) XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FORMAT) +int xfs_ifork_format(xfs_inode_t *ip, int w); +#define XFS_IFORK_FORMAT(ip,w) xfs_ifork_format(ip,w) +#else +#define XFS_IFORK_FORMAT(ip,w) XFS_CFORK_FORMAT(&ip->i_d, w) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FMT_SET) +void xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n); +#define XFS_IFORK_FMT_SET(ip,w,n) xfs_ifork_fmt_set(ip,w,n) +#else +#define XFS_IFORK_FMT_SET(ip,w,n) XFS_CFORK_FMT_SET(&ip->i_d, w, n) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXTENTS) +int xfs_ifork_nextents(xfs_inode_t *ip, int w); +#define XFS_IFORK_NEXTENTS(ip,w) xfs_ifork_nextents(ip,w) +#else +#define XFS_IFORK_NEXTENTS(ip,w) XFS_CFORK_NEXTENTS(&ip->i_d, w) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXT_SET) +void xfs_ifork_next_set(xfs_inode_t *ip, int w, int n); +#define XFS_IFORK_NEXT_SET(ip,w,n) xfs_ifork_next_set(ip,w,n) +#else +#define XFS_IFORK_NEXT_SET(ip,w,n) XFS_CFORK_NEXT_SET(&ip->i_d, w, n) +#endif + + +#ifdef __KERNEL__ + +/* + * In-core inode flags. + */ +#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ +#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ +#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ +#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ +#define XFS_ISTALE 0x0010 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ +#define XFS_INEW 0x0040 + +/* + * Flags for inode locking. + */ +#define XFS_IOLOCK_EXCL 0x001 +#define XFS_IOLOCK_SHARED 0x002 +#define XFS_ILOCK_EXCL 0x004 +#define XFS_ILOCK_SHARED 0x008 +#define XFS_IUNLOCK_NONOTIFY 0x010 +#define XFS_EXTENT_TOKEN_RD 0x040 +#define XFS_SIZE_TOKEN_RD 0x080 +#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD) +#define XFS_WILLLEND 0x100 /* Always acquire tokens for lending */ +#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND) +#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) +#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) + + +#define XFS_LOCK_MASK \ + (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \ + XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \ + XFS_WILLLEND) + +/* + * Flags for xfs_iflush() + */ +#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1 +#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2 +#define XFS_IFLUSH_SYNC 3 +#define XFS_IFLUSH_ASYNC 4 +#define XFS_IFLUSH_DELWRI 5 + +/* + * Flags for xfs_iflush_all. + */ +#define XFS_FLUSH_ALL 0x1 + +/* + * Flags for xfs_itruncate_start(). + */ +#define XFS_ITRUNC_DEFINITE 0x1 +#define XFS_ITRUNC_MAYBE 0x2 + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOV) +struct vnode *xfs_itov(xfs_inode_t *ip); +#define XFS_ITOV(ip) xfs_itov(ip) +#else +#define XFS_ITOV(ip) BHV_TO_VNODE(XFS_ITOBHV(ip)) +#endif +#define XFS_ITOV_NULL(ip) BHV_TO_VNODE_NULL(XFS_ITOBHV(ip)) +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOBHV) +struct bhv_desc *xfs_itobhv(xfs_inode_t *ip); +#define XFS_ITOBHV(ip) xfs_itobhv(ip) +#else +#define XFS_ITOBHV(ip) ((struct bhv_desc *)(&((ip)->i_bhv_desc))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOI) +xfs_inode_t *xfs_bhvtoi(struct bhv_desc *bhvp); +#define XFS_BHVTOI(bhvp) xfs_bhvtoi(bhvp) +#else +#define XFS_BHVTOI(bhvp) \ + ((xfs_inode_t *)((char *)(bhvp) - \ + (char *)&(((xfs_inode_t *)0)->i_bhv_desc))) +#endif + +#define BHV_IS_XFS(bdp) (BHV_OPS(bdp) == &xfs_vnodeops) + +/* + * For multiple groups support: if S_ISGID bit is set in the parent + * directory, group of new file is set to that of the parent, and + * new subdirectory gets S_ISGID bit from parent. + */ +#define XFS_INHERIT_GID(pip, vfsp) \ + (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID)) + +/* + * xfs_iget.c prototypes. + */ + +#define IGET_CREATE 1 + +void xfs_ihash_init(struct xfs_mount *); +void xfs_ihash_free(struct xfs_mount *); +void xfs_chash_init(struct xfs_mount *); +void xfs_chash_free(struct xfs_mount *); +xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, + struct xfs_trans *); +void xfs_inode_lock_init(xfs_inode_t *, struct vnode *); +int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, + uint, uint, xfs_inode_t **, xfs_daddr_t); +void xfs_iput(xfs_inode_t *, uint); +void xfs_iput_new(xfs_inode_t *, uint); +void xfs_ilock(xfs_inode_t *, uint); +int xfs_ilock_nowait(xfs_inode_t *, uint); +void xfs_iunlock(xfs_inode_t *, uint); +void xfs_ilock_demote(xfs_inode_t *, uint); +void xfs_iflock(xfs_inode_t *); +int xfs_iflock_nowait(xfs_inode_t *); +uint xfs_ilock_map_shared(xfs_inode_t *); +void xfs_iunlock_map_shared(xfs_inode_t *, uint); +void xfs_ifunlock(xfs_inode_t *); +void xfs_ireclaim(xfs_inode_t *); +int xfs_finish_reclaim(xfs_inode_t *, int, int); +int xfs_finish_reclaim_all(struct xfs_mount *, int); + +/* + * xfs_inode.c prototypes. + */ +int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, + xfs_dinode_t **, struct xfs_buf **, int *); +int xfs_itobp(struct xfs_mount *, struct xfs_trans *, + xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **, + xfs_daddr_t); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, + xfs_inode_t **, xfs_daddr_t); +int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, nlink_t, + xfs_dev_t, struct cred *, xfs_prid_t, int, + struct xfs_buf **, boolean_t *, xfs_inode_t **); +void xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *, + int); +uint xfs_ip2xflags(struct xfs_inode *); +uint xfs_dic2xflags(struct xfs_dinode_core *); +int xfs_ifree(struct xfs_trans *, xfs_inode_t *, + struct xfs_bmap_free *); +void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); +int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, + xfs_fsize_t, int, int); +int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); +int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *); +void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *, + xfs_fsize_t, int); + +void xfs_idestroy_fork(xfs_inode_t *, int); +void xfs_idestroy(xfs_inode_t *); +void xfs_idata_realloc(xfs_inode_t *, int, int); +void xfs_iextract(xfs_inode_t *); +void xfs_iext_realloc(xfs_inode_t *, int, int); +void xfs_iroot_realloc(xfs_inode_t *, int, int); +void xfs_ipin(xfs_inode_t *); +void xfs_iunpin(xfs_inode_t *); +int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int); +int xfs_iflush(xfs_inode_t *, uint); +int xfs_iflush_all(struct xfs_mount *, int); +int xfs_iaccess(xfs_inode_t *, mode_t, cred_t *); +uint xfs_iroundup(uint); +void xfs_ichgtime(xfs_inode_t *, int); +xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); +void xfs_lock_inodes(xfs_inode_t **, int, int, uint); + +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) + +#ifdef DEBUG +void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); +#else /* DEBUG */ +#define xfs_isize_check(mp, ip, isize) +#endif /* DEBUG */ + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +extern struct kmem_zone *xfs_chashlist_zone; +extern struct kmem_zone *xfs_ifork_zone; +extern struct kmem_zone *xfs_inode_zone; +extern struct kmem_zone *xfs_ili_zone; +extern struct vnodeops xfs_vnodeops; + +#endif /* __KERNEL__ */ + +#endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c new file mode 100644 index 00000000000..768cb1816b8 --- /dev/null +++ b/fs/xfs/xfs_inode_item.c @@ -0,0 +1,1092 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains the implementation of the xfs_inode_log_item. + * It contains the item operations used to manipulate the inode log + * items as well as utility routines used by the inode specific + * transaction routines. + */ +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_ag.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_rw.h" + + +kmem_zone_t *xfs_ili_zone; /* inode log item zone */ + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. + */ +STATIC uint +xfs_inode_item_size( + xfs_inode_log_item_t *iip) +{ + uint nvecs; + xfs_inode_t *ip; + + ip = iip->ili_inode; + nvecs = 2; + + /* + * Only log the data/extents/b-tree root if there is something + * left to log. + */ + iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && + (ip->i_d.di_nextents > 0) && + (ip->i_df.if_bytes > 0)) { + ASSERT(ip->i_df.if_u1.if_extents != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && + (ip->i_df.if_broot_bytes > 0)) { + ASSERT(ip->i_df.if_broot != NULL); + nvecs++; + } else { + ASSERT(!(iip->ili_format.ilf_fields & + XFS_ILOG_DBROOT)); +#ifdef XFS_TRANS_DEBUG + if (iip->ili_root_size > 0) { + ASSERT(iip->ili_root_size == + ip->i_df.if_broot_bytes); + ASSERT(memcmp(iip->ili_orig_root, + ip->i_df.if_broot, + iip->ili_root_size) == 0); + } else { + ASSERT(ip->i_df.if_broot_bytes == 0); + } +#endif + iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; + } + break; + + case XFS_DINODE_FMT_LOCAL: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && + (ip->i_df.if_bytes > 0)) { + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; + } + break; + + case XFS_DINODE_FMT_DEV: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + break; + + case XFS_DINODE_FMT_UUID: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + break; + + default: + ASSERT(0); + break; + } + + /* + * If there are no attributes associated with this file, + * then there cannot be anything more to log. + * Clear all attribute-related log flags. + */ + if (!XFS_IFORK_Q(ip)) { + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + return nvecs; + } + + /* + * Log any necessary attribute data. + */ + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && + (ip->i_d.di_anextents > 0) && + (ip->i_afp->if_bytes > 0)) { + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; + } + break; + + case XFS_DINODE_FMT_BTREE: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && + (ip->i_afp->if_broot_bytes > 0)) { + ASSERT(ip->i_afp->if_broot != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; + } + break; + + case XFS_DINODE_FMT_LOCAL: + iip->ili_format.ilf_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && + (ip->i_afp->if_bytes > 0)) { + ASSERT(ip->i_afp->if_u1.if_data != NULL); + nvecs++; + } else { + iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; + } + break; + + default: + ASSERT(0); + break; + } + + return nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode log item. It fills the first item with an inode + * log format structure, the second with the on-disk inode structure, + * and a possible third and/or fourth with the inode data/extents/b-tree + * root and inode attributes data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + xfs_inode_log_item_t *iip, + xfs_log_iovec_t *log_vector) +{ + uint nvecs; + xfs_log_iovec_t *vecp; + xfs_inode_t *ip; + size_t data_bytes; + xfs_bmbt_rec_t *ext_buffer; + int nrecs; + xfs_mount_t *mp; + + ip = iip->ili_inode; + vecp = log_vector; + + vecp->i_addr = (xfs_caddr_t)&iip->ili_format; + vecp->i_len = sizeof(xfs_inode_log_format_t); + vecp++; + nvecs = 1; + + /* + * Clear i_update_core if the timestamps (or any other + * non-transactional modification) need flushing/logging + * and we're about to log them with the rest of the core. + * + * This is the same logic as xfs_iflush() but this code can't + * run at the same time as xfs_iflush because we're in commit + * processing here and so we have the inode lock held in + * exclusive mode. Although it doesn't really matter + * for the timestamps if both routines were to grab the + * timestamps or not. That would be ok. + * + * We clear i_update_core before copying out the data. + * This is for coordination with our timestamp updates + * that don't hold the inode lock. They will always + * update the timestamps BEFORE setting i_update_core, + * so if we clear i_update_core after they set it we + * are guaranteed to see their updates to the timestamps + * either here. Likewise, if they set it after we clear it + * here, we'll see it either on the next commit of this + * inode or the next time the inode gets flushed via + * xfs_iflush(). This depends on strongly ordered memory + * semantics, but we have that. We use the SYNCHRONIZE + * macro to make sure that the compiler does not reorder + * the i_update_core access below the data copy below. + */ + if (ip->i_update_core) { + ip->i_update_core = 0; + SYNCHRONIZE(); + } + + /* + * We don't have to worry about re-ordering here because + * the update_size field is protected by the inode lock + * and we have that held in exclusive mode. + */ + if (ip->i_update_size) + ip->i_update_size = 0; + + vecp->i_addr = (xfs_caddr_t)&ip->i_d; + vecp->i_len = sizeof(xfs_dinode_core_t); + vecp++; + nvecs++; + iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + + /* + * If this is really an old format inode, then we need to + * log it as such. This means that we have to copy the link + * count from the new field to the old. We don't have to worry + * about the new fields, because nothing trusts them as long as + * the old inode version number is there. If the superblock already + * has a new version number, then we don't bother converting back. + */ + mp = ip->i_mount; + ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || + XFS_SB_VERSION_HASNLINK(&mp->m_sb)); + if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { + if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + ip->i_d.di_onlink = ip->i_d.di_nlink; + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = XFS_DINODE_VERSION_2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + } + } + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { + ASSERT(ip->i_df.if_bytes > 0); + ASSERT(ip->i_df.if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_nextents > 0); + ASSERT(iip->ili_extents_buf == NULL); + nrecs = ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nrecs > 0); +#if __BYTE_ORDER == __BIG_ENDIAN + if (nrecs == ip->i_d.di_nextents) { + /* + * There are no delayed allocation + * extents, so just point to the + * real extents array. + */ + vecp->i_addr = + (char *)(ip->i_df.if_u1.if_extents); + vecp->i_len = ip->i_df.if_bytes; + } else +#endif + { + /* + * There are delayed allocation extents + * in the inode, or we need to convert + * the extents to on disk format. + * Use xfs_iextents_copy() + * to copy only the real extents into + * a separate buffer. We'll free the + * buffer in the unlock routine. + */ + ext_buffer = kmem_alloc(ip->i_df.if_bytes, + KM_SLEEP); + iip->ili_extents_buf = ext_buffer; + vecp->i_addr = (xfs_caddr_t)ext_buffer; + vecp->i_len = xfs_iextents_copy(ip, ext_buffer, + XFS_DATA_FORK); + } + ASSERT(vecp->i_len <= ip->i_df.if_bytes); + iip->ili_format.ilf_dsize = vecp->i_len; + vecp++; + nvecs++; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { + ASSERT(ip->i_df.if_broot_bytes > 0); + ASSERT(ip->i_df.if_broot != NULL); + vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; + vecp->i_len = ip->i_df.if_broot_bytes; + vecp++; + nvecs++; + iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + } + break; + + case XFS_DINODE_FMT_LOCAL: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { + ASSERT(ip->i_df.if_bytes > 0); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + + vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_df.if_bytes, 4); + ASSERT((ip->i_df.if_real_bytes == 0) || + (ip->i_df.if_real_bytes == data_bytes)); + vecp->i_len = (int)data_bytes; + vecp++; + nvecs++; + iip->ili_format.ilf_dsize = (unsigned)data_bytes; + } + break; + + case XFS_DINODE_FMT_DEV: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DDATA | XFS_ILOG_UUID))); + if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { + iip->ili_format.ilf_u.ilfu_rdev = + ip->i_df.if_u2.if_rdev; + } + break; + + case XFS_DINODE_FMT_UUID: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | + XFS_ILOG_DDATA | XFS_ILOG_DEV))); + if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { + iip->ili_format.ilf_u.ilfu_uuid = + ip->i_df.if_u2.if_uuid; + } + break; + + default: + ASSERT(0); + break; + } + + /* + * If there are no attributes associated with the file, + * then we're done. + * Assert that no attribute-related log flags are set. + */ + if (!XFS_IFORK_Q(ip)) { + ASSERT(nvecs == iip->ili_item.li_desc->lid_size); + iip->ili_format.ilf_size = nvecs; + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); + return; + } + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_anextents > 0); +#ifdef DEBUG + nrecs = ip->i_afp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t); +#endif + ASSERT(nrecs > 0); + ASSERT(nrecs == ip->i_d.di_anextents); +#if __BYTE_ORDER == __BIG_ENDIAN + /* + * There are not delayed allocation extents + * for attributes, so just point at the array. + */ + vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); + vecp->i_len = ip->i_afp->if_bytes; +#else + ASSERT(iip->ili_aextents_buf == NULL); + /* + * Need to endian flip before logging + */ + ext_buffer = kmem_alloc(ip->i_afp->if_bytes, + KM_SLEEP); + iip->ili_aextents_buf = ext_buffer; + vecp->i_addr = (xfs_caddr_t)ext_buffer; + vecp->i_len = xfs_iextents_copy(ip, ext_buffer, + XFS_ATTR_FORK); +#endif + iip->ili_format.ilf_asize = vecp->i_len; + vecp++; + nvecs++; + } + break; + + case XFS_DINODE_FMT_BTREE: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { + ASSERT(ip->i_afp->if_broot_bytes > 0); + ASSERT(ip->i_afp->if_broot != NULL); + vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; + vecp->i_len = ip->i_afp->if_broot_bytes; + vecp++; + nvecs++; + iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + } + break; + + case XFS_DINODE_FMT_LOCAL: + ASSERT(!(iip->ili_format.ilf_fields & + (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); + if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + + vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_afp->if_bytes, 4); + ASSERT((ip->i_afp->if_real_bytes == 0) || + (ip->i_afp->if_real_bytes == data_bytes)); + vecp->i_len = (int)data_bytes; + vecp++; + nvecs++; + iip->ili_format.ilf_asize = (unsigned)data_bytes; + } + break; + + default: + ASSERT(0); + break; + } + + ASSERT(nvecs == iip->ili_item.li_desc->lid_size); + iip->ili_format.ilf_size = nvecs; +} + + +/* + * This is called to pin the inode associated with the inode log + * item in memory so it cannot be written out. Do this by calling + * xfs_ipin() to bump the pin count in the inode while holding the + * inode pin lock. + */ +STATIC void +xfs_inode_item_pin( + xfs_inode_log_item_t *iip) +{ + ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + xfs_ipin(iip->ili_inode); +} + + +/* + * This is called to unpin the inode associated with the inode log + * item which was previously pinned with a call to xfs_inode_item_pin(). + * Just call xfs_iunpin() on the inode to do this. + */ +/* ARGSUSED */ +STATIC void +xfs_inode_item_unpin( + xfs_inode_log_item_t *iip, + int stale) +{ + xfs_iunpin(iip->ili_inode); +} + +/* ARGSUSED */ +STATIC void +xfs_inode_item_unpin_remove( + xfs_inode_log_item_t *iip, + xfs_trans_t *tp) +{ + xfs_iunpin(iip->ili_inode); +} + +/* + * This is called to attempt to lock the inode associated with this + * inode log item, in preparation for the push routine which does the actual + * iflush. Don't sleep on the inode lock or the flush lock. + * + * If the flush lock is already held, indicating that the inode has + * been or is in the process of being flushed, then (ideally) we'd like to + * see if the inode's buffer is still incore, and if so give it a nudge. + * We delay doing so until the pushbuf routine, though, to avoid holding + * the AIL lock across a call to the blackhole which is the buffercache. + * Also we don't want to sleep in any device strategy routines, which can happen + * if we do the subsequent bawrite in here. + */ +STATIC uint +xfs_inode_item_trylock( + xfs_inode_log_item_t *iip) +{ + register xfs_inode_t *ip; + + ip = iip->ili_inode; + + if (xfs_ipincount(ip) > 0) { + return XFS_ITEM_PINNED; + } + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + return XFS_ITEM_LOCKED; + } + + if (!xfs_iflock_nowait(ip)) { + /* + * If someone else isn't already trying to push the inode + * buffer, we get to do it. + */ + if (iip->ili_pushbuf_flag == 0) { + iip->ili_pushbuf_flag = 1; +#ifdef DEBUG + iip->ili_push_owner = get_thread_id(); +#endif + /* + * Inode is left locked in shared mode. + * Pushbuf routine gets to unlock it. + */ + return XFS_ITEM_PUSHBUF; + } else { + /* + * We hold the AIL_LOCK, so we must specify the + * NONOTIFY flag so that we won't double trip. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); + return XFS_ITEM_FLUSHING; + } + /* NOTREACHED */ + } + + /* Stale items should force out the iclog */ + if (ip->i_flags & XFS_ISTALE) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); + return XFS_ITEM_PINNED; + } + +#ifdef DEBUG + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ASSERT(iip->ili_format.ilf_fields != 0); + ASSERT(iip->ili_logged == 0); + ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); + } +#endif + return XFS_ITEM_SUCCESS; +} + +/* + * Unlock the inode associated with the inode log item. + * Clear the fields of the inode and inode log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the inode. + */ +STATIC void +xfs_inode_item_unlock( + xfs_inode_log_item_t *iip) +{ + uint hold; + uint iolocked; + uint lock_flags; + xfs_inode_t *ip; + + ASSERT(iip != NULL); + ASSERT(iip->ili_inode->i_itemp != NULL); + ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT((!(iip->ili_inode->i_itemp->ili_flags & + XFS_ILI_IOLOCKED_EXCL)) || + ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE)); + ASSERT((!(iip->ili_inode->i_itemp->ili_flags & + XFS_ILI_IOLOCKED_SHARED)) || + ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS)); + /* + * Clear the transaction pointer in the inode. + */ + ip = iip->ili_inode; + ip->i_transp = NULL; + + /* + * If the inode needed a separate buffer with which to log + * its extents, then free it now. + */ + if (iip->ili_extents_buf != NULL) { + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); + ASSERT(ip->i_d.di_nextents > 0); + ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); + ASSERT(ip->i_df.if_bytes > 0); + kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes); + iip->ili_extents_buf = NULL; + } + if (iip->ili_aextents_buf != NULL) { + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(ip->i_d.di_anextents > 0); + ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); + ASSERT(ip->i_afp->if_bytes > 0); + kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes); + iip->ili_aextents_buf = NULL; + } + + /* + * Figure out if we should unlock the inode or not. + */ + hold = iip->ili_flags & XFS_ILI_HOLD; + + /* + * Before clearing out the flags, remember whether we + * are holding the inode's IO lock. + */ + iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY; + + /* + * Clear out the fields of the inode log item particular + * to the current transaction. + */ + iip->ili_ilock_recur = 0; + iip->ili_iolock_recur = 0; + iip->ili_flags = 0; + + /* + * Unlock the inode if XFS_ILI_HOLD was not set. + */ + if (!hold) { + lock_flags = XFS_ILOCK_EXCL; + if (iolocked & XFS_ILI_IOLOCKED_EXCL) { + lock_flags |= XFS_IOLOCK_EXCL; + } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) { + lock_flags |= XFS_IOLOCK_SHARED; + } + xfs_iput(iip->ili_inode, lock_flags); + } +} + +/* + * This is called to find out where the oldest active copy of the + * inode log item in the on disk log resides now that the last log + * write of it completed at the given lsn. Since we always re-log + * all dirty data in an inode, the latest copy in the on disk log + * is the only one that matters. Therefore, simply return the + * given lsn. + */ +/*ARGSUSED*/ +STATIC xfs_lsn_t +xfs_inode_item_committed( + xfs_inode_log_item_t *iip, + xfs_lsn_t lsn) +{ + return (lsn); +} + +/* + * The transaction with the inode locked has aborted. The inode + * must not be dirty within the transaction (unless we're forcibly + * shutting down). We simply unlock just as if the transaction + * had been cancelled. + */ +STATIC void +xfs_inode_item_abort( + xfs_inode_log_item_t *iip) +{ + xfs_inode_item_unlock(iip); + return; +} + + +/* + * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK + * failed to get the inode flush lock but did get the inode locked SHARED. + * Here we're trying to see if the inode buffer is incore, and if so whether it's + * marked delayed write. If that's the case, we'll initiate a bawrite on that + * buffer to expedite the process. + * + * We aren't holding the AIL_LOCK (or the flush lock) when this gets called, + * so it is inherently race-y. + */ +STATIC void +xfs_inode_item_pushbuf( + xfs_inode_log_item_t *iip) +{ + xfs_inode_t *ip; + xfs_mount_t *mp; + xfs_buf_t *bp; + uint dopush; + + ip = iip->ili_inode; + + ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + + /* + * The ili_pushbuf_flag keeps others from + * trying to duplicate our effort. + */ + ASSERT(iip->ili_pushbuf_flag != 0); + ASSERT(iip->ili_push_owner == get_thread_id()); + + /* + * If flushlock isn't locked anymore, chances are that the + * inode flush completed and the inode was taken off the AIL. + * So, just get out. + */ + if ((valusema(&(ip->i_flock)) > 0) || + ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { + iip->ili_pushbuf_flag = 0; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return; + } + + mp = ip->i_mount; + bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, + iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); + + if (bp != NULL) { + if (XFS_BUF_ISDELAYWRITE(bp)) { + /* + * We were racing with iflush because we don't hold + * the AIL_LOCK or the flush lock. However, at this point, + * we have the buffer, and we know that it's dirty. + * So, it's possible that iflush raced with us, and + * this item is already taken off the AIL. + * If not, we can flush it async. + */ + dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && + (valusema(&(ip->i_flock)) <= 0)); + iip->ili_pushbuf_flag = 0; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_buftrace("INODE ITEM PUSH", bp); + if (XFS_BUF_ISPINNED(bp)) { + xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE); + } + if (dopush) { + xfs_bawrite(mp, bp); + } else { + xfs_buf_relse(bp); + } + } else { + iip->ili_pushbuf_flag = 0; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_buf_relse(bp); + } + return; + } + /* + * We have to be careful about resetting pushbuf flag too early (above). + * Even though in theory we can do it as soon as we have the buflock, + * we don't want others to be doing work needlessly. They'll come to + * this function thinking that pushing the buffer is their + * responsibility only to find that the buffer is still locked by + * another doing the same thing + */ + iip->ili_pushbuf_flag = 0; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return; +} + + +/* + * This is called to asynchronously write the inode associated with this + * inode log item out to disk. The inode will already have been locked by + * a successful call to xfs_inode_item_trylock(). + */ +STATIC void +xfs_inode_item_push( + xfs_inode_log_item_t *iip) +{ + xfs_inode_t *ip; + + ip = iip->ili_inode; + + ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + ASSERT(valusema(&(ip->i_flock)) <= 0); + /* + * Since we were able to lock the inode's flush lock and + * we found it on the AIL, the inode must be dirty. This + * is because the inode is removed from the AIL while still + * holding the flush lock in xfs_iflush_done(). Thus, if + * we found it in the AIL and were able to obtain the flush + * lock without sleeping, then there must not have been + * anyone in the process of flushing the inode. + */ + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || + iip->ili_format.ilf_fields != 0); + + /* + * Write out the inode. The completion routine ('iflush_done') will + * pull it from the AIL, mark it clean, unlock the flush lock. + */ + (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return; +} + +/* + * XXX rcc - this one really has to do something. Probably needs + * to stamp in a new field in the incore inode. + */ +/* ARGSUSED */ +STATIC void +xfs_inode_item_committing( + xfs_inode_log_item_t *iip, + xfs_lsn_t lsn) +{ + iip->ili_last_lsn = lsn; + return; +} + +/* + * This is the ops vector shared by all buf log items. + */ +struct xfs_item_ops xfs_inode_item_ops = { + .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, + .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) + xfs_inode_item_format, + .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, + .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, + .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) + xfs_inode_item_unpin_remove, + .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, + .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, + .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_inode_item_committed, + .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, + .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort, + .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, + .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) + xfs_inode_item_committing +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + */ +void +xfs_inode_item_init( + xfs_inode_t *ip, + xfs_mount_t *mp) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_itemp == NULL); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + + iip->ili_item.li_type = XFS_LI_INODE; + iip->ili_item.li_ops = &xfs_inode_item_ops; + iip->ili_item.li_mountp = mp; + iip->ili_inode = ip; + + /* + We have zeroed memory. No need ... + iip->ili_extents_buf = NULL; + iip->ili_pushbuf_flag = 0; + */ + + iip->ili_format.ilf_type = XFS_LI_INODE; + iip->ili_format.ilf_ino = ip->i_ino; + iip->ili_format.ilf_blkno = ip->i_blkno; + iip->ili_format.ilf_len = ip->i_len; + iip->ili_format.ilf_boffset = ip->i_boffset; +} + +/* + * Free the inode log item and any memory hanging off of it. + */ +void +xfs_inode_item_destroy( + xfs_inode_t *ip) +{ +#ifdef XFS_TRANS_DEBUG + if (ip->i_itemp->ili_root_size != 0) { + kmem_free(ip->i_itemp->ili_orig_root, + ip->i_itemp->ili_root_size); + } +#endif + kmem_zone_free(xfs_ili_zone, ip->i_itemp); +} + + +/* + * This is the inode flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the inode is + * flushed to disk. It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + */ +/*ARGSUSED*/ +void +xfs_iflush_done( + xfs_buf_t *bp, + xfs_inode_log_item_t *iip) +{ + xfs_inode_t *ip; + SPLDECL(s); + + ip = iip->ili_inode; + + /* + * We only want to pull the item from the AIL if it is + * actually there and its location in the log has not + * changed since we started the flush. Thus, we only bother + * if the ili_logged flag is set and the inode's lsn has not + * changed. First we check the lsn outside + * the lock since it's cheaper, and then we recheck while + * holding the lock before removing the inode from the AIL. + */ + if (iip->ili_logged && + (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { + AIL_LOCK(ip->i_mount, s); + if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(ip->i_mount, + (xfs_log_item_t*)iip, s); + } else { + AIL_UNLOCK(ip->i_mount, s); + } + } + + iip->ili_logged = 0; + + /* + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. + */ + iip->ili_last_fields = 0; + + /* + * Release the inode's flush lock since we're done with it. + */ + xfs_ifunlock(ip); + + return; +} + +/* + * This is the inode flushing abort routine. It is called + * from xfs_iflush when the filesystem is shutting down to clean + * up the inode state. + * It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + */ +void +xfs_iflush_abort( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip; + xfs_mount_t *mp; + SPLDECL(s); + + iip = ip->i_itemp; + mp = ip->i_mount; + if (iip) { + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + AIL_LOCK(mp, s); + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + /* + * xfs_trans_delete_ail() drops the AIL lock. + */ + xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip, + s); + } else + AIL_UNLOCK(mp, s); + } + iip->ili_logged = 0; + /* + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. + */ + iip->ili_last_fields = 0; + /* + * Clear the inode logging fields so no more flushes are + * attempted. + */ + iip->ili_format.ilf_fields = 0; + } + /* + * Release the inode's flush lock since we're done with it. + */ + xfs_ifunlock(ip); +} + +void +xfs_istale_done( + xfs_buf_t *bp, + xfs_inode_log_item_t *iip) +{ + xfs_iflush_abort(iip->ili_inode); +} diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h new file mode 100644 index 00000000000..d8775e0d629 --- /dev/null +++ b/fs/xfs/xfs_inode_item.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_INODE_ITEM_H__ +#define __XFS_INODE_ITEM_H__ + +/* + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. + * + * Convention for naming inode log item versions : The current version + * is always named XFS_LI_INODE. When an inode log item gets superseded, + * add the latest version of IRIX that will generate logs with that item + * to the version name. + * + * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first + * union (ilf_u) field. This was released with IRIX 5.3-XFS. + * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire + * structure. This was released with IRIX 6.0.1-XFS and IRIX 6.1. + * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2 + * so a new structure definition wasn't necessary. However, we had + * to add a new type because the inode cluster size changed from 4K + * to 8K and the version number had to be rev'ved to keep older kernels + * from trying to recover logs with the 8K buffers in them. The logging + * code can handle recovery on different-sized clusters now so hopefully + * this'll be the last time we need to change the inode log item just + * for a change in the inode cluster size. This new version was + * released with IRIX 6.2. + */ +typedef struct xfs_inode_log_format { + unsigned short ilf_type; /* inode log item type */ + unsigned short ilf_size; /* size of this item */ + uint ilf_fields; /* flags for fields logged */ + ushort ilf_asize; /* size of attr d/ext/root */ + ushort ilf_dsize; /* size of data/ext/root */ + xfs_ino_t ilf_ino; /* inode number */ + union { + xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + int ilf_len; /* len of inode buffer */ + int ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +/* Initial version shipped with IRIX 5.3-XFS */ +typedef struct xfs_inode_log_format_v1 { + unsigned short ilf_type; /* inode log item type */ + unsigned short ilf_size; /* size of this item */ + uint ilf_fields; /* flags for fields logged */ + uint ilf_dsize; /* size of data/ext/root */ + xfs_ino_t ilf_ino; /* inode number */ + union { + xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; +} xfs_inode_log_format_t_v1; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILI_HOLD 0x1 +#define XFS_ILI_IOLOCKED_EXCL 0x2 +#define XFS_ILI_IOLOCKED_SHARED 0x4 + +#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) + + +#ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_bmbt_rec_64; +struct xfs_inode; +struct xfs_mount; + + +typedef struct xfs_inode_log_item { + xfs_log_item_t ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + unsigned short ili_ilock_recur; /* lock recursion count */ + unsigned short ili_iolock_recur; /* lock recursion count */ + unsigned short ili_flags; /* misc flags */ + unsigned short ili_logged; /* flushed logged data */ + unsigned int ili_last_fields; /* fields when flushed */ + struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged + data exts */ + struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged + attr exts */ + unsigned int ili_pushbuf_flag; /* one bit used in push_ail */ + +#ifdef DEBUG + uint64_t ili_push_owner; /* one who sets pushbuf_flag + above gets to push the buf */ +#endif +#ifdef XFS_TRANS_DEBUG + int ili_root_size; + char *ili_orig_root; +#endif + xfs_inode_log_format_t ili_format; /* logged structure */ +} xfs_inode_log_item_t; + + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FDATA) +int xfs_ilog_fdata(int w); +#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w) +#else +#define XFS_ILOG_FDATA(w) \ + ((w) == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA) +#endif + +#endif /* __KERNEL__ */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FBROOT) +int xfs_ilog_fbroot(int w); +#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w) +#else +#define XFS_ILOG_FBROOT(w) \ + ((w) == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FEXT) +int xfs_ilog_fext(int w); +#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w) +#else +#define XFS_ILOG_FEXT(w) \ + ((w) == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT) +#endif + +#ifdef __KERNEL__ + +void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); +void xfs_inode_item_destroy(struct xfs_inode *); +void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); +void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *); +void xfs_iflush_abort(struct xfs_inode *); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h new file mode 100644 index 00000000000..a3af2d5a6eb --- /dev/null +++ b/fs/xfs/xfs_inum.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_INUM_H__ +#define __XFS_INUM_H__ + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ + +typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ + +/* + * Useful inode bits for this kernel. + * Used in some places where having 64-bits in the 32-bit kernels + * costs too much. + */ +#if XFS_BIG_INUMS +typedef xfs_ino_t xfs_intino_t; +#else +typedef __uint32_t xfs_intino_t; +#endif + +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + +struct xfs_mount; + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_MASK) +__uint32_t xfs_ino_mask(int k); +#define XFS_INO_MASK(k) xfs_ino_mask(k) +#else +#define XFS_INO_MASK(k) ((__uint32_t)((1ULL << (k)) - 1)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_OFFSET_BITS) +int xfs_ino_offset_bits(struct xfs_mount *mp); +#define XFS_INO_OFFSET_BITS(mp) xfs_ino_offset_bits(mp) +#else +#define XFS_INO_OFFSET_BITS(mp) ((mp)->m_sb.sb_inopblog) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGBNO_BITS) +int xfs_ino_agbno_bits(struct xfs_mount *mp); +#define XFS_INO_AGBNO_BITS(mp) xfs_ino_agbno_bits(mp) +#else +#define XFS_INO_AGBNO_BITS(mp) ((mp)->m_sb.sb_agblklog) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGINO_BITS) +int xfs_ino_agino_bits(struct xfs_mount *mp); +#define XFS_INO_AGINO_BITS(mp) xfs_ino_agino_bits(mp) +#else +#define XFS_INO_AGINO_BITS(mp) ((mp)->m_agino_log) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGNO_BITS) +int xfs_ino_agno_bits(struct xfs_mount *mp); +#define XFS_INO_AGNO_BITS(mp) xfs_ino_agno_bits(mp) +#else +#define XFS_INO_AGNO_BITS(mp) ((mp)->m_agno_log) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_BITS) +int xfs_ino_bits(struct xfs_mount *mp); +#define XFS_INO_BITS(mp) xfs_ino_bits(mp) +#else +#define XFS_INO_BITS(mp) (XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGNO) +xfs_agnumber_t xfs_ino_to_agno(struct xfs_mount *mp, xfs_ino_t i); +#define XFS_INO_TO_AGNO(mp,i) xfs_ino_to_agno(mp,i) +#else +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGINO) +xfs_agino_t xfs_ino_to_agino(struct xfs_mount *mp, xfs_ino_t i); +#define XFS_INO_TO_AGINO(mp,i) xfs_ino_to_agino(mp,i) +#else +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGBNO) +xfs_agblock_t xfs_ino_to_agbno(struct xfs_mount *mp, xfs_ino_t i); +#define XFS_INO_TO_AGBNO(mp,i) xfs_ino_to_agbno(mp,i) +#else +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_OFFSET) +int xfs_ino_to_offset(struct xfs_mount *mp, xfs_ino_t i); +#define XFS_INO_TO_OFFSET(mp,i) xfs_ino_to_offset(mp,i) +#else +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_FSB) +xfs_fsblock_t xfs_ino_to_fsb(struct xfs_mount *mp, xfs_ino_t i); +#define XFS_INO_TO_FSB(mp,i) xfs_ino_to_fsb(mp,i) +#else +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_INO) +xfs_ino_t +xfs_agino_to_ino(struct xfs_mount *mp, xfs_agnumber_t a, xfs_agino_t i); +#define XFS_AGINO_TO_INO(mp,a,i) xfs_agino_to_ino(mp,a,i) +#else +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_AGBNO) +xfs_agblock_t xfs_agino_to_agbno(struct xfs_mount *mp, xfs_agino_t i); +#define XFS_AGINO_TO_AGBNO(mp,i) xfs_agino_to_agbno(mp,i) +#else +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_OFFSET) +int xfs_agino_to_offset(struct xfs_mount *mp, xfs_agino_t i); +#define XFS_AGINO_TO_OFFSET(mp,i) xfs_agino_to_offset(mp,i) +#else +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_OFFBNO_TO_AGINO) +xfs_agino_t xfs_offbno_to_agino(struct xfs_mount *mp, xfs_agblock_t b, int o); +#define XFS_OFFBNO_TO_AGINO(mp,b,o) xfs_offbno_to_agino(mp,b,o) +#else +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) +#endif + +#if XFS_BIG_INUMS +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32)) +#else +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) +#endif +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +#endif /* __XFS_INUM_H__ */ diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c new file mode 100644 index 00000000000..414ec496845 --- /dev/null +++ b/fs/xfs/xfs_iocore.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_itable.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_rw.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_iomap.h" + + +STATIC xfs_fsize_t +xfs_size_fn( + xfs_inode_t *ip) +{ + return (ip->i_d.di_size); +} + +STATIC int +xfs_ioinit( + struct vfs *vfsp, + struct xfs_mount_args *mntargs, + int flags) +{ + return xfs_mountfs(vfsp, XFS_VFSTOM(vfsp), flags); +} + +xfs_ioops_t xfs_iocore_xfs = { + .xfs_ioinit = (xfs_ioinit_t) xfs_ioinit, + .xfs_bmapi_func = (xfs_bmapi_t) xfs_bmapi, + .xfs_bmap_eof_func = (xfs_bmap_eof_t) xfs_bmap_eof, + .xfs_iomap_write_direct = + (xfs_iomap_write_direct_t) xfs_iomap_write_direct, + .xfs_iomap_write_delay = + (xfs_iomap_write_delay_t) xfs_iomap_write_delay, + .xfs_iomap_write_allocate = + (xfs_iomap_write_allocate_t) xfs_iomap_write_allocate, + .xfs_iomap_write_unwritten = + (xfs_iomap_write_unwritten_t) xfs_iomap_write_unwritten, + .xfs_ilock = (xfs_lock_t) xfs_ilock, + .xfs_lck_map_shared = (xfs_lck_map_shared_t) xfs_ilock_map_shared, + .xfs_ilock_demote = (xfs_lock_demote_t) xfs_ilock_demote, + .xfs_ilock_nowait = (xfs_lock_nowait_t) xfs_ilock_nowait, + .xfs_unlock = (xfs_unlk_t) xfs_iunlock, + .xfs_size_func = (xfs_size_t) xfs_size_fn, + .xfs_iodone = (xfs_iodone_t) fs_noerr, +}; + +void +xfs_iocore_inode_reinit( + xfs_inode_t *ip) +{ + xfs_iocore_t *io = &ip->i_iocore; + + io->io_flags = 0; + if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) + io->io_flags |= XFS_IOCORE_RT; + io->io_dmevmask = ip->i_d.di_dmevmask; + io->io_dmstate = ip->i_d.di_dmstate; +} + +void +xfs_iocore_inode_init( + xfs_inode_t *ip) +{ + xfs_iocore_t *io = &ip->i_iocore; + xfs_mount_t *mp = ip->i_mount; + + io->io_mount = mp; +#ifdef DEBUG + io->io_lock = &ip->i_lock; + io->io_iolock = &ip->i_iolock; +#endif + + io->io_obj = (void *)ip; + + xfs_iocore_inode_reinit(ip); +} diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c new file mode 100644 index 00000000000..3826e8f0e28 --- /dev/null +++ b/fs/xfs/xfs_iomap.c @@ -0,0 +1,1000 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" + +#if defined(XFS_RW_TRACE) +void +xfs_iomap_enter_trace( + int tag, + xfs_iocore_t *io, + xfs_off_t offset, + ssize_t count) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (!ip->i_rwtrace) + return; + + ktrace_enter(ip->i_rwtrace, + (void *)((unsigned long)tag), + (void *)ip, + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)count), + (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(io->io_new_size & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} + +void +xfs_iomap_map_trace( + int tag, + xfs_iocore_t *io, + xfs_off_t offset, + ssize_t count, + xfs_iomap_t *iomapp, + xfs_bmbt_irec_t *imapp, + int flags) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (!ip->i_rwtrace) + return; + + ktrace_enter(ip->i_rwtrace, + (void *)((unsigned long)tag), + (void *)ip, + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)count), + (void *)((unsigned long)flags), + (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)), + (void *)((unsigned long)(iomapp->iomap_delta)), + (void *)((unsigned long)(iomapp->iomap_bsize)), + (void *)((unsigned long)(iomapp->iomap_bn)), + (void *)(__psint_t)(imapp->br_startoff), + (void *)((unsigned long)(imapp->br_blockcount)), + (void *)(__psint_t)(imapp->br_startblock)); +} +#else +#define xfs_iomap_enter_trace(tag, io, offset, count) +#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags) +#endif + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_STRAT_WRITE_IMAPS 2 +#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP + +STATIC int +xfs_imap_to_bmap( + xfs_iocore_t *io, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + xfs_iomap_t *iomapp, + int imaps, /* Number of imap entries */ + int iomaps, /* Number of iomap entries */ + int flags) +{ + xfs_mount_t *mp; + xfs_fsize_t nisize; + int pbm; + xfs_fsblock_t start_block; + + mp = io->io_mount; + nisize = XFS_SIZE(mp, io); + if (io->io_new_size > nisize) + nisize = io->io_new_size; + + for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { + iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomapp->iomap_delta = offset - iomapp->iomap_offset; + iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomapp->iomap_flags = flags; + + if (io->io_flags & XFS_IOCORE_RT) { + iomapp->iomap_flags |= IOMAP_REALTIME; + iomapp->iomap_target = mp->m_rtdev_targp; + } else { + iomapp->iomap_target = mp->m_ddev_targp; + } + start_block = imap->br_startblock; + if (start_block == HOLESTARTBLOCK) { + iomapp->iomap_bn = IOMAP_DADDR_NULL; + iomapp->iomap_flags |= IOMAP_HOLE; + } else if (start_block == DELAYSTARTBLOCK) { + iomapp->iomap_bn = IOMAP_DADDR_NULL; + iomapp->iomap_flags |= IOMAP_DELAY; + } else { + iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block); + if (ISUNWRITTEN(imap)) + iomapp->iomap_flags |= IOMAP_UNWRITTEN; + } + + if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) { + iomapp->iomap_flags |= IOMAP_EOF; + } + + offset += iomapp->iomap_bsize - iomapp->iomap_delta; + } + return pbm; /* Return the number filled */ +} + +int +xfs_iomap( + xfs_iocore_t *io, + xfs_off_t offset, + ssize_t count, + int flags, + xfs_iomap_t *iomapp, + int *niomaps) +{ + xfs_mount_t *mp = io->io_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int bmapi_flags = 0; + int iomap_flags = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + switch (flags & + (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE | + BMAPI_UNWRITTEN | BMAPI_DEVICE)) { + case BMAPI_READ: + xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); + lockmode = XFS_LCK_MAP_SHARED(mp, io); + bmapi_flags = XFS_BMAPI_ENTIRE; + if (flags & BMAPI_IGNSTATE) + bmapi_flags |= XFS_BMAPI_IGSTATE; + break; + case BMAPI_WRITE: + xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); + lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; + bmapi_flags = 0; + XFS_ILOCK(mp, io, lockmode); + break; + case BMAPI_ALLOCATE: + xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count); + lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; + bmapi_flags = XFS_BMAPI_ENTIRE; + /* Attempt non-blocking lock */ + if (flags & BMAPI_TRYLOCK) { + if (!XFS_ILOCK_NOWAIT(mp, io, lockmode)) + return XFS_ERROR(EAGAIN); + } else { + XFS_ILOCK(mp, io, lockmode); + } + break; + case BMAPI_UNWRITTEN: + goto phase2; + case BMAPI_DEVICE: + lockmode = XFS_LCK_MAP_SHARED(mp, io); + iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? + mp->m_rtdev_targp : mp->m_ddev_targp; + error = 0; + *niomaps = 1; + goto out; + default: + BUG(); + } + + ASSERT(offset <= mp->m_maxioffset); + if ((xfs_fsize_t)offset + count > mp->m_maxioffset) + count = mp->m_maxioffset - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = XFS_BMAPI(mp, NULL, io, offset_fsb, + (xfs_filblks_t)(end_fsb - offset_fsb), + bmapi_flags, NULL, 0, &imap, + &nimaps, NULL); + + if (error) + goto out; + +phase2: + switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) { + case BMAPI_WRITE: + /* If we found an extent, return it */ + if (nimaps && (imap.br_startblock != HOLESTARTBLOCK)) { + xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, + offset, count, iomapp, &imap, flags); + break; + } + + if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { + error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset, + count, flags, &imap, &nimaps, nimaps); + } else { + error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, + flags, &imap, &nimaps); + } + if (!error) { + xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io, + offset, count, iomapp, &imap, flags); + } + iomap_flags = IOMAP_NEW; + break; + case BMAPI_ALLOCATE: + /* If we found an extent, return it */ + XFS_IUNLOCK(mp, io, lockmode); + lockmode = 0; + + if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { + xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, + offset, count, iomapp, &imap, flags); + break; + } + + error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps); + break; + case BMAPI_UNWRITTEN: + lockmode = 0; + error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count); + nimaps = 0; + break; + } + + if (nimaps) { + *niomaps = xfs_imap_to_bmap(io, offset, &imap, + iomapp, nimaps, *niomaps, iomap_flags); + } else if (niomaps) { + *niomaps = 0; + } + +out: + if (lockmode) + XFS_IUNLOCK(mp, io, lockmode); + return XFS_ERROR(error); +} + +STATIC int +xfs_flush_space( + xfs_inode_t *ip, + int *fsynced, + int *ioflags) +{ + switch (*fsynced) { + case 0: + if (ip->i_delayed_blks) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inode(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + *fsynced = 1; + } else { + *ioflags |= BMAPI_SYNC; + *fsynced = 2; + } + return 0; + case 1: + *fsynced = 2; + *ioflags |= BMAPI_SYNC; + return 0; + case 2: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_device(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + *fsynced = 3; + return 0; + } + return 1; +} + +int +xfs_iomap_write_direct( + xfs_inode_t *ip, + loff_t offset, + size_t count, + int flags, + xfs_bmbt_irec_t *ret_imap, + int *nmaps, + int found) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t count_fsb; + xfs_fsize_t isize; + xfs_fsblock_t firstfsb; + int nimaps, maps; + int error; + int bmapi_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp; + xfs_bmap_free_t free_list; + int aeof; + xfs_filblks_t datablocks; + int committed; + int numrtextents; + uint resblks; + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. + */ + error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED); + if (error) + return XFS_ERROR(error); + + maps = min(XFS_WRITE_IMAPS, *nmaps); + nimaps = maps; + + isize = ip->i_d.di_size; + aeof = (offset + count) > isize; + + if (io->io_new_size > isize) + isize = io->io_new_size; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + count_fsb = last_fsb - offset_fsb; + if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) { + xfs_fileoff_t map_last_fsb; + + map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff; + + if (map_last_fsb < last_fsb) { + last_fsb = map_last_fsb; + count_fsb = last_fsb - offset_fsb; + } + ASSERT(count_fsb > 0); + } + + /* + * determine if reserving space on + * the data or realtime partition. + */ + if ((rt = XFS_IS_REALTIME_INODE(ip))) { + int sbrtextsize, iprtextsize; + + sbrtextsize = mp->m_sb.sb_rextsize; + iprtextsize = + ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize; + numrtextents = (count_fsb + iprtextsize - 1); + do_div(numrtextents, sbrtextsize); + datablocks = 0; + } else { + datablocks = count_fsb; + numrtextents = 0; + } + + /* + * allocate and setup the transaction + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + + resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); + + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), numrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + /* + * check for running out of space + */ + if (error) + /* + * Free the transaction structure. + */ + xfs_trans_cancel(tp, 0); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + goto error_out; /* Don't return in above if .. trans .., + need lock to return */ + + if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) { + error = (EDQUOT); + goto error1; + } + nimaps = 1; + + bmapi_flag = XFS_BMAPI_WRITE; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt)) + bmapi_flag |= XFS_BMAPI_PREALLOC; + + /* + * issue the bmapi() call to allocate the blocks + */ + XFS_BMAP_INIT(&free_list, &firstfsb); + imapp = &imap[0]; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, + bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + + error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) { + goto error_out; + } + + /* copy any maps to caller's array and return any error. */ + if (nimaps == 0) { + error = (ENOSPC); + goto error_out; + } + + *ret_imap = imap[0]; + *nmaps = 1; + if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { + cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " + "start_block : %llx start_off : %llx blkcnt : %llx " + "extent-state : %x \n", + (ip->i_mount)->m_fsname, + (long long)ip->i_ino, + ret_imap->br_startblock, ret_imap->br_startoff, + ret_imap->br_blockcount,ret_imap->br_state); + } + return 0; + + error0: /* Cancel bmap, unlock inode, and cancel trans */ + xfs_bmap_cancel(&free_list); + + error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + *nmaps = 0; /* nothing set-up here */ + +error_out: + return XFS_ERROR(error); +} + +int +xfs_iomap_write_delay( + xfs_inode_t *ip, + loff_t offset, + size_t count, + int ioflag, + xfs_bmbt_irec_t *ret_imap, + int *nmaps) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_fsize_t isize; + xfs_fsblock_t firstblock; + int nimaps; + int error; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; + int aeof; + int fsynced = 0; + + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. + */ + + error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); + if (error) + return XFS_ERROR(error); + +retry: + isize = ip->i_d.di_size; + if (io->io_new_size > isize) { + isize = io->io_new_size; + } + + aeof = 0; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + /* + * If the caller is doing a write at the end of the file, + * then extend the allocation (and the buffer used for the write) + * out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). + * + * For sync writes, we are flushing delayed allocate space to + * try to make additional space available for allocation near + * the filesystem full boundary - preallocation hurts in that + * situation, of course. + */ + if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) { + xfs_off_t aligned_offset; + xfs_filblks_t count_fsb; + unsigned int iosize; + xfs_fileoff_t ioalign; + int n; + xfs_fileoff_t start_fsb; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation. + */ + start_fsb = XFS_B_TO_FSBT(mp, + ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + while (count_fsb > 0) { + nimaps = XFS_WRITE_IMAPS; + error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, + 0, &firstblock, 0, imap, &nimaps, NULL); + if (error) { + return error; + } + for (n = 0; n < nimaps; n++) { + if ( !(io->io_flags & XFS_IOCORE_RT) && + !imap[n].br_startblock) { + cmn_err(CE_PANIC,"Access to block " + "zero: fs <%s> inode: %lld " + "start_block : %llx start_off " + ": %llx blkcnt : %llx " + "extent-state : %x \n", + (ip->i_mount)->m_fsname, + (long long)ip->i_ino, + imap[n].br_startblock, + imap[n].br_startoff, + imap[n].br_blockcount, + imap[n].br_state); + } + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) { + goto write_map; + } + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + } + } + iosize = mp->m_writeio_blocks; + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); + ioalign = XFS_B_TO_FSBT(mp, aligned_offset); + last_fsb = ioalign + iosize; + aeof = 1; + } +write_map: + nimaps = XFS_WRITE_IMAPS; + firstblock = NULLFSBLOCK; + + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. + */ + if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth + && (mp->m_flags & XFS_MOUNT_SWALLOC) + && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) { + int eof; + xfs_fileoff_t new_last_fsb; + + new_last_fsb = roundup_64(last_fsb, mp->m_swidth); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) { + return error; + } + if (eof) { + last_fsb = new_last_fsb; + } + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign && + (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) { + int eof; + xfs_fileoff_t new_last_fsb; + new_last_fsb = roundup_64(last_fsb, mp->m_dalign); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) { + return error; + } + if (eof) { + last_fsb = new_last_fsb; + } + /* + * Round up the allocation request to a real-time extent boundary + * if the file is on the real-time subvolume. + */ + } else if (io->io_flags & XFS_IOCORE_RT && aeof) { + int eof; + xfs_fileoff_t new_last_fsb; + + new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize); + error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) { + return error; + } + if (eof) + last_fsb = new_last_fsb; + } + error = xfs_bmapi(NULL, ip, offset_fsb, + (xfs_filblks_t)(last_fsb - offset_fsb), + XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | + XFS_BMAPI_ENTIRE, &firstblock, 1, imap, + &nimaps, NULL); + /* + * This can be EDQUOT, if nimaps == 0 + */ + if (error && (error != ENOSPC)) { + return XFS_ERROR(error); + } + /* + * If bmapi returned us nothing, and if we didn't get back EDQUOT, + * then we must have run out of space. + */ + if (nimaps == 0) { + xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, + io, offset, count); + if (xfs_flush_space(ip, &fsynced, &ioflag)) + return XFS_ERROR(ENOSPC); + + error = 0; + goto retry; + } + + *ret_imap = imap[0]; + *nmaps = 1; + if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { + cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " + "start_block : %llx start_off : %llx blkcnt : %llx " + "extent-state : %x \n", + (ip->i_mount)->m_fsname, + (long long)ip->i_ino, + ret_imap->br_startblock, ret_imap->br_startoff, + ret_imap->br_blockcount,ret_imap->br_state); + } + return 0; +} + +/* + * Pass in a delayed allocate extent, convert it to real extents; + * return to the caller the extent we create which maps on top of + * the originating callers request. + * + * Called without a lock on the inode. + */ +int +xfs_iomap_write_allocate( + xfs_inode_t *ip, + xfs_bmbt_irec_t *map, + int *retmap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; + xfs_fileoff_t offset_fsb, last_block; + xfs_fileoff_t end_fsb, map_start_fsb; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS]; + xfs_trans_t *tp; + int i, nimaps, committed; + int error = 0; + int nres; + + *retmap = 0; + + /* + * Make sure that the dquots are there. + */ + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + return XFS_ERROR(error); + + offset_fsb = map->br_startoff; + count_fsb = map->br_blockcount; + map_start_fsb = offset_fsb; + + XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. + */ + + nimaps = 0; + while (nimaps == 0) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_reserve(tp, nres, + XFS_WRITE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error == ENOSPC) { + error = xfs_trans_reserve(tp, 0, + XFS_WRITE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + } + if (error) { + xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + XFS_BMAP_INIT(&free_list, &first_block); + + nimaps = XFS_STRAT_WRITE_IMAPS; + /* + * Ensure we don't go beyond eof - it is possible + * the extents changed since we did the read call, + * we dropped the ilock in the interim. + */ + + end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size); + xfs_bmap_last_offset(NULL, ip, &last_block, + XFS_DATA_FORK); + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + error = EAGAIN; + goto trans_cancel; + } + } + + /* Go get the actual blocks */ + error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, + XFS_BMAPI_WRITE, &first_block, 1, + imap, &nimaps, &free_list); + if (error) + goto trans_cancel; + + error = xfs_bmap_finish(&tp, &free_list, + first_block, &committed); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) + goto error0; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the callers request + */ + + for (i = 0; i < nimaps; i++) { + if ( !(io->io_flags & XFS_IOCORE_RT) && + !imap[i].br_startblock) { + cmn_err(CE_PANIC,"Access to block zero: " + "fs <%s> inode: %lld " + "start_block : %llx start_off : %llx " + "blkcnt : %llx extent-state : %x \n", + (ip->i_mount)->m_fsname, + (long long)ip->i_ino, + imap[i].br_startblock, + imap[i].br_startoff, + imap[i].br_blockcount,imap[i].br_state); + } + if ((map->br_startoff >= imap[i].br_startoff) && + (map->br_startoff < (imap[i].br_startoff + + imap[i].br_blockcount))) { + *map = imap[i]; + *retmap = 1; + XFS_STATS_INC(xs_xstrat_quick); + return 0; + } + count_fsb -= imap[i].br_blockcount; + } + + /* So far we have not mapped the requested part of the + * file, just surrounding data, try again. + */ + nimaps--; + offset_fsb = imap[nimaps].br_startoff + + imap[nimaps].br_blockcount; + map_start_fsb = offset_fsb; + } + +trans_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return XFS_ERROR(error); +} + +int +xfs_iomap_write_unwritten( + xfs_inode_t *ip, + loff_t offset, + size_t count) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_iocore_t *io = &ip->i_iocore; + xfs_trans_t *tp; + xfs_fileoff_t offset_fsb; + xfs_filblks_t count_fsb; + xfs_filblks_t numblks_fsb; + xfs_bmbt_irec_t imap; + int committed; + int error; + int nres; + int nimaps; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + + xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, + &ip->i_iocore, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + + do { + nres = XFS_DIOSTRAT_SPACE_RES(mp, 0); + + /* + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. + */ + + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + error = xfs_trans_reserve(tp, nres, + XFS_WRITE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto error0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + /* + * Modify the unwritten extent state of the buffer. + */ + XFS_BMAP_INIT(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_WRITE, &firstfsb, + 1, &imap, &nimaps, &free_list); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_bmap_finish(&(tp), &(free_list), + firstfsb, &committed); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto error0; + + if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { + cmn_err(CE_PANIC,"Access to block zero: fs <%s> " + "inode: %lld start_block : %llx start_off : " + "%llx blkcnt : %llx extent-state : %x \n", + (ip->i_mount)->m_fsname, + (long long)ip->i_ino, + imap.br_startblock,imap.br_startoff, + imap.br_blockcount,imap.br_state); + } + + if ((numblks_fsb = imap.br_blockcount) == 0) { + /* + * The numblks_fsb value should always get + * smaller, otherwise the loop is stuck. + */ + ASSERT(imap.br_blockcount); + break; + } + offset_fsb += numblks_fsb; + count_fsb -= numblks_fsb; + } while (count_fsb > 0); + + return 0; + +error_on_bmapi_transaction: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + return XFS_ERROR(error); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h new file mode 100644 index 00000000000..31c91087cb3 --- /dev/null +++ b/fs/xfs/xfs_iomap.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2003,2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + + + +#ifndef __XFS_IOMAP_H__ +#define __XFS_IOMAP_H__ + +#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL)) + + +typedef enum { /* iomap_flags values */ + IOMAP_EOF = 0x01, /* mapping contains EOF */ + IOMAP_HOLE = 0x02, /* mapping covers a hole */ + IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ + IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ + IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */ + /* but uninitialized file data */ + IOMAP_NEW = 0x40 /* just allocate */ +} iomap_flags_t; + +typedef enum { + /* base extent manipulation calls */ + BMAPI_READ = (1 << 0), /* read extents */ + BMAPI_WRITE = (1 << 1), /* create extents */ + BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */ + BMAPI_UNWRITTEN = (1 << 3), /* unwritten extents to real extents */ + /* modifiers */ + BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ + BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ + BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ + BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */ + BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */ + BMAPI_DEVICE = (1 << 9), /* we only want to know the device */ +} bmapi_flags_t; + + +/* + * xfs_iomap_t: File system I/O map + * + * The iomap_bn field is expressed in 512-byte blocks, and is where the + * mapping starts on disk. + * + * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes. + * iomap_offset is the offset of the mapping in the file itself. + * iomap_bsize is the size of the mapping, iomap_delta is the + * desired data's offset into the mapping, given the offset supplied + * to the file I/O map routine. + * + * When a request is made to read beyond the logical end of the object, + * iomap_size may be set to 0, but iomap_offset and iomap_length should be set + * to the actual amount of underlying storage that has been allocated, if any. + */ + +typedef struct xfs_iomap { + xfs_daddr_t iomap_bn; /* first 512b blk of mapping */ + xfs_buftarg_t *iomap_target; + loff_t iomap_offset; /* offset of mapping, bytes */ + loff_t iomap_bsize; /* size of mapping, bytes */ + size_t iomap_delta; /* offset into mapping, bytes */ + iomap_flags_t iomap_flags; +} xfs_iomap_t; + +struct xfs_iocore; +struct xfs_inode; +struct xfs_bmbt_irec; + +extern int xfs_iomap(struct xfs_iocore *, xfs_off_t, ssize_t, int, + struct xfs_iomap *, int *); +extern int xfs_iomap_write_direct(struct xfs_inode *, loff_t, size_t, + int, struct xfs_bmbt_irec *, int *, int); +extern int xfs_iomap_write_delay(struct xfs_inode *, loff_t, size_t, int, + struct xfs_bmbt_irec *, int *); +extern int xfs_iomap_write_allocate(struct xfs_inode *, + struct xfs_bmbt_irec *, int *); +extern int xfs_iomap_write_unwritten(struct xfs_inode *, loff_t, size_t); + +#endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c new file mode 100644 index 00000000000..8fbc8d37818 --- /dev/null +++ b/fs/xfs/xfs_itable.c @@ -0,0 +1,858 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" + +#ifndef HAVE_USERACC +#define useracc(ubuffer, size, flags, foo) (0) +#define unuseracc(ubuffer, size, flags) +#endif + +STATIC int +xfs_bulkstat_one_iget( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + xfs_daddr_t bno, /* starting bno of inode cluster */ + xfs_bstat_t *buf, /* return buffer */ + int *stat) /* BULKSTAT_RV_... */ +{ + xfs_dinode_core_t *dic; /* dinode core info pointer */ + xfs_inode_t *ip; /* incore inode pointer */ + int error; + + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); + if (error) { + *stat = BULKSTAT_RV_NOTHING; + return error; + } + + ASSERT(ip != NULL); + ASSERT(ip->i_blkno != (xfs_daddr_t)0); + if (ip->i_d.di_mode == 0) { + *stat = BULKSTAT_RV_NOTHING; + error = XFS_ERROR(ENOENT); + goto out_iput; + } + + dic = &ip->i_d; + + /* xfs_iget returns the following without needing + * further change. + */ + buf->bs_nlink = dic->di_nlink; + buf->bs_projid = dic->di_projid; + buf->bs_ino = ino; + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; + buf->bs_size = dic->di_size; + buf->bs_atime.tv_sec = dic->di_atime.t_sec; + buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; + buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; + buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; + buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); + buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; + buf->bs_extents = dic->di_nextents; + buf->bs_gen = dic->di_gen; + memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + buf->bs_dmevmask = dic->di_dmevmask; + buf->bs_dmstate = dic->di_dmstate; + buf->bs_aextents = dic->di_anextents; + + switch (dic->di_format) { + case XFS_DINODE_FMT_DEV: + buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_blksize = BLKDEV_IOSIZE; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; + break; + } + + out_iput: + xfs_iput(ip, XFS_ILOCK_SHARED); + return error; +} + +STATIC int +xfs_bulkstat_one_dinode( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + xfs_dinode_t *dip, /* dinode inode pointer */ + xfs_bstat_t *buf) /* return buffer */ +{ + xfs_dinode_core_t *dic; /* dinode core info pointer */ + + dic = &dip->di_core; + + /* + * The inode format changed when we moved the link count and + * made it 32 bits long. If this is an old format inode, + * convert it in memory to look like a new one. If it gets + * flushed to disk we will convert back before flushing or + * logging it. We zero out the new projid field and the old link + * count field. We'll handle clearing the pad field (the remains + * of the old uuid field) when we actually convert the inode to + * the new format. We don't change the version number so that we + * can distinguish this from a real new format inode. + */ + if (INT_GET(dic->di_version, ARCH_CONVERT) == XFS_DINODE_VERSION_1) { + buf->bs_nlink = INT_GET(dic->di_onlink, ARCH_CONVERT); + buf->bs_projid = 0; + } else { + buf->bs_nlink = INT_GET(dic->di_nlink, ARCH_CONVERT); + buf->bs_projid = INT_GET(dic->di_projid, ARCH_CONVERT); + } + + buf->bs_ino = ino; + buf->bs_mode = INT_GET(dic->di_mode, ARCH_CONVERT); + buf->bs_uid = INT_GET(dic->di_uid, ARCH_CONVERT); + buf->bs_gid = INT_GET(dic->di_gid, ARCH_CONVERT); + buf->bs_size = INT_GET(dic->di_size, ARCH_CONVERT); + buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, ARCH_CONVERT); + buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, ARCH_CONVERT); + buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, ARCH_CONVERT); + buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, ARCH_CONVERT); + buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, ARCH_CONVERT); + buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, ARCH_CONVERT); + buf->bs_xflags = xfs_dic2xflags(dic); + buf->bs_extsize = INT_GET(dic->di_extsize, ARCH_CONVERT) << mp->m_sb.sb_blocklog; + buf->bs_extents = INT_GET(dic->di_nextents, ARCH_CONVERT); + buf->bs_gen = INT_GET(dic->di_gen, ARCH_CONVERT); + memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + buf->bs_dmevmask = INT_GET(dic->di_dmevmask, ARCH_CONVERT); + buf->bs_dmstate = INT_GET(dic->di_dmstate, ARCH_CONVERT); + buf->bs_aextents = INT_GET(dic->di_anextents, ARCH_CONVERT); + + switch (INT_GET(dic->di_format, ARCH_CONVERT)) { + case XFS_DINODE_FMT_DEV: + buf->bs_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); + buf->bs_blksize = BLKDEV_IOSIZE; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = INT_GET(dic->di_nblocks, ARCH_CONVERT); + break; + } + + return 0; +} + +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int /* error status */ +xfs_bulkstat_one( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + void *private_data, /* my private data */ + xfs_daddr_t bno, /* starting bno of inode cluster */ + int *ubused, /* bytes used by me */ + void *dibuff, /* on-disk inode buffer */ + int *stat) /* BULKSTAT_RV_... */ +{ + xfs_bstat_t *buf; /* return buffer */ + int error = 0; /* error value */ + xfs_dinode_t *dip; /* dinode inode pointer */ + + dip = (xfs_dinode_t *)dibuff; + + if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && + (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) { + *stat = BULKSTAT_RV_NOTHING; + return XFS_ERROR(EINVAL); + } + if (ubsize < sizeof(*buf)) { + *stat = BULKSTAT_RV_NOTHING; + return XFS_ERROR(ENOMEM); + } + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP); + + if (dip == NULL) { + /* We're not being passed a pointer to a dinode. This happens + * if BULKSTAT_FG_IGET is selected. Do the iget. + */ + error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat); + if (error) + goto out_free; + } else { + xfs_bulkstat_one_dinode(mp, ino, dip, buf); + } + + if (copy_to_user(buffer, buf, sizeof(*buf))) { + *stat = BULKSTAT_RV_NOTHING; + error = EFAULT; + goto out_free; + } + + *stat = BULKSTAT_RV_DIDONE; + if (ubused) + *ubused = sizeof(*buf); + + out_free: + kmem_free(buf, sizeof(*buf)); + return error; +} + +/* + * Return stat information in bulk (by-inode) for the filesystem. + */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* last inode returned */ + int *ubcountp, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + void *private_data,/* private data for formatter */ + size_t statstruct_size, /* sizeof struct filling */ + char __user *ubuffer, /* buffer with inode stats */ + int flags, /* defined in xfs_itable.h */ + int *done) /* 1 if there're more stats to get */ +{ + xfs_agblock_t agbno=0;/* allocation group block number */ + xfs_buf_t *agbp; /* agi header buffer */ + xfs_agi_t *agi; /* agi header data */ + xfs_agino_t agino; /* inode # in allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_daddr_t bno; /* inode cluster start daddr */ + int chunkidx; /* current index into inode chunk */ + int clustidx; /* current index into inode cluster */ + xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ + int end_of_ag; /* set if we've seen the ag end */ + int error; /* error code */ + int fmterror;/* bulkstat formatter result */ + __int32_t gcnt; /* current btree rec's count */ + xfs_inofree_t gfree; /* current btree rec's free mask */ + xfs_agino_t gino; /* current btree rec's start inode */ + int i; /* loop index */ + int icount; /* count of inodes good in irbuf */ + xfs_ino_t ino; /* inode number (filesystem) */ + xfs_inobt_rec_t *irbp; /* current irec buffer pointer */ + xfs_inobt_rec_t *irbuf; /* start of irec buffer */ + xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */ + xfs_ino_t lastino=0; /* last inode number returned */ + int nbcluster; /* # of blocks in a cluster */ + int nicluster; /* # of inodes in a cluster */ + int nimask; /* mask for inode clusters */ + int nirbuf; /* size of irbuf */ + int rval; /* return value error code */ + int tmp; /* result value from btree calls */ + int ubcount; /* size of user's buffer */ + int ubleft; /* bytes left in user's buffer */ + char __user *ubufp; /* pointer into user's buffer */ + int ubelem; /* spaces used in user's buffer */ + int ubused; /* bytes used by formatter */ + xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ + xfs_dinode_t *dip; /* ptr into bp for specific inode */ + xfs_inode_t *ip; /* ptr to in-core inode struct */ + + /* + * Get the last inode value, see if there's nothing to do. + */ + ino = (xfs_ino_t)*lastinop; + dip = NULL; + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + if (agno >= mp->m_sb.sb_agcount || + ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + *done = 1; + *ubcountp = 0; + return 0; + } + ubcount = *ubcountp; /* statstruct's */ + ubleft = ubcount * statstruct_size; /* bytes */ + *ubcountp = ubelem = 0; + *done = 0; + fmterror = 0; + ubufp = ubuffer; + nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? + mp->m_sb.sb_inopblock : + (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); + nimask = ~(nicluster - 1); + nbcluster = nicluster >> mp->m_sb.sb_inopblog; + /* + * Lock down the user's buffer. If a buffer was not sent, as in the case + * disk quota code calls here, we skip this. + */ + if (ubuffer && + (error = useracc(ubuffer, ubcount * statstruct_size, + (B_READ|B_PHYS), NULL))) { + return error; + } + /* + * Allocate a page-sized buffer for inode btree records. + * We could try allocating something smaller, but for normal + * calls we'll always (potentially) need the whole page. + */ + irbuf = kmem_alloc(NBPC, KM_SLEEP); + nirbuf = NBPC / sizeof(*irbuf); + /* + * Loop over the allocation groups, starting from the last + * inode returned; 0 means start of the allocation group. + */ + rval = 0; + while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) { + bp = NULL; + down_read(&mp->m_peraglock); + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + up_read(&mp->m_peraglock); + if (error) { + /* + * Skip this allocation group and go to the next one. + */ + agno++; + agino = 0; + continue; + } + agi = XFS_BUF_TO_AGI(agbp); + /* + * Allocate and initialize a btree cursor for ialloc btree. + */ + cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, + (xfs_inode_t *)0, 0); + irbp = irbuf; + irbufend = irbuf + nirbuf; + end_of_ag = 0; + /* + * If we're returning in the middle of an allocation group, + * we need to get the remainder of the chunk we're in. + */ + if (agino > 0) { + /* + * Lookup the inode chunk that this inode lives in. + */ + error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); + if (!error && /* no I/O error */ + tmp && /* lookup succeeded */ + /* got the record, should always work */ + !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, + &gfree, &i)) && + i == 1 && + /* this is the right chunk */ + agino < gino + XFS_INODES_PER_CHUNK && + /* lastino was not last in chunk */ + (chunkidx = agino - gino + 1) < + XFS_INODES_PER_CHUNK && + /* there are some left allocated */ + XFS_INOBT_MASKN(chunkidx, + XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { + /* + * Grab the chunk record. Mark all the + * uninteresting inodes (because they're + * before our start point) free. + */ + for (i = 0; i < chunkidx; i++) { + if (XFS_INOBT_MASK(i) & ~gfree) + gcnt++; + } + gfree |= XFS_INOBT_MASKN(0, chunkidx); + INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); + INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); + INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + irbp++; + agino = gino + XFS_INODES_PER_CHUNK; + icount = XFS_INODES_PER_CHUNK - gcnt; + } else { + /* + * If any of those tests failed, bump the + * inode number (just in case). + */ + agino++; + icount = 0; + } + /* + * In any case, increment to the next record. + */ + if (!error) + error = xfs_inobt_increment(cur, 0, &tmp); + } else { + /* + * Start of ag. Lookup the first inode chunk. + */ + error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); + icount = 0; + } + /* + * Loop through inode btree records in this ag, + * until we run out of inodes or space in the buffer. + */ + while (irbp < irbufend && icount < ubcount) { + /* + * Loop as long as we're unable to read the + * inode btree. + */ + while (error) { + agino += XFS_INODES_PER_CHUNK; + if (XFS_AGINO_TO_AGBNO(mp, agino) >= + INT_GET(agi->agi_length, ARCH_CONVERT)) + break; + error = xfs_inobt_lookup_ge(cur, agino, 0, 0, + &tmp); + } + /* + * If ran off the end of the ag either with an error, + * or the normal way, set end and stop collecting. + */ + if (error || + (error = xfs_inobt_get_rec(cur, &gino, &gcnt, + &gfree, &i)) || + i == 0) { + end_of_ag = 1; + break; + } + /* + * If this chunk has any allocated inodes, save it. + */ + if (gcnt < XFS_INODES_PER_CHUNK) { + INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); + INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); + INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + irbp++; + icount += XFS_INODES_PER_CHUNK - gcnt; + } + /* + * Set agino to after this chunk and bump the cursor. + */ + agino = gino + XFS_INODES_PER_CHUNK; + error = xfs_inobt_increment(cur, 0, &tmp); + } + /* + * Drop the btree buffers and the agi buffer. + * We can't hold any of the locks these represent + * when calling iget. + */ + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + /* + * Now format all the good inodes into the user's buffer. + */ + irbufend = irbp; + for (irbp = irbuf; + irbp < irbufend && ubleft >= statstruct_size; irbp++) { + /* + * Read-ahead the next chunk's worth of inodes. + */ + if (&irbp[1] < irbufend) { + /* + * Loop over all clusters in the next chunk. + * Do a readahead if there are any allocated + * inodes in that cluster. + */ + for (agbno = XFS_AGINO_TO_AGBNO(mp, + INT_GET(irbp[1].ir_startino, ARCH_CONVERT)), + chunkidx = 0; + chunkidx < XFS_INODES_PER_CHUNK; + chunkidx += nicluster, + agbno += nbcluster) { + if (XFS_INOBT_MASKN(chunkidx, + nicluster) & + ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT))) + xfs_btree_reada_bufs(mp, agno, + agbno, nbcluster); + } + } + /* + * Now process this chunk of inodes. + */ + for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0; + ubleft > 0 && + INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK; + chunkidx++, clustidx++, agino++) { + ASSERT(chunkidx < XFS_INODES_PER_CHUNK); + /* + * Recompute agbno if this is the + * first inode of the cluster. + * + * Careful with clustidx. There can be + * multple clusters per chunk, a single + * cluster per chunk or a cluster that has + * inodes represented from several different + * chunks (if blocksize is large). + * + * Because of this, the starting clustidx is + * initialized to zero in this loop but must + * later be reset after reading in the cluster + * buffer. + */ + if ((chunkidx & (nicluster - 1)) == 0) { + agbno = XFS_AGINO_TO_AGBNO(mp, + INT_GET(irbp->ir_startino, ARCH_CONVERT)) + + ((chunkidx & nimask) >> + mp->m_sb.sb_inopblog); + + if (flags & BULKSTAT_FG_QUICK) { + ino = XFS_AGINO_TO_INO(mp, agno, + agino); + bno = XFS_AGB_TO_DADDR(mp, agno, + agbno); + + /* + * Get the inode cluster buffer + */ + ASSERT(xfs_inode_zone != NULL); + ip = kmem_zone_zalloc(xfs_inode_zone, + KM_SLEEP); + ip->i_ino = ino; + ip->i_mount = mp; + if (bp) + xfs_buf_relse(bp); + error = xfs_itobp(mp, NULL, ip, + &dip, &bp, bno); + if (!error) + clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; + kmem_zone_free(xfs_inode_zone, ip); + if (XFS_TEST_ERROR(error != 0, + mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, + XFS_RANDOM_BULKSTAT_READ_CHUNK)) { + bp = NULL; + break; + } + } + } + /* + * Skip if this inode is free. + */ + if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT)) + continue; + /* + * Count used inodes as free so we can tell + * when the chunk is used up. + */ + INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1); + ino = XFS_AGINO_TO_INO(mp, agno, agino); + bno = XFS_AGB_TO_DADDR(mp, agno, agbno); + if (flags & BULKSTAT_FG_QUICK) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (clustidx << mp->m_sb.sb_inodelog)); + + if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) + != XFS_DINODE_MAGIC + || !XFS_DINODE_GOOD_VERSION( + INT_GET(dip->di_core.di_version, ARCH_CONVERT))) + continue; + } + + /* + * Get the inode and fill in a single buffer. + * BULKSTAT_FG_QUICK uses dip to fill it in. + * BULKSTAT_FG_IGET uses igets. + * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. + * This is also used to count inodes/blks, etc + * in xfs_qm_quotacheck. + */ + ubused = statstruct_size; + error = formatter(mp, ino, ubufp, + ubleft, private_data, + bno, &ubused, dip, &fmterror); + if (fmterror == BULKSTAT_RV_NOTHING) { + if (error == ENOMEM) + ubleft = 0; + continue; + } + if (fmterror == BULKSTAT_RV_GIVEUP) { + ubleft = 0; + ASSERT(error); + rval = error; + break; + } + if (ubufp) + ubufp += ubused; + ubleft -= ubused; + ubelem++; + lastino = ino; + } + } + + if (bp) + xfs_buf_relse(bp); + + /* + * Set up for the next loop iteration. + */ + if (ubleft > 0) { + if (end_of_ag) { + agno++; + agino = 0; + } else + agino = XFS_INO_TO_AGINO(mp, lastino); + } else + break; + } + /* + * Done, we're either out of filesystem or space to put the data. + */ + kmem_free(irbuf, NBPC); + if (ubuffer) + unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS)); + *ubcountp = ubelem; + if (agno >= mp->m_sb.sb_agcount) { + /* + * If we ran out of filesystem, mark lastino as off + * the end of the filesystem, so the next call + * will return immediately. + */ + *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); + *done = 1; + } else + *lastinop = (xfs_ino_t)lastino; + + return rval; +} + +/* + * Return stat information in bulk (by-inode) for the filesystem. + * Special case for non-sequential one inode bulkstat. + */ +int /* error status */ +xfs_bulkstat_single( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* inode to return */ + char __user *buffer, /* buffer with inode stats */ + int *done) /* 1 if there're more stats to get */ +{ + int count; /* count value for bulkstat call */ + int error; /* return value */ + xfs_ino_t ino; /* filesystem inode number */ + int res; /* result from bs1 */ + + /* + * note that requesting valid inode numbers which are not allocated + * to inodes will most likely cause xfs_itobp to generate warning + * messages about bad magic numbers. This is ok. The fact that + * the inode isn't actually an inode is handled by the + * error check below. Done this way to make the usual case faster + * at the expense of the error case. + */ + + ino = (xfs_ino_t)*lastinop; + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), + NULL, 0, NULL, NULL, &res); + if (error) { + /* + * Special case way failed, do it the "long" way + * to see if that works. + */ + (*lastinop)--; + count = 1; + if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, + NULL, sizeof(xfs_bstat_t), buffer, + BULKSTAT_FG_IGET, done)) + return error; + if (count == 0 || (xfs_ino_t)*lastinop != ino) + return error == EFSCORRUPTED ? + XFS_ERROR(EINVAL) : error; + else + return 0; + } + *done = 0; + return 0; +} + +/* + * Return inode number table for the filesystem. + */ +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */ +{ + xfs_buf_t *agbp; + xfs_agino_t agino; + xfs_agnumber_t agno; + int bcount; + xfs_inogrp_t *buffer; + int bufidx; + xfs_btree_cur_t *cur; + int error; + __int32_t gcnt; + xfs_inofree_t gfree; + xfs_agino_t gino; + int i; + xfs_ino_t ino; + int left; + int tmp; + + ino = (xfs_ino_t)*lastino; + agno = XFS_INO_TO_AGNO(mp, ino); + agino = XFS_INO_TO_AGINO(mp, ino); + left = *count; + *count = 0; + bcount = MIN(left, (int)(NBPP / sizeof(*buffer))); + buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + error = bufidx = 0; + cur = NULL; + agbp = NULL; + while (left > 0 && agno < mp->m_sb.sb_agcount) { + if (agbp == NULL) { + down_read(&mp->m_peraglock); + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + up_read(&mp->m_peraglock); + if (error) { + /* + * If we can't read the AGI of this ag, + * then just skip to the next one. + */ + ASSERT(cur == NULL); + agbp = NULL; + agno++; + agino = 0; + continue; + } + cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + /* + * Move up the the last inode in the current + * chunk. The lookup_ge will always get + * us the first inode in the next chunk. + */ + agino += XFS_INODES_PER_CHUNK - 1; + continue; + } + } + if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, + &i)) || + i == 0) { + xfs_buf_relse(agbp); + agbp = NULL; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = NULL; + agno++; + agino = 0; + continue; + } + agino = gino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); + buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; + buffer[bufidx].xi_allocmask = ~gfree; + bufidx++; + left--; + if (bufidx == bcount) { + if (copy_to_user(ubuffer, buffer, + bufidx * sizeof(*buffer))) { + error = XFS_ERROR(EFAULT); + break; + } + ubuffer += bufidx; + *count += bufidx; + bufidx = 0; + } + if (left) { + error = xfs_inobt_increment(cur, 0, &tmp); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + /* + * The agino value has already been bumped. + * Just try to skip up to it. + */ + agino += XFS_INODES_PER_CHUNK; + continue; + } + } + } + if (!error) { + if (bufidx) { + if (copy_to_user(ubuffer, buffer, + bufidx * sizeof(*buffer))) + error = XFS_ERROR(EFAULT); + else + *count += bufidx; + } + *lastino = XFS_AGINO_TO_INO(mp, agno, agino); + } + kmem_free(buffer, bcount * sizeof(*buffer)); + if (cur) + xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR)); + if (agbp) + xfs_buf_relse(agbp); + return error; +} diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h new file mode 100644 index 00000000000..2be9d1805ab --- /dev/null +++ b/fs/xfs/xfs_itable.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_ITABLE_H__ +#define __XFS_ITABLE_H__ + +/* + * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat + * structures (by the dmi library). This is a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c + */ +typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + void *private_data, + xfs_daddr_t bno, + int *ubused, + void *dip, + int *stat); + +/* + * Values for stat return value. + */ +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 + +/* + * Values for bulkstat flag argument. + */ +#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ +#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ +#define BULKSTAT_FG_VFSLOCKED 0x4 /* Already have vfs lock */ + +/* + * Return stat information in bulk (by-inode) for the filesystem. + */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + void *private_data, /* private data for formatter */ + size_t statstruct_size,/* sizeof struct that we're filling */ + char __user *ubuffer,/* buffer with inode stats */ + int flags, /* flag to control access method */ + int *done); /* 1 if there're more stats to get */ + +int +xfs_bulkstat_single( + xfs_mount_t *mp, + xfs_ino_t *lastinop, + char __user *buffer, + int *done); + +int +xfs_bulkstat_one( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + void *private_data, + xfs_daddr_t bno, + int *ubused, + void *dibuff, + int *stat); + +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *last, /* last inode returned */ + int *count, /* size of buffer/count returned */ + xfs_inogrp_t __user *buffer);/* buffer with inode info */ + +#endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c new file mode 100644 index 00000000000..092d5fb096b --- /dev/null +++ b/fs/xfs/xfs_log.c @@ -0,0 +1,3560 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * High level interface routines for log manager + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_alloc_btree.h" +#include "xfs_log_recover.h" +#include "xfs_bit.h" +#include "xfs_rw.h" +#include "xfs_trans_priv.h" + + +#define xlog_write_adv_cnt(ptr, len, off, bytes) \ + { (ptr) += (bytes); \ + (len) -= (bytes); \ + (off) += (bytes);} + +/* Local miscellaneous function prototypes */ +STATIC int xlog_bdstrat_cb(struct xfs_buf *); +STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, + xlog_in_core_t **, xfs_lsn_t *); +STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks); +STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); +STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); +STATIC void xlog_unalloc_log(xlog_t *log); +STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], + int nentries, xfs_log_ticket_t tic, + xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, + uint flags); + +/* local state machine functions */ +STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); +STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); +STATIC int xlog_state_get_iclog_space(xlog_t *log, + int len, + xlog_in_core_t **iclog, + xlog_ticket_t *ticket, + int *continued_write, + int *logoffsetp); +STATIC void xlog_state_put_ticket(xlog_t *log, + xlog_ticket_t *tic); +STATIC int xlog_state_release_iclog(xlog_t *log, + xlog_in_core_t *iclog); +STATIC void xlog_state_switch_iclogs(xlog_t *log, + xlog_in_core_t *iclog, + int eventual_size); +STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags); +STATIC int xlog_state_sync_all(xlog_t *log, uint flags); +STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); + +/* local functions to manipulate grant head */ +STATIC int xlog_grant_log_space(xlog_t *log, + xlog_ticket_t *xtic); +STATIC void xlog_grant_push_ail(xfs_mount_t *mp, + int need_bytes); +STATIC void xlog_regrant_reserve_log_space(xlog_t *log, + xlog_ticket_t *ticket); +STATIC int xlog_regrant_write_log_space(xlog_t *log, + xlog_ticket_t *ticket); +STATIC void xlog_ungrant_log_space(xlog_t *log, + xlog_ticket_t *ticket); + + +/* local ticket functions */ +STATIC void xlog_state_ticket_alloc(xlog_t *log); +STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, + int unit_bytes, + int count, + char clientid, + uint flags); +STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket); + +/* local debug functions */ +#if defined(DEBUG) && !defined(XLOG_NOLOG) +STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); +STATIC void xlog_verify_grant_head(xlog_t *log, int equals); +STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, + int count, boolean_t syncing); +STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, + xfs_lsn_t tail_lsn); +#else +#define xlog_verify_dest_ptr(a,b) +#define xlog_verify_grant_head(a,b) +#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_tail_lsn(a,b,c) +#endif + +int xlog_iclogs_empty(xlog_t *log); + +#ifdef DEBUG +int xlog_do_error = 0; +int xlog_req_num = 0; +int xlog_error_mod = 33; +#endif + +#define XLOG_FORCED_SHUTDOWN(log) (log->l_flags & XLOG_IO_ERROR) + +/* + * 0 => disable log manager + * 1 => enable log manager + * 2 => enable log manager and log debugging + */ +#if defined(XLOG_NOLOG) || defined(DEBUG) +int xlog_debug = 1; +xfs_buftarg_t *xlog_target; +#endif + +#if defined(XFS_LOG_TRACE) + +void +xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) +{ + if (! log->l_grant_trace) { + log->l_grant_trace = ktrace_alloc(1024, KM_NOSLEEP); + if (! log->l_grant_trace) + return; + } + + ktrace_enter(log->l_grant_trace, + (void *)tic, + (void *)log->l_reserve_headq, + (void *)log->l_write_headq, + (void *)((unsigned long)log->l_grant_reserve_cycle), + (void *)((unsigned long)log->l_grant_reserve_bytes), + (void *)((unsigned long)log->l_grant_write_cycle), + (void *)((unsigned long)log->l_grant_write_bytes), + (void *)((unsigned long)log->l_curr_cycle), + (void *)((unsigned long)log->l_curr_block), + (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)), + (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)), + (void *)string, + (void *)((unsigned long)13), + (void *)((unsigned long)14), + (void *)((unsigned long)15), + (void *)((unsigned long)16)); +} + +void +xlog_trace_iclog(xlog_in_core_t *iclog, uint state) +{ + pid_t pid; + + pid = current_pid(); + + if (!iclog->ic_trace) + iclog->ic_trace = ktrace_alloc(256, KM_SLEEP); + ktrace_enter(iclog->ic_trace, + (void *)((unsigned long)state), + (void *)((unsigned long)pid), + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0, + (void *)0); +} + +#else +#define xlog_trace_loggrant(log,tic,string) +#define xlog_trace_iclog(iclog,state) +#endif /* XFS_LOG_TRACE */ + +/* + * NOTES: + * + * 1. currblock field gets updated at startup and after in-core logs + * marked as with WANT_SYNC. + */ + +/* + * This routine is called when a user of a log manager ticket is done with + * the reservation. If the ticket was ever used, then a commit record for + * the associated transaction is written out as a log operation header with + * no data. The flag XLOG_TIC_INITED is set when the first write occurs with + * a given ticket. If the ticket was one with a permanent reservation, then + * a few operations are done differently. Permanent reservation tickets by + * default don't release the reservation. They just commit the current + * transaction with the belief that the reservation is still needed. A flag + * must be passed in before permanent reservations are actually released. + * When these type of tickets are not released, they need to be set into + * the inited state again. By doing this, a start record will be written + * out when the next write occurs. + */ +xfs_lsn_t +xfs_log_done(xfs_mount_t *mp, + xfs_log_ticket_t xtic, + void **iclog, + uint flags) +{ + xlog_t *log = mp->m_log; + xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; + xfs_lsn_t lsn = 0; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) + return 0; +#endif + + if (XLOG_FORCED_SHUTDOWN(log) || + /* + * If nothing was ever written, don't write out commit record. + * If we get an error, just continue and give back the log ticket. + */ + (((ticket->t_flags & XLOG_TIC_INITED) == 0) && + (xlog_commit_record(mp, ticket, + (xlog_in_core_t **)iclog, &lsn)))) { + lsn = (xfs_lsn_t) -1; + if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { + flags |= XFS_LOG_REL_PERM_RESERV; + } + } + + + if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || + (flags & XFS_LOG_REL_PERM_RESERV)) { + /* + * Release ticket if not permanent reservation or a specifc + * request has been made to release a permanent reservation. + */ + xlog_ungrant_log_space(log, ticket); + xlog_state_put_ticket(log, ticket); + } else { + xlog_regrant_reserve_log_space(log, ticket); + } + + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ + if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) && + (flags & XFS_LOG_REL_PERM_RESERV) == 0) + ticket->t_flags |= XLOG_TIC_INITED; + + return lsn; +} /* xfs_log_done */ + + +/* + * Force the in-core log to disk. If flags == XFS_LOG_SYNC, + * the force is done synchronously. + * + * Asynchronous forces are implemented by setting the WANT_SYNC + * bit in the appropriate in-core log and then returning. + * + * Synchronous forces are implemented with a semaphore. All callers + * to force a given lsn to disk will wait on a semaphore attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * semaphore. + */ +int +xfs_log_force(xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int rval; + xlog_t *log = mp->m_log; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) + return 0; +#endif + + ASSERT(flags & XFS_LOG_FORCE); + + XFS_STATS_INC(xs_log_force); + + if ((log->l_flags & XLOG_IO_ERROR) == 0) { + if (lsn == 0) + rval = xlog_state_sync_all(log, flags); + else + rval = xlog_state_sync(log, lsn, flags); + } else { + rval = XFS_ERROR(EIO); + } + + return rval; + +} /* xfs_log_force */ + +/* + * Attaches a new iclog I/O completion callback routine during + * transaction commit. If the log is in error state, a non-zero + * return code is handed back and the caller is responsible for + * executing the callback at an appropriate time. + */ +int +xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ + void *iclog_hndl, /* iclog to hang callback off */ + xfs_log_callback_t *cb) +{ + xlog_t *log = mp->m_log; + xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; + int abortflg, spl; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) + return 0; +#endif + cb->cb_next = NULL; + spl = LOG_LOCK(log); + abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); + if (!abortflg) { + ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || + (iclog->ic_state == XLOG_STATE_WANT_SYNC)); + cb->cb_next = NULL; + *(iclog->ic_callback_tail) = cb; + iclog->ic_callback_tail = &(cb->cb_next); + } + LOG_UNLOCK(log, spl); + return abortflg; +} /* xfs_log_notify */ + +int +xfs_log_release_iclog(xfs_mount_t *mp, + void *iclog_hndl) +{ + xlog_t *log = mp->m_log; + xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; + + if (xlog_state_release_iclog(log, iclog)) { + xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); + return(EIO); + } + + return 0; +} + +/* + * 1. Reserve an amount of on-disk log space and return a ticket corresponding + * to the reservation. + * 2. Potentially, push buffers at tail of log to disk. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve(xfs_mount_t *mp, + int unit_bytes, + int cnt, + xfs_log_ticket_t *ticket, + __uint8_t client, + uint flags) +{ + xlog_t *log = mp->m_log; + xlog_ticket_t *internal_ticket; + int retval; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) + return 0; +#endif + retval = 0; + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + ASSERT((flags & XFS_LOG_NOSLEEP) == 0); + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + if (*ticket != NULL) { + ASSERT(flags & XFS_LOG_PERM_RESERV); + internal_ticket = (xlog_ticket_t *)*ticket; + xlog_grant_push_ail(mp, internal_ticket->t_unit_res); + retval = xlog_regrant_write_log_space(log, internal_ticket); + } else { + /* may sleep if need to allocate more tickets */ + internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, + client, flags); + *ticket = internal_ticket; + xlog_grant_push_ail(mp, + (internal_ticket->t_unit_res * + internal_ticket->t_cnt)); + retval = xlog_grant_log_space(log, internal_ticket); + } + + return retval; +} /* xfs_log_reserve */ + + +/* + * Mount a log filesystem + * + * mp - ubiquitous xfs mount point structure + * log_target - buftarg of on-disk log device + * blk_offset - Start block # where block size is 512 bytes (BBSIZE) + * num_bblocks - Number of BBSIZE blocks in on-disk log + * + * Return error or zero. + */ +int +xfs_log_mount(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); + else { + cmn_err(CE_NOTE, + "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", + mp->m_fsname); + ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY); + } + + mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug) { + cmn_err(CE_NOTE, "log dev: %s", XFS_BUFTARG_NAME(log_target)); + return 0; + } +#endif + /* + * skip log recovery on a norecovery mount. pretend it all + * just worked. + */ + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + int error; + vfs_t *vfsp = XFS_MTOVFS(mp); + int readonly = (vfsp->vfs_flag & VFS_RDONLY); + + if (readonly) + vfsp->vfs_flag &= ~VFS_RDONLY; + + error = xlog_recover(mp->m_log, readonly); + + if (readonly) + vfsp->vfs_flag |= VFS_RDONLY; + if (error) { + cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); + xlog_unalloc_log(mp->m_log); + return error; + } + } + + /* Normal transactions can now occur */ + mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + + /* End mounting message in xfs_log_mount_finish */ + return 0; +} /* xfs_log_mount */ + +/* + * Finish the recovery of the file system. This is separate from + * the xfs_log_mount() call, because it depends on the code in + * xfs_mountfs() to read in the root and real-time bitmap inodes + * between calling xfs_log_mount() and here. + * + * mp - ubiquitous xfs mount point structure + */ +int +xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) +{ + int error; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + error = xlog_recover_finish(mp->m_log, mfsi_flags); + else { + error = 0; + ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY); + } + + return error; +} + +/* + * Unmount processing for the log. + */ +int +xfs_log_unmount(xfs_mount_t *mp) +{ + int error; + + error = xfs_log_unmount_write(mp); + xfs_log_unmount_dealloc(mp); + return (error); +} + +/* + * Final log writes as part of unmount. + * + * Mark the filesystem clean as unmount happens. Note that during relocation + * this routine needs to be executed as part of source-bag while the + * deallocation must not be done until source-end. + */ + +/* + * Unmount record used to have a string "Unmount filesystem--" in the + * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). + * We just write the magic number now since that particular field isn't + * currently architecture converted and "nUmount" is a bit foo. + * As far as I know, there weren't any dependencies on the old behaviour. + */ + +int +xfs_log_unmount_write(xfs_mount_t *mp) +{ + xlog_t *log = mp->m_log; + xlog_in_core_t *iclog; +#ifdef DEBUG + xlog_in_core_t *first_iclog; +#endif + xfs_log_iovec_t reg[1]; + xfs_log_ticket_t tic = NULL; + xfs_lsn_t lsn; + int error; + SPLDECL(s); + + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { XLOG_UNMOUNT_TYPE, 0, 0 }; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) + return 0; +#endif + + /* + * Don't write out unmount record on read-only mounts. + * Or, if we are doing a forced umount (typically because of IO errors). + */ + if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + return 0; + + xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); + +#ifdef DEBUG + first_iclog = iclog = log->l_iclog; + do { + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } + iclog = iclog->ic_next; + } while (iclog != first_iclog); +#endif + if (! (XLOG_FORCED_SHUTDOWN(log))) { + reg[0].i_addr = (void*)&magic; + reg[0].i_len = sizeof(magic); + + error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); + if (!error) { + /* remove inited flag */ + ((xlog_ticket_t *)tic)->t_flags = 0; + error = xlog_write(mp, reg, 1, tic, &lsn, + NULL, XLOG_UNMOUNT_TRANS); + /* + * At this point, we're umounting anyway, + * so there's no point in transitioning log state + * to IOERROR. Just continue... + */ + } + + if (error) { + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_log_unmount: unmount record failed"); + } + + + s = LOG_LOCK(log); + iclog = log->l_iclog; + iclog->ic_refcnt++; + LOG_UNLOCK(log, s); + xlog_state_want_sync(log, iclog); + (void) xlog_state_release_iclog(log, iclog); + + s = LOG_LOCK(log); + if (!(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY)) { + if (!XLOG_FORCED_SHUTDOWN(log)) { + sv_wait(&iclog->ic_forcesema, PMEM, + &log->l_icloglock, s); + } else { + LOG_UNLOCK(log, s); + } + } else { + LOG_UNLOCK(log, s); + } + if (tic) + xlog_state_put_ticket(log, tic); + } else { + /* + * We're already in forced_shutdown mode, couldn't + * even attempt to write out the unmount transaction. + * + * Go through the motions of sync'ing and releasing + * the iclog, even though no I/O will actually happen, + * we need to wait for other log I/O's that may already + * be in progress. Do this as a separate section of + * code so we'll know if we ever get stuck here that + * we're in this odd situation of trying to unmount + * a file system that went into forced_shutdown as + * the result of an unmount.. + */ + s = LOG_LOCK(log); + iclog = log->l_iclog; + iclog->ic_refcnt++; + LOG_UNLOCK(log, s); + + xlog_state_want_sync(log, iclog); + (void) xlog_state_release_iclog(log, iclog); + + s = LOG_LOCK(log); + + if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE + || iclog->ic_state == XLOG_STATE_DIRTY + || iclog->ic_state == XLOG_STATE_IOERROR) ) { + + sv_wait(&iclog->ic_forcesema, PMEM, + &log->l_icloglock, s); + } else { + LOG_UNLOCK(log, s); + } + } + + return 0; +} /* xfs_log_unmount_write */ + +/* + * Deallocate log structures for unmount/relocation. + */ +void +xfs_log_unmount_dealloc(xfs_mount_t *mp) +{ + xlog_unalloc_log(mp->m_log); +} + +/* + * Write region vectors to log. The write happens using the space reservation + * of the ticket (tic). It is not a requirement that all writes for a given + * transaction occur with one call to xfs_log_write(). + */ +int +xfs_log_write(xfs_mount_t * mp, + xfs_log_iovec_t reg[], + int nentries, + xfs_log_ticket_t tic, + xfs_lsn_t *start_lsn) +{ + int error; + xlog_t *log = mp->m_log; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) { + *start_lsn = 0; + return 0; + } +#endif + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { + xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); + } + return (error); +} /* xfs_log_write */ + + +void +xfs_log_move_tail(xfs_mount_t *mp, + xfs_lsn_t tail_lsn) +{ + xlog_ticket_t *tic; + xlog_t *log = mp->m_log; + int need_bytes, free_bytes, cycle, bytes; + SPLDECL(s); + +#if defined(DEBUG) || defined(XLOG_NOLOG) + if (!xlog_debug && xlog_target == log->l_targ) + return; +#endif + /* XXXsup tmp */ + if (XLOG_FORCED_SHUTDOWN(log)) + return; + ASSERT(!XFS_FORCED_SHUTDOWN(mp)); + + if (tail_lsn == 0) { + /* needed since sync_lsn is 64 bits */ + s = LOG_LOCK(log); + tail_lsn = log->l_last_sync_lsn; + LOG_UNLOCK(log, s); + } + + s = GRANT_LOCK(log); + + /* Also an invalid lsn. 1 implies that we aren't passing in a valid + * tail_lsn. + */ + if (tail_lsn != 1) { + log->l_tail_lsn = tail_lsn; + } + + if ((tic = log->l_write_headq)) { +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("Recovery problem"); +#endif + cycle = log->l_grant_write_cycle; + bytes = log->l_grant_write_bytes; + free_bytes = xlog_space_left(log, cycle, bytes); + do { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + + if (free_bytes < tic->t_unit_res && tail_lsn != 1) + break; + tail_lsn = 0; + free_bytes -= tic->t_unit_res; + sv_signal(&tic->t_sema); + tic = tic->t_next; + } while (tic != log->l_write_headq); + } + if ((tic = log->l_reserve_headq)) { +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("Recovery problem"); +#endif + cycle = log->l_grant_reserve_cycle; + bytes = log->l_grant_reserve_bytes; + free_bytes = xlog_space_left(log, cycle, bytes); + do { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + need_bytes = tic->t_unit_res*tic->t_cnt; + else + need_bytes = tic->t_unit_res; + if (free_bytes < need_bytes && tail_lsn != 1) + break; + tail_lsn = 0; + free_bytes -= need_bytes; + sv_signal(&tic->t_sema); + tic = tic->t_next; + } while (tic != log->l_reserve_headq); + } + GRANT_UNLOCK(log, s); +} /* xfs_log_move_tail */ + +/* + * Determine if we have a transaction that has gone to disk + * that needs to be covered. Log activity needs to be idle (no AIL and + * nothing in the iclogs). And, we need to be in the right state indicating + * something has gone out. + */ +int +xfs_log_need_covered(xfs_mount_t *mp) +{ + SPLDECL(s); + int needed = 0, gen; + xlog_t *log = mp->m_log; + vfs_t *vfsp = XFS_MTOVFS(mp); + + if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) || + (vfsp->vfs_flag & VFS_RDONLY)) + return 0; + + s = LOG_LOCK(log); + if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || + (log->l_covered_state == XLOG_STATE_COVER_NEED2)) + && !xfs_trans_first_ail(mp, &gen) + && xlog_iclogs_empty(log)) { + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else { + ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); + log->l_covered_state = XLOG_STATE_COVER_DONE2; + } + needed = 1; + } + LOG_UNLOCK(log, s); + return(needed); +} + +/****************************************************************************** + * + * local routines + * + ****************************************************************************** + */ + +/* xfs_trans_tail_ail returns 0 when there is nothing in the list. + * The log manager must keep track of the last LR which was committed + * to disk. The lsn of this LR will become the new tail_lsn whenever + * xfs_trans_tail_ail returns 0. If we don't do this, we run into + * the situation where stuff could be written into the log but nothing + * was ever in the AIL when asked. Eventually, we panic since the + * tail hits the head. + * + * We may be holding the log iclog lock upon entering this routine. + */ +xfs_lsn_t +xlog_assign_tail_lsn(xfs_mount_t *mp) +{ + xfs_lsn_t tail_lsn; + SPLDECL(s); + xlog_t *log = mp->m_log; + + tail_lsn = xfs_trans_tail_ail(mp); + s = GRANT_LOCK(log); + if (tail_lsn != 0) { + log->l_tail_lsn = tail_lsn; + } else { + tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; + } + GRANT_UNLOCK(log, s); + + return tail_lsn; +} /* xlog_assign_tail_lsn */ + + +/* + * Return the space in the log between the tail and the head. The head + * is passed in the cycle/bytes formal parms. In the special case where + * the reserve head has wrapped passed the tail, this calculation is no + * longer valid. In this case, just return 0 which means there is no space + * in the log. This works for all places where this function is called + * with the reserve head. Of course, if the write head were to ever + * wrap the tail, we should blow up. Rather than catch this case here, + * we depend on other ASSERTions in other parts of the code. XXXmiken + * + * This code also handles the case where the reservation head is behind + * the tail. The details of this case are described below, but the end + * result is that we return the size of the log as the amount of space left. + */ +int +xlog_space_left(xlog_t *log, int cycle, int bytes) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + + tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); + tail_cycle = CYCLE_LSN(log->l_tail_lsn); + if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { + free_bytes = log->l_logsize - (bytes - tail_bytes); + } else if ((tail_cycle + 1) < cycle) { + return 0; + } else if (tail_cycle < cycle) { + ASSERT(tail_cycle == (cycle - 1)); + free_bytes = tail_bytes - bytes; + } else { + /* + * The reservation head is behind the tail. + * In this case we just want to return the size of the + * log as the amount of space left. + */ + xfs_fs_cmn_err(CE_ALERT, log->l_mp, + "xlog_space_left: head behind tail\n" + " tail_cycle = %d, tail_bytes = %d\n" + " GH cycle = %d, GH bytes = %d", + tail_cycle, tail_bytes, cycle, bytes); + ASSERT(0); + free_bytes = log->l_logsize; + } + return free_bytes; +} /* xlog_space_left */ + + +/* + * Log function which is called when an io completes. + * + * The log manager needs its own routine, in order to control what + * happens with the buffer after the write completes. + */ +void +xlog_iodone(xfs_buf_t *bp) +{ + xlog_in_core_t *iclog; + xlog_t *l; + int aborted; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + aborted = 0; + + /* + * Some versions of cpp barf on the recursive definition of + * ic_log -> hic_fields.ic_log and expand ic_log twice when + * it is passed through two macros. Workaround broken cpp. + */ + l = iclog->ic_log; + + /* + * Race to shutdown the filesystem if we see an error. + */ + if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); + XFS_BUF_STALE(bp); + xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR); + /* + * This flag will be propagated to the trans-committed + * callback routines to let them know that the log-commit + * didn't succeed. + */ + aborted = XFS_LI_ABORTED; + } else if (iclog->ic_state & XLOG_STATE_IOERROR) { + aborted = XFS_LI_ABORTED; + } + xlog_state_done_syncing(iclog, aborted); + if (!(XFS_BUF_ISASYNC(bp))) { + /* + * Corresponding psema() will be done in bwrite(). If we don't + * vsema() here, panic. + */ + XFS_BUF_V_IODONESEMA(bp); + } +} /* xlog_iodone */ + +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want anymore new transactions to be + * started or completed afterwards. + */ +STATIC int +xlog_bdstrat_cb(struct xfs_buf *bp) +{ + xlog_in_core_t *iclog; + + iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); + + if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) { + /* note for irix bstrat will need struct bdevsw passed + * Fix the following macro if the code ever is merged + */ + XFS_bdstrat(bp); + return 0; + } + + xfs_buftrace("XLOG__BDSTRAT IOERROR", bp); + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_STALE(bp); + xfs_biodone(bp); + return (XFS_ERROR(EIO)); + + +} + +/* + * Return size of each in-core log record buffer. + * + * Low memory machines only get 2 16KB buffers. We don't want to waste + * memory here. However, all other machines get at least 2 32KB buffers. + * The number is hard coded because we don't care about the minimum + * memory size, just 32MB systems. + * + * If the filesystem blocksize is too large, we may need to choose a + * larger size since the directory code currently logs entire blocks. + */ + +STATIC void +xlog_get_iclog_buffer_size(xfs_mount_t *mp, + xlog_t *log) +{ + int size; + int xhdrs; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + /* + * When logbufs == 0, someone has disabled the log from the FSTAB + * file. This is not a documented feature. We need to set xlog_debug + * to zero (this deactivates the log) and set xlog_target to the + * appropriate device. Only one filesystem may be affected as such + * since this is just a performance hack to test what we might be able + * to get if the log were not present. + */ + if (mp->m_logbufs == 0) { + xlog_debug = 0; + xlog_target = log->l_targ; + log->l_iclog_bufs = XLOG_MIN_ICLOGS; + } else +#endif + { + /* + * This is the normal path. If m_logbufs == -1, then the + * admin has chosen to use the system defaults for logbuffers. + */ + if (mp->m_logbufs == -1) { + if (xfs_physmem <= btoc(128*1024*1024)) { + log->l_iclog_bufs = XLOG_MIN_ICLOGS; + } else if (xfs_physmem <= btoc(400*1024*1024)) { + log->l_iclog_bufs = XLOG_MED_ICLOGS; + } else { + /* 256K with 32K bufs */ + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + } + } else + log->l_iclog_bufs = mp->m_logbufs; + +#if defined(DEBUG) || defined(XLOG_NOLOG) + /* We are reactivating a filesystem after it was inactive */ + if (log->l_targ == xlog_target) { + xlog_target = NULL; + xlog_debug = 1; + } +#endif + } + + /* + * Buffer size passed in from mount system call. + */ + if (mp->m_logbsize != -1) { + size = log->l_iclog_size = mp->m_logbsize; + log->l_iclog_size_log = 0; + while (size != 1) { + log->l_iclog_size_log++; + size >>= 1; + } + + if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { + /* # headers = size / 32K + * one header holds cycles from 32K of data + */ + + xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; + if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + log->l_iclog_hsize = xhdrs << BBSHIFT; + log->l_iclog_heads = xhdrs; + } else { + ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + } + return; + } + + /* + * Special case machines that have less than 32MB of memory. + * All machines with more memory use 32KB buffers. + */ + if (xfs_physmem <= btoc(32*1024*1024)) { + /* Don't change; min configuration */ + log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */ + log->l_iclog_size_log = XLOG_RECORD_BSHIFT; + } else { + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */ + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; + } + + /* the default log size is 16k or 32k which is one header sector */ + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + + /* + * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use + * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers. + */ + if (mp->m_sb.sb_blocksize >= 16*1024) { + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; + if (mp->m_logbufs == -1) { + switch (mp->m_sb.sb_blocksize) { + case 16*1024: /* 16 KB */ + log->l_iclog_bufs = 3; + break; + case 32*1024: /* 32 KB */ + log->l_iclog_bufs = 4; + break; + case 64*1024: /* 64 KB */ + log->l_iclog_bufs = 8; + break; + default: + xlog_panic("XFS: Invalid blocksize"); + break; + } + } + } +} /* xlog_get_iclog_buffer_size */ + + +/* + * This routine initializes some of the log structure for a given mount point. + * Its primary purpose is to fill in enough, so recovery can occur. However, + * some other stuff may be filled in too. + */ +STATIC xlog_t * +xlog_alloc_log(xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + xlog_t *log; + xlog_rec_header_t *head; + xlog_in_core_t **iclogp; + xlog_in_core_t *iclog, *prev_iclog=NULL; + xfs_buf_t *bp; + int i; + int iclogsize; + + log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP); + + log->l_mp = mp; + log->l_targ = log_target; + log->l_logsize = BBTOB(num_bblks); + log->l_logBBstart = blk_offset; + log->l_logBBsize = num_bblks; + log->l_covered_state = XLOG_STATE_COVER_IDLE; + log->l_flags |= XLOG_ACTIVE_RECOVERY; + + log->l_prev_block = -1; + ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0); + /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ + log->l_last_sync_lsn = log->l_tail_lsn; + log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + log->l_grant_reserve_cycle = 1; + log->l_grant_write_cycle = 1; + + if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) { + log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; + ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); + /* for larger sector sizes, must have v2 or external log */ + ASSERT(log->l_sectbb_log == 0 || + log->l_logBBstart == 0 || + XFS_SB_VERSION_HASLOGV2(&mp->m_sb)); + ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); + } + log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; + + xlog_get_iclog_buffer_size(mp, log); + + bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + log->l_xbuf = bp; + + spinlock_init(&log->l_icloglock, "iclog"); + spinlock_init(&log->l_grant_lock, "grhead_iclog"); + initnsema(&log->l_flushsema, 0, "ic-flush"); + xlog_state_ticket_alloc(log); /* wait until after icloglock inited */ + + /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ + ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); + + iclogp = &log->l_iclog; + /* + * The amount of memory to allocate for the iclog structure is + * rather funky due to the way the structure is defined. It is + * done this way so that we can use different sizes for machines + * with different amounts of memory. See the definition of + * xlog_in_core_t in xfs_log_priv.h for details. + */ + iclogsize = log->l_iclog_size; + ASSERT(log->l_iclog_size >= 4096); + for (i=0; i < log->l_iclog_bufs; i++) { + *iclogp = (xlog_in_core_t *) + kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); + iclog = *iclogp; + iclog->hic_data = (xlog_in_core_2_t *) + kmem_zalloc(iclogsize, KM_SLEEP); + + iclog->ic_prev = prev_iclog; + prev_iclog = iclog; + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + + head = &iclog->ic_header; + memset(head, 0, sizeof(xlog_rec_header_t)); + INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM); + INT_SET(head->h_version, ARCH_CONVERT, + XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1); + INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size); + /* new fields */ + INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT); + memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + + bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); + XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); + iclog->ic_bp = bp; + + iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_log = log; + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; + + ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); + ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); + sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); + sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); + + iclogp = &iclog->ic_next; + } + *iclogp = log->l_iclog; /* complete ring */ + log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + + return log; +} /* xlog_alloc_log */ + + +/* + * Write out the commit record of a transaction associated with the given + * ticket. Return the lsn of the commit record. + */ +STATIC int +xlog_commit_record(xfs_mount_t *mp, + xlog_ticket_t *ticket, + xlog_in_core_t **iclog, + xfs_lsn_t *commitlsnp) +{ + int error; + xfs_log_iovec_t reg[1]; + + reg[0].i_addr = NULL; + reg[0].i_len = 0; + + ASSERT_ALWAYS(iclog); + if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, + iclog, XLOG_COMMIT_TRANS))) { + xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); + } + return (error); +} /* xlog_commit_record */ + + +/* + * Push on the buffer cache code if we ever use more than 75% of the on-disk + * log space. This code pushes on the lsn which would supposedly free up + * the 25% which we want to leave free. We may need to adopt a policy which + * pushes on an lsn which is further along in the log once we reach the high + * water mark. In this manner, we would be creating a low water mark. + */ +void +xlog_grant_push_ail(xfs_mount_t *mp, + int need_bytes) +{ + xlog_t *log = mp->m_log; /* pointer to the log */ + xfs_lsn_t tail_lsn; /* lsn of the log tail */ + xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ + int free_blocks; /* free blocks left to write to */ + int free_bytes; /* free bytes left to write to */ + int threshold_block; /* block in lsn we'd like to be at */ + int threshold_cycle; /* lsn cycle we'd like to be at */ + int free_threshold; + SPLDECL(s); + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + s = GRANT_LOCK(log); + free_bytes = xlog_space_left(log, + log->l_grant_reserve_cycle, + log->l_grant_reserve_bytes); + tail_lsn = log->l_tail_lsn; + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks < free_threshold) { + threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; + threshold_cycle = CYCLE_LSN(tail_lsn); + if (threshold_block >= log->l_logBBsize) { + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; + } + ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle, + threshold_block); + + /* Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. + */ + if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) + threshold_lsn = log->l_last_sync_lsn; + } + GRANT_UNLOCK(log, s); + + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. + */ + if (threshold_lsn && + !XLOG_FORCED_SHUTDOWN(log)) + xfs_trans_push_ail(mp, threshold_lsn); +} /* xlog_grant_push_ail */ + + +/* + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * fashion. Previously, we should have moved the current iclog + * ptr in the log to point to the next available iclog. This allows further + * write to continue while this code syncs out an iclog ready to go. + * Before an in-core log can be written out, the data section must be scanned + * to save away the 1st word of each BBSIZE block into the header. We replace + * it with the current cycle count. Each BBSIZE block is tagged with the + * cycle count because there in an implicit assumption that drives will + * guarantee that entire 512 byte blocks get written at once. In other words, + * we can't have part of a 512 byte block written and part not written. By + * tagging each block, we will know which blocks are valid when recovering + * after an unclean shutdown. + * + * This routine is single threaded on the iclog. No other thread can be in + * this routine with the same iclog. Changing contents of iclog can there- + * fore be done without grabbing the state machine lock. Updating the global + * log will require grabbing the lock though. + * + * The entire log manager uses a logical block numbering scheme. Only + * log_sync (and then only bwrite()) know about the fact that the log may + * not start with block zero on a given device. The log block start offset + * is added immediately before calling bwrite(). + */ + +int +xlog_sync(xlog_t *log, + xlog_in_core_t *iclog) +{ + xfs_caddr_t dptr; /* pointer to byte sized element */ + xfs_buf_t *bp; + int i, ops; + uint count; /* byte count of bwrite */ + uint count_init; /* initial count before roundup */ + int roundoff; /* roundoff to BB or stripe */ + int split = 0; /* split write into two regions */ + int error; + SPLDECL(s); + int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb); + + XFS_STATS_INC(xs_log_writes); + ASSERT(iclog->ic_refcnt == 0); + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + roundoff = count - count_init; + ASSERT(roundoff >= 0); + ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && + roundoff < log->l_mp->m_sb.sb_logsunit) + || + (log->l_mp->m_sb.sb_logsunit <= 1 && + roundoff < BBTOB(1))); + + /* move grant heads by roundoff in sync */ + s = GRANT_LOCK(log); + XLOG_GRANT_ADD_SPACE(log, roundoff, 'w'); + XLOG_GRANT_ADD_SPACE(log, roundoff, 'r'); + GRANT_UNLOCK(log, s); + + /* put cycle number in every block */ + xlog_pack_data(log, iclog, roundoff); + + /* real byte length */ + if (v2) { + INT_SET(iclog->ic_header.h_len, + ARCH_CONVERT, + iclog->ic_offset + roundoff); + } else { + INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset); + } + + /* put ops count in correct order */ + ops = iclog->ic_header.h_num_logops; + INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); + + bp = iclog->ic_bp; + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); + + XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + + /* Do we need to split this write into 2 parts? */ + if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); + count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); + iclog->ic_bwritecnt = 2; /* split into 2 writes */ + } else { + iclog->ic_bwritecnt = 1; + } + XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); + XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_BUSY(bp); + XFS_BUF_ASYNC(bp); + /* + * Do a disk write cache flush for the log block. + * This is a bit of a sledgehammer, it would be better + * to use a tag barrier here that just prevents reordering. + * It may not be needed to flush the first split block in the log wrap + * case, but do it anyways to be safe -AK + */ + if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH)) + XFS_BUF_FLUSH(bp); + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + xlog_verify_iclog(log, iclog, count, B_TRUE); + + /* account for log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* + * Don't call xfs_bwrite here. We do log-syncs even when the filesystem + * is shutting down. + */ + XFS_BUF_WRITE(bp); + + if ((error = XFS_bwrite(bp))) { + xfs_ioerror_alert("xlog_sync", log->l_mp, bp, + XFS_BUF_ADDR(bp)); + return (error); + } + if (split) { + bp = iclog->ic_log->l_xbuf; + ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == + (unsigned long)1); + XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); + XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ + XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ + (__psint_t)count), split); + XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_BUSY(bp); + XFS_BUF_ASYNC(bp); + if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH)) + XFS_BUF_FLUSH(bp); + dptr = XFS_BUF_PTR(bp); + /* + * Bump the cycle numbers at the start of each block + * since this part of the buffer is at the start of + * a new cycle. Watch out for the header magic number + * case, though. + */ + for (i=0; il_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + /* account for internal log which does't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + XFS_BUF_WRITE(bp); + if ((error = XFS_bwrite(bp))) { + xfs_ioerror_alert("xlog_sync (split)", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return (error); + } + } + return (0); +} /* xlog_sync */ + + +/* + * Unallocate a log structure + */ +void +xlog_unalloc_log(xlog_t *log) +{ + xlog_in_core_t *iclog, *next_iclog; + xlog_ticket_t *tic, *next_tic; + int i; + + + iclog = log->l_iclog; + for (i=0; il_iclog_bufs; i++) { + sv_destroy(&iclog->ic_forcesema); + sv_destroy(&iclog->ic_writesema); + xfs_buf_free(iclog->ic_bp); +#ifdef XFS_LOG_TRACE + if (iclog->ic_trace != NULL) { + ktrace_free(iclog->ic_trace); + } +#endif + next_iclog = iclog->ic_next; + kmem_free(iclog->hic_data, log->l_iclog_size); + kmem_free(iclog, sizeof(xlog_in_core_t)); + iclog = next_iclog; + } + freesema(&log->l_flushsema); + spinlock_destroy(&log->l_icloglock); + spinlock_destroy(&log->l_grant_lock); + + /* XXXsup take a look at this again. */ + if ((log->l_ticket_cnt != log->l_ticket_tcnt) && + !XLOG_FORCED_SHUTDOWN(log)) { + xfs_fs_cmn_err(CE_WARN, log->l_mp, + "xlog_unalloc_log: (cnt: %d, total: %d)", + log->l_ticket_cnt, log->l_ticket_tcnt); + /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */ + + } else { + tic = log->l_unmount_free; + while (tic) { + next_tic = tic->t_next; + kmem_free(tic, NBPP); + tic = next_tic; + } + } + xfs_buf_free(log->l_xbuf); +#ifdef XFS_LOG_TRACE + if (log->l_trace != NULL) { + ktrace_free(log->l_trace); + } + if (log->l_grant_trace != NULL) { + ktrace_free(log->l_grant_trace); + } +#endif + log->l_mp->m_log = NULL; + kmem_free(log, sizeof(xlog_t)); +} /* xlog_unalloc_log */ + +/* + * Update counters atomically now that memcpy is done. + */ +/* ARGSUSED */ +static inline void +xlog_state_finish_copy(xlog_t *log, + xlog_in_core_t *iclog, + int record_cnt, + int copy_bytes) +{ + SPLDECL(s); + + s = LOG_LOCK(log); + + iclog->ic_header.h_num_logops += record_cnt; + iclog->ic_offset += copy_bytes; + + LOG_UNLOCK(log, s); +} /* xlog_state_finish_copy */ + + + + +/* + * Write some region out to in-core log + * + * This will be called when writing externally provided regions or when + * writing out a commit record for a given transaction. + * + * General algorithm: + * 1. Find total length of this write. This may include adding to the + * lengths passed in. + * 2. Check whether we violate the tickets reservation. + * 3. While writing to this iclog + * A. Reserve as much space in this iclog as can get + * B. If this is first write, save away start lsn + * C. While writing this region: + * 1. If first write of transaction, write start record + * 2. Write log operation header (header per region) + * 3. Find out if we can fit entire region into this iclog + * 4. Potentially, verify destination memcpy ptr + * 5. Memcpy (partial) region + * 6. If partial copy, release iclog; otherwise, continue + * copying more regions into current iclog + * 4. Mark want sync bit (in simulation mode) + * 5. Release iclog for potential flush to on-disk log. + * + * ERRORS: + * 1. Panic if reservation is overrun. This should never happen since + * reservation amounts are generated internal to the filesystem. + * NOTES: + * 1. Tickets are single threaded data structures. + * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the + * syncing routine. When a single log_write region needs to span + * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set + * on all log operation writes which don't contain the end of the + * region. The XLOG_END_TRANS bit is used for the in-core log + * operation which contains the end of the continued log_write region. + * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, + * we don't really know exactly how much space will be used. As a result, + * we don't update ic_offset until the end when we know exactly how many + * bytes have been written out. + */ +int +xlog_write(xfs_mount_t * mp, + xfs_log_iovec_t reg[], + int nentries, + xfs_log_ticket_t tic, + xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, + uint flags) +{ + xlog_t *log = mp->m_log; + xlog_ticket_t *ticket = (xlog_ticket_t *)tic; + xlog_op_header_t *logop_head; /* ptr to log operation header */ + xlog_in_core_t *iclog; /* ptr to current in-core log */ + __psint_t ptr; /* copy address into data region */ + int len; /* # xlog_write() bytes 2 still copy */ + int index; /* region index currently copying */ + int log_offset; /* offset (from 0) into data region */ + int start_rec_copy; /* # bytes to copy for start record */ + int partial_copy; /* did we split a region? */ + int partial_copy_len;/* # bytes copied if split region */ + int need_copy; /* # bytes need to memcpy this region */ + int copy_len; /* # bytes actually memcpy'ing */ + int copy_off; /* # bytes from entry start */ + int contwr; /* continued write of in-core log? */ + int error; + int record_cnt = 0, data_cnt = 0; + + partial_copy_len = partial_copy = 0; + + /* Calculate potential maximum space. Each region gets its own + * xlog_op_header_t and may need to be double word aligned. + */ + len = 0; + if (ticket->t_flags & XLOG_TIC_INITED) /* acct for start rec of xact */ + len += sizeof(xlog_op_header_t); + + for (index = 0; index < nentries; index++) { + len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ + len += reg[index].i_len; + } + contwr = *start_lsn = 0; + + if (ticket->t_curr_res < len) { +#ifdef DEBUG + xlog_panic( + "xfs_log_write: reservation ran out. Need to up reservation"); +#else + /* Customer configurable panic */ + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + /* If we did not panic, shutdown the filesystem */ + xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); +#endif + } else + ticket->t_curr_res -= len; + + for (index = 0; index < nentries; ) { + if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset))) + return (error); + + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); + + /* start_lsn is the first lsn written to. That's all we need. */ + if (! *start_lsn) + *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + + /* This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (index < nentries) { + ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); + ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); + start_rec_copy = 0; + + /* If first write for transaction, insert start record. + * We can't be trying to commit if we are inited. We can't + * have any "partial_copy" if we are inited. + */ + if (ticket->t_flags & XLOG_TIC_INITED) { + logop_head = (xlog_op_header_t *)ptr; + INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); + logop_head->oh_clientid = ticket->t_clientid; + logop_head->oh_len = 0; + logop_head->oh_flags = XLOG_START_TRANS; + logop_head->oh_res2 = 0; + ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ + record_cnt++; + + start_rec_copy = sizeof(xlog_op_header_t); + xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); + } + + /* Copy log operation header directly into data section */ + logop_head = (xlog_op_header_t *)ptr; + INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); + logop_head->oh_clientid = ticket->t_clientid; + logop_head->oh_res2 = 0; + + /* header copied directly */ + xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); + + /* are we copying a commit or unmount record? */ + logop_head->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client + * ids. This makes sure that XFS doesn't generate them on. + * Turn this into an EIO and shut down the filesystem. + */ + switch (logop_head->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_fs_cmn_err(CE_WARN, mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + logop_head->oh_clientid, tic); + return XFS_ERROR(EIO); + } + + /* Partial write last time? => (partial_copy != 0) + * need_copy is the amount we'd like to copy if everything could + * fit in the current memcpy. + */ + need_copy = reg[index].i_len - partial_copy_len; + + copy_off = partial_copy_len; + if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ + INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy); + if (partial_copy) + logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + partial_copy_len = partial_copy = 0; + } else { /* partial write */ + copy_len = iclog->ic_size - log_offset; + INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len); + logop_head->oh_flags |= XLOG_CONTINUE_TRANS; + if (partial_copy) + logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; + partial_copy_len += copy_len; + partial_copy++; + len += sizeof(xlog_op_header_t); /* from splitting of region */ + /* account for new log op header */ + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); + xlog_write_adv_cnt(ptr, len, log_offset, copy_len); + + /* make copy_len total bytes copied, including headers */ + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + if (partial_copy) { /* copied partial region */ + /* already marked WANT_SYNC by xlog_state_get_iclog_space */ + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + record_cnt = data_cnt = 0; + if ((error = xlog_state_release_iclog(log, iclog))) + return (error); + break; /* don't increment index */ + } else { /* copied entire region */ + index++; + partial_copy_len = partial_copy = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + record_cnt = data_cnt = 0; + xlog_state_want_sync(log, iclog); + if (commit_iclog) { + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } else if ((error = xlog_state_release_iclog(log, iclog))) + return (error); + if (index == nentries) + return 0; /* we are done */ + else + break; + } + } /* if (partial_copy) */ + } /* while (index < nentries) */ + } /* for (index = 0; index < nentries; ) */ + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (commit_iclog) { + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + return 0; + } + return (xlog_state_release_iclog(log, iclog)); +} /* xlog_write */ + + +/***************************************************************************** + * + * State Machine functions + * + ***************************************************************************** + */ + +/* Clean iclogs starting from the head. This ordering must be + * maintained, so an iclog doesn't become ACTIVE beyond one that + * is SYNCING. This is also required to maintain the notion that we use + * a counting semaphore to hold off would be writers to the log when every + * iclog is trying to sync to disk. + * + * State Change: DIRTY -> ACTIVE + */ +void +xlog_state_clean_log(xlog_t *log) +{ + xlog_in_core_t *iclog; + int changed = 0; + + iclog = log->l_iclog; + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) { + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_callback = NULL; /* don't need to free */ + /* + * If the number of ops in this iclog indicate it just + * contains the dummy transaction, we can + * change state into IDLE (the second time around). + * Otherwise we should change the state into + * NEED a dummy. + * We don't need to cover the dummy. + */ + if (!changed && + (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) { + changed = 1; + } else { + /* + * We have two dirty iclogs so start over + * This could also be num of ops indicates + * this is not the dummy going out. + */ + changed = 2; + } + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) + /* do nothing */; + else + break; /* stop cleaning */ + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + + /* log is locked when we are called */ + /* + * Change state for the dummy log recording. + * We usually go to NEED. But we go to NEED2 if the changed indicates + * we are done writing the dummy record. + * If we are done with the second dummy recored (DONE2), then + * we go to IDLE. + */ + if (changed) { + switch (log->l_covered_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_NEED2; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE2: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_IDLE; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + default: + ASSERT(0); + } + } +} /* xlog_state_clean_log */ + +STATIC xfs_lsn_t +xlog_get_lowest_lsn( + xlog_t *log) +{ + xlog_in_core_t *lsn_log; + xfs_lsn_t lowest_lsn, lsn; + + lsn_log = log->l_iclog; + lowest_lsn = 0; + do { + if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { + lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT); + if ((lsn && !lowest_lsn) || + (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + lowest_lsn = lsn; + } + } + lsn_log = lsn_log->ic_next; + } while (lsn_log != log->l_iclog); + return(lowest_lsn); +} + + +STATIC void +xlog_state_do_callback( + xlog_t *log, + int aborted, + xlog_in_core_t *ciclog) +{ + xlog_in_core_t *iclog; + xlog_in_core_t *first_iclog; /* used to know when we've + * processed all iclogs once */ + xfs_log_callback_t *cb, *cb_next; + int flushcnt = 0; + xfs_lsn_t lowest_lsn; + int ioerrors; /* counter: iclogs with errors */ + int loopdidcallbacks; /* flag: inner loop did callbacks*/ + int funcdidcallbacks; /* flag: function did callbacks */ + int repeats; /* for issuing console warnings if + * looping too many times */ + SPLDECL(s); + + s = LOG_LOCK(log); + first_iclog = iclog = log->l_iclog; + ioerrors = 0; + funcdidcallbacks = 0; + repeats = 0; + + do { + /* + * Scan all iclogs starting with the one pointed to by the + * log. Reset this starting point each time the log is + * unlocked (during callbacks). + * + * Keep looping through iclogs until one full pass is made + * without running any callbacks. + */ + first_iclog = log->l_iclog; + iclog = log->l_iclog; + loopdidcallbacks = 0; + repeats++; + + do { + + /* skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & + (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + iclog = iclog->ic_next; + continue; + } + + /* + * Between marking a filesystem SHUTDOWN and stopping + * the log, we do flush all iclogs to disk (if there + * wasn't a log I/O error). So, we do want things to + * go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Can only perform callbacks in order. Since + * this iclog is not in the DONE_SYNC/ + * DO_CALLBACK state, we skip the rest and + * just try to clean up. If we set our iclog + * to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the + * CALLBACK and the state cannot change since + * we are holding the LOG_LOCK. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | + XLOG_STATE_DO_CALLBACK))) { + if (ciclog && (ciclog->ic_state == + XLOG_STATE_DONE_SYNC)) { + ciclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + break; + } + /* + * We now have an iclog that is in either the + * DO_CALLBACK or DONE_SYNC states. The other + * states (WANT_SYNC, SYNCING, or CALLBACK were + * caught by the above if and are going to + * clean (i.e. we aren't doing their callbacks) + * see the above if. + */ + + /* + * We will do one more check here to see if we + * have chased our tail around. + */ + + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && ( + XFS_LSN_CMP( + lowest_lsn, + INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) + )<0)) { + iclog = iclog->ic_next; + continue; /* Leave this iclog for + * another thread */ + } + + iclog->ic_state = XLOG_STATE_CALLBACK; + + LOG_UNLOCK(log, s); + + /* l_last_sync_lsn field protected by + * GRANT_LOCK. Don't worry about iclog's lsn. + * No one else can be here except us. + */ + s = GRANT_LOCK(log); + ASSERT(XFS_LSN_CMP( + log->l_last_sync_lsn, + INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) + )<=0); + log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + GRANT_UNLOCK(log, s); + + /* + * Keep processing entries in the callback list + * until we come around and it is empty. We + * need to atomically see that the list is + * empty and change the state to DIRTY so that + * we don't miss any more callbacks being added. + */ + s = LOG_LOCK(log); + } else { + ioerrors++; + } + cb = iclog->ic_callback; + + while (cb != 0) { + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_callback = NULL; + LOG_UNLOCK(log, s); + + /* perform callbacks in the order given */ + for (; cb != 0; cb = cb_next) { + cb_next = cb->cb_next; + cb->cb_func(cb->cb_arg, aborted); + } + s = LOG_LOCK(log); + cb = iclog->ic_callback; + } + + loopdidcallbacks++; + funcdidcallbacks++; + + ASSERT(iclog->ic_callback == 0); + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) + iclog->ic_state = XLOG_STATE_DIRTY; + + /* + * Transition from DIRTY to ACTIVE if applicable. + * NOP if STATE_IOERROR. + */ + xlog_state_clean_log(log); + + /* wake up threads waiting in xfs_log_force() */ + sv_broadcast(&iclog->ic_forcesema); + + iclog = iclog->ic_next; + } while (first_iclog != iclog); + if (repeats && (repeats % 10) == 0) { + xfs_fs_cmn_err(CE_WARN, log->l_mp, + "xlog_state_do_callback: looping %d", repeats); + } + } while (!ioerrors && loopdidcallbacks); + + /* + * make one last gasp attempt to see if iclogs are being left in + * limbo.. + */ +#ifdef DEBUG + if (funcdidcallbacks) { + first_iclog = iclog = log->l_iclog; + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. + * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * LOG_LOCK + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); + } +#endif + + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { + flushcnt = log->l_flushcnt; + log->l_flushcnt = 0; + } + LOG_UNLOCK(log, s); + while (flushcnt--) + vsema(&log->l_flushsema); +} /* xlog_state_do_callback */ + + +/* + * Finish transitioning this iclog to the dirty state. + * + * Make sure that we completely execute this routine only when this is + * the last call to the iclog. There is a good chance that iclog flushes, + * when we reach the end of the physical log, get turned into 2 separate + * calls to bwrite. Hence, one iclog flush could generate two calls to this + * routine. By using the reference count bwritecnt, we guarantee that only + * the second completion goes through. + * + * Callbacks could take time, so they are done outside the scope of the + * global state machine log lock. Assume that the calls to cvsema won't + * take a long time. At least we know it won't sleep. + */ +void +xlog_state_done_syncing( + xlog_in_core_t *iclog, + int aborted) +{ + xlog_t *log = iclog->ic_log; + SPLDECL(s); + + s = LOG_LOCK(log); + + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_IOERROR); + ASSERT(iclog->ic_refcnt == 0); + ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); + + + /* + * If we got an error, either on the first buffer, or in the case of + * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, + * and none should ever be attempted to be written to disk + * again. + */ + if (iclog->ic_state != XLOG_STATE_IOERROR) { + if (--iclog->ic_bwritecnt == 1) { + LOG_UNLOCK(log, s); + return; + } + iclog->ic_state = XLOG_STATE_DONE_SYNC; + } + + /* + * Someone could be sleeping prior to writing out the next + * iclog buffer, we wake them all, one will get to do the + * I/O, the others get to wait for the result. + */ + sv_broadcast(&iclog->ic_writesema); + LOG_UNLOCK(log, s); + xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ +} /* xlog_state_done_syncing */ + + +/* + * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must + * sleep. The flush semaphore is set to the number of in-core buffers and + * decremented around disk syncing. Therefore, if all buffers are syncing, + * this semaphore will cause new writes to sleep until a sync completes. + * Otherwise, this code just does p() followed by v(). This approximates + * a sleep/wakeup except we can't race. + * + * The in-core logs are used in a circular fashion. They are not used + * out-of-order even when an iclog past the head is free. + * + * return: + * * log_offset where xlog_write() can start writing into the in-core + * log's data space. + * * in-core log pointer to which xlog_write() should write. + * * boolean indicating this is a continued write to an in-core log. + * If this is the last write, then the in-core log's offset field + * needs to be incremented, depending on the amount of data which + * is copied. + */ +int +xlog_state_get_iclog_space(xlog_t *log, + int len, + xlog_in_core_t **iclogp, + xlog_ticket_t *ticket, + int *continued_write, + int *logoffsetp) +{ + SPLDECL(s); + int log_offset; + xlog_rec_header_t *head; + xlog_in_core_t *iclog; + int error; + +restart: + s = LOG_LOCK(log); + if (XLOG_FORCED_SHUTDOWN(log)) { + LOG_UNLOCK(log, s); + return XFS_ERROR(EIO); + } + + iclog = log->l_iclog; + if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { + log->l_flushcnt++; + LOG_UNLOCK(log, s); + xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); + XFS_STATS_INC(xs_log_noiclogs); + /* Ensure that log writes happen */ + psema(&log->l_flushsema, PINOD); + goto restart; + } + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + head = &iclog->ic_header; + + iclog->ic_refcnt++; /* prevents sync */ + log_offset = iclog->ic_offset; + + /* On the 1st write to an iclog, figure out lsn. This works + * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are + * committing to. If the offset is set, that's how many blocks + * must be written. + */ + if (log_offset == 0) { + ticket->t_curr_res -= log->l_iclog_hsize; + INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle); + ASSIGN_LSN(head->h_lsn, log); + ASSERT(log->l_curr_block >= 0); + } + + /* If there is enough room to write everything, then do it. Otherwise, + * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC + * bit is on, so this will get flushed out. Don't update ic_offset + * until you know exactly how many bytes get copied. Therefore, wait + * until later to update ic_offset. + * + * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * can fit into remaining data section. + */ + if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + + /* If I'm the only one writing to this iclog, sync it to disk */ + if (iclog->ic_refcnt == 1) { + LOG_UNLOCK(log, s); + if ((error = xlog_state_release_iclog(log, iclog))) + return (error); + } else { + iclog->ic_refcnt--; + LOG_UNLOCK(log, s); + } + goto restart; + } + + /* Do we have enough room to write the full amount in the remainder + * of this iclog? Or must we continue a write on the next iclog and + * mark this iclog as completely taken? In the case where we switch + * iclogs (to mark it taken), this particular iclog will release/sync + * to disk in xlog_write(). + */ + if (len <= iclog->ic_size - iclog->ic_offset) { + *continued_write = 0; + iclog->ic_offset += len; + } else { + *continued_write = 1; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + } + *iclogp = iclog; + + ASSERT(iclog->ic_offset <= iclog->ic_size); + LOG_UNLOCK(log, s); + + *logoffsetp = log_offset; + return 0; +} /* xlog_state_get_iclog_space */ + +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto the reserveq, it will only return after + * the needed reservation is satisfied. + */ +STATIC int +xlog_grant_log_space(xlog_t *log, + xlog_ticket_t *tic) +{ + int free_bytes; + int need_bytes; + SPLDECL(s); +#ifdef DEBUG + xfs_lsn_t tail_lsn; +#endif + + +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("grant Recovery problem"); +#endif + + /* Is there space or do we need to sleep? */ + s = GRANT_LOCK(log); + xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); + + /* something is already sleeping; insert new transaction at end */ + if (log->l_reserve_headq) { + XLOG_INS_TICKETQ(log->l_reserve_headq, tic); + xlog_trace_loggrant(log, tic, + "xlog_grant_log_space: sleep 1"); + /* + * Gotta check this before going to sleep, while we're + * holding the grant lock. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + XFS_STATS_INC(xs_sleep_logspace); + sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + /* + * If we got an error, and the filesystem is shutting down, + * we'll catch it down below. So just continue... + */ + xlog_trace_loggrant(log, tic, + "xlog_grant_log_space: wake 1"); + s = GRANT_LOCK(log); + } + if (tic->t_flags & XFS_LOG_PERM_RESERV) + need_bytes = tic->t_unit_res*tic->t_ocnt; + else + need_bytes = tic->t_unit_res; + +redo: + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, + log->l_grant_reserve_bytes); + if (free_bytes < need_bytes) { + if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) + XLOG_INS_TICKETQ(log->l_reserve_headq, tic); + xlog_trace_loggrant(log, tic, + "xlog_grant_log_space: sleep 2"); + XFS_STATS_INC(xs_sleep_logspace); + sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + + if (XLOG_FORCED_SHUTDOWN(log)) { + s = GRANT_LOCK(log); + goto error_return; + } + + xlog_trace_loggrant(log, tic, + "xlog_grant_log_space: wake 2"); + xlog_grant_push_ail(log->l_mp, need_bytes); + s = GRANT_LOCK(log); + goto redo; + } else if (tic->t_flags & XLOG_TIC_IN_Q) + XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + + /* we've got enough space */ + XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); + XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r'); +#ifdef DEBUG + tail_lsn = log->l_tail_lsn; + /* + * Check to make sure the grant write head didn't just over lap the + * tail. If the cycles are the same, we can't be overlapping. + * Otherwise, make sure that the cycles differ by exactly one and + * check the byte count. + */ + if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { + ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); + ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + } +#endif + xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); + xlog_verify_grant_head(log, 1); + GRANT_UNLOCK(log, s); + return 0; + + error_return: + if (tic->t_flags & XLOG_TIC_IN_Q) + XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back when + * the ticket/transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + GRANT_UNLOCK(log, s); + return XFS_ERROR(EIO); +} /* xlog_grant_log_space */ + + +/* + * Replenish the byte reservation required by moving the grant write head. + * + * + */ +STATIC int +xlog_regrant_write_log_space(xlog_t *log, + xlog_ticket_t *tic) +{ + SPLDECL(s); + int free_bytes, need_bytes; + xlog_ticket_t *ntic; +#ifdef DEBUG + xfs_lsn_t tail_lsn; +#endif + + tic->t_curr_res = tic->t_unit_res; + + if (tic->t_cnt > 0) + return (0); + +#ifdef DEBUG + if (log->l_flags & XLOG_ACTIVE_RECOVERY) + panic("regrant Recovery problem"); +#endif + + s = GRANT_LOCK(log); + xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); + + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + /* If there are other waiters on the queue then give them a + * chance at logspace before us. Wake up the first waiters, + * if we do not wake up all the waiters then go to sleep waiting + * for more free space, otherwise try to get some space for + * this transaction. + */ + + if ((ntic = log->l_write_headq)) { + free_bytes = xlog_space_left(log, log->l_grant_write_cycle, + log->l_grant_write_bytes); + do { + ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); + + if (free_bytes < ntic->t_unit_res) + break; + free_bytes -= ntic->t_unit_res; + sv_signal(&ntic->t_sema); + ntic = ntic->t_next; + } while (ntic != log->l_write_headq); + + if (ntic != log->l_write_headq) { + if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) + XLOG_INS_TICKETQ(log->l_write_headq, tic); + + xlog_trace_loggrant(log, tic, + "xlog_regrant_write_log_space: sleep 1"); + XFS_STATS_INC(xs_sleep_logspace); + sv_wait(&tic->t_sema, PINOD|PLTWAIT, + &log->l_grant_lock, s); + + /* If we're shutting down, this tic is already + * off the queue */ + if (XLOG_FORCED_SHUTDOWN(log)) { + s = GRANT_LOCK(log); + goto error_return; + } + + xlog_trace_loggrant(log, tic, + "xlog_regrant_write_log_space: wake 1"); + xlog_grant_push_ail(log->l_mp, tic->t_unit_res); + s = GRANT_LOCK(log); + } + } + + need_bytes = tic->t_unit_res; + +redo: + if (XLOG_FORCED_SHUTDOWN(log)) + goto error_return; + + free_bytes = xlog_space_left(log, log->l_grant_write_cycle, + log->l_grant_write_bytes); + if (free_bytes < need_bytes) { + if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) + XLOG_INS_TICKETQ(log->l_write_headq, tic); + XFS_STATS_INC(xs_sleep_logspace); + sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); + + /* If we're shutting down, this tic is already off the queue */ + if (XLOG_FORCED_SHUTDOWN(log)) { + s = GRANT_LOCK(log); + goto error_return; + } + + xlog_trace_loggrant(log, tic, + "xlog_regrant_write_log_space: wake 2"); + xlog_grant_push_ail(log->l_mp, need_bytes); + s = GRANT_LOCK(log); + goto redo; + } else if (tic->t_flags & XLOG_TIC_IN_Q) + XLOG_DEL_TICKETQ(log->l_write_headq, tic); + + XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */ +#ifdef DEBUG + tail_lsn = log->l_tail_lsn; + if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { + ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); + ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); + } +#endif + + xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); + xlog_verify_grant_head(log, 1); + GRANT_UNLOCK(log, s); + return (0); + + + error_return: + if (tic->t_flags & XLOG_TIC_IN_Q) + XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back when + * the ticket/transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + GRANT_UNLOCK(log, s); + return XFS_ERROR(EIO); +} /* xlog_regrant_write_log_space */ + + +/* The first cnt-1 times through here we don't need to + * move the grant write head because the permanent + * reservation has reserved cnt times the unit amount. + * Release part of current permanent unit reservation and + * reset current reservation to be one units worth. Also + * move grant reservation head forward. + */ +STATIC void +xlog_regrant_reserve_log_space(xlog_t *log, + xlog_ticket_t *ticket) +{ + SPLDECL(s); + + xlog_trace_loggrant(log, ticket, + "xlog_regrant_reserve_log_space: enter"); + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + s = GRANT_LOCK(log); + XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); + XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r'); + ticket->t_curr_res = ticket->t_unit_res; + xlog_trace_loggrant(log, ticket, + "xlog_regrant_reserve_log_space: sub current res"); + xlog_verify_grant_head(log, 1); + + /* just return if we still have some of the pre-reserved space */ + if (ticket->t_cnt > 0) { + GRANT_UNLOCK(log, s); + return; + } + + XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r'); + xlog_trace_loggrant(log, ticket, + "xlog_regrant_reserve_log_space: exit"); + xlog_verify_grant_head(log, 0); + GRANT_UNLOCK(log, s); + ticket->t_curr_res = ticket->t_unit_res; +} /* xlog_regrant_reserve_log_space */ + + +/* + * Give back the space left from a reservation. + * + * All the information we need to make a correct determination of space left + * is present. For non-permanent reservations, things are quite easy. The + * count should have been decremented to zero. We only need to deal with the + * space remaining in the current reservation part of the ticket. If the + * ticket contains a permanent reservation, there may be left over space which + * needs to be released. A count of N means that N-1 refills of the current + * reservation can be done before we need to ask for more space. The first + * one goes to fill up the first current reservation. Once we run out of + * space, the count will stay at zero and the only space remaining will be + * in the current reservation field. + */ +STATIC void +xlog_ungrant_log_space(xlog_t *log, + xlog_ticket_t *ticket) +{ + SPLDECL(s); + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + s = GRANT_LOCK(log); + xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); + + XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); + XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r'); + + xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); + + /* If this is a permanent reservation ticket, we may be able to free + * up more space based on the remaining count. + */ + if (ticket->t_cnt > 0) { + ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); + XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w'); + XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r'); + } + + xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); + xlog_verify_grant_head(log, 1); + GRANT_UNLOCK(log, s); + xfs_log_move_tail(log->l_mp, 1); +} /* xlog_ungrant_log_space */ + + +/* + * Atomically put back used ticket. + */ +void +xlog_state_put_ticket(xlog_t *log, + xlog_ticket_t *tic) +{ + unsigned long s; + + s = LOG_LOCK(log); + xlog_ticket_put(log, tic); + LOG_UNLOCK(log, s); +} /* xlog_state_put_ticket */ + +/* + * Flush iclog to disk if this is the last reference to the given iclog and + * the WANT_SYNC bit is set. + * + * When this function is entered, the iclog is not necessarily in the + * WANT_SYNC state. It may be sitting around waiting to get filled. + * + * + */ +int +xlog_state_release_iclog(xlog_t *log, + xlog_in_core_t *iclog) +{ + SPLDECL(s); + int sync = 0; /* do we sync? */ + + xlog_assign_tail_lsn(log->l_mp); + + s = LOG_LOCK(log); + + if (iclog->ic_state & XLOG_STATE_IOERROR) { + LOG_UNLOCK(log, s); + return XFS_ERROR(EIO); + } + + ASSERT(iclog->ic_refcnt > 0); + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_WANT_SYNC); + + if (--iclog->ic_refcnt == 0 && + iclog->ic_state == XLOG_STATE_WANT_SYNC) { + sync++; + iclog->ic_state = XLOG_STATE_SYNCING; + INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn); + xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); + /* cycle incremented when incrementing curr_block */ + } + + LOG_UNLOCK(log, s); + + /* + * We let the log lock go, so it's possible that we hit a log I/O + * error or someother SHUTDOWN condition that marks the iclog + * as XLOG_STATE_IOERROR before the bwrite. However, we know that + * this iclog has consistent data, so we ignore IOERROR + * flags after this point. + */ + if (sync) { + return xlog_sync(log, iclog); + } + return (0); + +} /* xlog_state_release_iclog */ + + +/* + * This routine will mark the current iclog in the ring as WANT_SYNC + * and move the current iclog pointer to the next iclog in the ring. + * When this routine is called from xlog_state_get_iclog_space(), the + * exact size of the iclog has not yet been determined. All we know is + * that every data block. We have run out of space in this log record. + */ +STATIC void +xlog_state_switch_iclogs(xlog_t *log, + xlog_in_core_t *iclog, + int eventual_size) +{ + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + if (!eventual_size) + eventual_size = iclog->ic_offset; + iclog->ic_state = XLOG_STATE_WANT_SYNC; + INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block); + log->l_prev_block = log->l_curr_block; + log->l_prev_cycle = log->l_curr_cycle; + + /* roll log?: ic_offset changed later */ + log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); + + /* Round up to next log-sunit */ + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + } + + if (log->l_curr_block >= log->l_logBBsize) { + log->l_curr_cycle++; + if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) + log->l_curr_cycle++; + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + } + ASSERT(iclog == log->l_iclog); + log->l_iclog = iclog->ic_next; +} /* xlog_state_switch_iclogs */ + + +/* + * Write out all data in the in-core log as of this exact moment in time. + * + * Data may be written to the in-core log during this call. However, + * we don't guarantee this data will be written out. A change from past + * implementation means this routine will *not* write out zero length LRs. + * + * Basically, we try and perform an intelligent scan of the in-core logs. + * If we determine there is no flushable data, we just return. There is no + * flushable data if: + * + * 1. the current iclog is active and has no data; the previous iclog + * is in the active or dirty state. + * 2. the current iclog is drity, and the previous iclog is in the + * active or dirty state. + * + * We may sleep (call psema) if: + * + * 1. the current iclog is not in the active nor dirty state. + * 2. the current iclog dirty, and the previous iclog is not in the + * active nor dirty state. + * 3. the current iclog is active, and there is another thread writing + * to this particular iclog. + * 4. a) the current iclog is active and has no other writers + * b) when we return from flushing out this iclog, it is still + * not in the active nor dirty state. + */ +STATIC int +xlog_state_sync_all(xlog_t *log, uint flags) +{ + xlog_in_core_t *iclog; + xfs_lsn_t lsn; + SPLDECL(s); + + s = LOG_LOCK(log); + + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + LOG_UNLOCK(log, s); + return XFS_ERROR(EIO); + } + + /* If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) { + /* + * If the head is dirty or (active and empty), then + * we need to look at the previous iclog. If the previous + * iclog is active or dirty we are done. There is nothing + * to sync out. Otherwise, we attach ourselves to the + * previous iclog and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_DIRTY || + (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto no_sleep; + else + goto maybe_sleep; + } else { + if (iclog->ic_refcnt == 0) { + /* We are the only one with access to this + * iclog. Flush it out now. There should + * be a roundoff of zero to show that someone + * has already taken care of the roundoff from + * the previous sync. + */ + iclog->ic_refcnt++; + lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + xlog_state_switch_iclogs(log, iclog, 0); + LOG_UNLOCK(log, s); + + if (xlog_state_release_iclog(log, iclog)) + return XFS_ERROR(EIO); + s = LOG_LOCK(log); + if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn && + iclog->ic_state != XLOG_STATE_DIRTY) + goto maybe_sleep; + else + goto no_sleep; + } else { + /* Someone else is writing to this iclog. + * Use its call to flush out the data. However, + * the other thread may not force out this LR, + * so we mark it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); + goto maybe_sleep; + } + } + } + + /* By the time we come around again, the iclog could've been filled + * which would give it another lsn. If we have a new lsn, just + * return because the relevant data has been flushed. + */ +maybe_sleep: + if (flags & XFS_LOG_SYNC) { + /* + * We must check if we're shutting down here, before + * we wait, while we're holding the LOG_LOCK. + * Then we check again after waking up, in case our + * sleep was disturbed by a bad news. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + LOG_UNLOCK(log, s); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + + } else { + +no_sleep: + LOG_UNLOCK(log, s); + } + return 0; +} /* xlog_state_sync_all */ + + +/* + * Used by code which implements synchronous log forces. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * If filesystem activity goes to zero, the iclog will get flushed only by + * bdflush(). + */ +int +xlog_state_sync(xlog_t *log, + xfs_lsn_t lsn, + uint flags) +{ + xlog_in_core_t *iclog; + int already_slept = 0; + SPLDECL(s); + + +try_again: + s = LOG_LOCK(log); + iclog = log->l_iclog; + + if (iclog->ic_state & XLOG_STATE_IOERROR) { + LOG_UNLOCK(log, s); + return XFS_ERROR(EIO); + } + + do { + if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + LOG_UNLOCK(log, s); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which drops + * the ref count). The state switch keeps new transaction + * commits from using this buffer. When the current commits + * finish writing into the buffer, the refcount will drop to + * zero and the buffer will go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | + XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + XFS_STATS_INC(xs_log_force_sleep); + sv_wait(&iclog->ic_prev->ic_writesema, PSWP, + &log->l_icloglock, s); + already_slept = 1; + goto try_again; + } else { + iclog->ic_refcnt++; + xlog_state_switch_iclogs(log, iclog, 0); + LOG_UNLOCK(log, s); + if (xlog_state_release_iclog(log, iclog)) + return XFS_ERROR(EIO); + s = LOG_LOCK(log); + } + } + + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + + /* + * Don't wait on the forcesema if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + LOG_UNLOCK(log, s); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + } else { /* just return */ + LOG_UNLOCK(log, s); + } + return 0; + + } while (iclog != log->l_iclog); + + LOG_UNLOCK(log, s); + return (0); +} /* xlog_state_sync */ + + +/* + * Called when we want to mark the current iclog as being ready to sync to + * disk. + */ +void +xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +{ + SPLDECL(s); + + s = LOG_LOCK(log); + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + xlog_state_switch_iclogs(log, iclog, 0); + } else { + ASSERT(iclog->ic_state & + (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); + } + + LOG_UNLOCK(log, s); +} /* xlog_state_want_sync */ + + + +/***************************************************************************** + * + * TICKET functions + * + ***************************************************************************** + */ + +/* + * Algorithm doesn't take into account page size. ;-( + */ +STATIC void +xlog_state_ticket_alloc(xlog_t *log) +{ + xlog_ticket_t *t_list; + xlog_ticket_t *next; + xfs_caddr_t buf; + uint i = (NBPP / sizeof(xlog_ticket_t)) - 2; + SPLDECL(s); + + /* + * The kmem_zalloc may sleep, so we shouldn't be holding the + * global lock. XXXmiken: may want to use zone allocator. + */ + buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP); + + s = LOG_LOCK(log); + + /* Attach 1st ticket to Q, so we can keep track of allocated memory */ + t_list = (xlog_ticket_t *)buf; + t_list->t_next = log->l_unmount_free; + log->l_unmount_free = t_list++; + log->l_ticket_cnt++; + log->l_ticket_tcnt++; + + /* Next ticket becomes first ticket attached to ticket free list */ + if (log->l_freelist != NULL) { + ASSERT(log->l_tail != NULL); + log->l_tail->t_next = t_list; + } else { + log->l_freelist = t_list; + } + log->l_ticket_cnt++; + log->l_ticket_tcnt++; + + /* Cycle through rest of alloc'ed memory, building up free Q */ + for ( ; i > 0; i--) { + next = t_list + 1; + t_list->t_next = next; + t_list = next; + log->l_ticket_cnt++; + log->l_ticket_tcnt++; + } + t_list->t_next = NULL; + log->l_tail = t_list; + LOG_UNLOCK(log, s); +} /* xlog_state_ticket_alloc */ + + +/* + * Put ticket into free list + * + * Assumption: log lock is held around this call. + */ +STATIC void +xlog_ticket_put(xlog_t *log, + xlog_ticket_t *ticket) +{ + sv_destroy(&ticket->t_sema); + + /* + * Don't think caching will make that much difference. It's + * more important to make debug easier. + */ +#if 0 + /* real code will want to use LIFO for caching */ + ticket->t_next = log->l_freelist; + log->l_freelist = ticket; + /* no need to clear fields */ +#else + /* When we debug, it is easier if tickets are cycled */ + ticket->t_next = NULL; + if (log->l_tail != 0) { + log->l_tail->t_next = ticket; + } else { + ASSERT(log->l_freelist == 0); + log->l_freelist = ticket; + } + log->l_tail = ticket; +#endif /* DEBUG */ + log->l_ticket_cnt++; +} /* xlog_ticket_put */ + + +/* + * Grab ticket off freelist or allocation some more + */ +xlog_ticket_t * +xlog_ticket_get(xlog_t *log, + int unit_bytes, + int cnt, + char client, + uint xflags) +{ + xlog_ticket_t *tic; + uint num_headers; + SPLDECL(s); + + alloc: + if (log->l_freelist == NULL) + xlog_state_ticket_alloc(log); /* potentially sleep */ + + s = LOG_LOCK(log); + if (log->l_freelist == NULL) { + LOG_UNLOCK(log, s); + goto alloc; + } + tic = log->l_freelist; + log->l_freelist = tic->t_next; + if (log->l_freelist == NULL) + log->l_tail = NULL; + log->l_ticket_cnt--; + LOG_UNLOCK(log, s); + + /* + * Permanent reservations have up to 'cnt'-1 active log operations + * in the log. A unit in this case is the amount of space for one + * of these log operations. Normal reservations have a cnt of 1 + * and their unit amount is the total amount of space required. + * + * The following lines of code account for non-transaction data + * which occupy space in the on-disk log. + */ + + /* for start-rec */ + unit_bytes += sizeof(xlog_op_header_t); + + /* for padding */ + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + /* log su roundoff */ + unit_bytes += log->l_mp->m_sb.sb_logsunit; + } else { + /* BB roundoff */ + unit_bytes += BBSIZE; + } + + /* for commit-rec */ + unit_bytes += sizeof(xlog_op_header_t); + + /* for LR headers */ + num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); + unit_bytes += log->l_iclog_hsize * num_headers; + + tic->t_unit_res = unit_bytes; + tic->t_curr_res = unit_bytes; + tic->t_cnt = cnt; + tic->t_ocnt = cnt; + tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); + tic->t_clientid = client; + tic->t_flags = XLOG_TIC_INITED; + if (xflags & XFS_LOG_PERM_RESERV) + tic->t_flags |= XLOG_TIC_PERM_RESERV; + sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); + + return tic; +} /* xlog_ticket_get */ + + +/****************************************************************************** + * + * Log debug routines + * + ****************************************************************************** + */ +#if defined(DEBUG) && !defined(XLOG_NOLOG) +/* + * Make sure that the destination ptr is within the valid data region of + * one of the iclogs. This uses backup pointers stored in a different + * part of the log in case we trash the log structure. + */ +void +xlog_verify_dest_ptr(xlog_t *log, + __psint_t ptr) +{ + int i; + int good_ptr = 0; + + for (i=0; i < log->l_iclog_bufs; i++) { + if (ptr >= (__psint_t)log->l_iclog_bak[i] && + ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) + good_ptr++; + } + if (! good_ptr) + xlog_panic("xlog_verify_dest_ptr: invalid ptr"); +} /* xlog_verify_dest_ptr */ + +STATIC void +xlog_verify_grant_head(xlog_t *log, int equals) +{ + if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { + if (equals) + ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); + else + ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); + } else { + ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); + ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); + } +} /* xlog_verify_grant_head */ + +/* check if it will fit */ +STATIC void +xlog_verify_tail_lsn(xlog_t *log, + xlog_in_core_t *iclog, + xfs_lsn_t tail_lsn) +{ + int blocks; + + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = + log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) + xlog_panic("xlog_verify_tail_lsn: ran out of log space"); + } else { + ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + xlog_panic("xlog_verify_tail_lsn: tail wrapped"); + + blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; + if (blocks < BTOBB(iclog->ic_offset) + 1) + xlog_panic("xlog_verify_tail_lsn: ran out of log space"); + } +} /* xlog_verify_tail_lsn */ + +/* + * Perform a number of checks on the iclog before writing to disk. + * + * 1. Make sure the iclogs are still circular + * 2. Make sure we have a good magic number + * 3. Make sure we don't have magic numbers in the data + * 4. Check fields of each log operation header for: + * A. Valid client identifier + * B. tid ptr value falls in valid ptr space (user space code) + * C. Length in log record header is correct according to the + * individual operation headers within record. + * 5. When a bwrite will occur within 5 blocks of the front of the physical + * log, check the preceding blocks of the physical log to make sure all + * the cycle numbers agree with the current cycle number. + */ +STATIC void +xlog_verify_iclog(xlog_t *log, + xlog_in_core_t *iclog, + int count, + boolean_t syncing) +{ + xlog_op_header_t *ophead; + xlog_in_core_t *icptr; + xlog_in_core_2_t *xhdr; + xfs_caddr_t ptr; + xfs_caddr_t base_ptr; + __psint_t field_offset; + __uint8_t clientid; + int len, i, j, k, op_len; + int idx; + SPLDECL(s); + + /* check validity of iclog pointers */ + s = LOG_LOCK(log); + icptr = log->l_iclog; + for (i=0; i < log->l_iclog_bufs; i++) { + if (icptr == 0) + xlog_panic("xlog_verify_iclog: invalid ptr"); + icptr = icptr->ic_next; + } + if (icptr != log->l_iclog) + xlog_panic("xlog_verify_iclog: corrupt iclog ring"); + LOG_UNLOCK(log, s); + + /* check log magic numbers */ + ptr = (xfs_caddr_t) &(iclog->ic_header); + if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) + xlog_panic("xlog_verify_iclog: invalid magic num"); + + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count; + ptr += BBSIZE) { + if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) + xlog_panic("xlog_verify_iclog: unexpected magic num"); + } + + /* check fields */ + len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT); + ptr = iclog->ic_datap; + base_ptr = ptr; + ophead = (xlog_op_header_t *)ptr; + xhdr = (xlog_in_core_2_t *)&iclog->ic_header; + for (i = 0; i < len; i++) { + ophead = (xlog_op_header_t *)ptr; + + /* clientid is only 1 byte */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + if (syncing == B_FALSE || (field_offset & 0x1ff)) { + clientid = ophead->oh_clientid; + } else { + idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); + } else { + clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); + } + } + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + cmn_err(CE_WARN, "xlog_verify_iclog: invalid clientid %d op 0x%p offset 0x%x", clientid, ophead, field_offset); + + /* check length */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + if (syncing == B_FALSE || (field_offset & 0x1ff)) { + op_len = INT_GET(ophead->oh_len, ARCH_CONVERT); + } else { + idx = BTOBBT((__psint_t)&ophead->oh_len - + (__psint_t)iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); + } else { + op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); + } + } + ptr += sizeof(xlog_op_header_t) + op_len; + } +} /* xlog_verify_iclog */ +#endif /* DEBUG && !XLOG_NOLOG */ + +/* + * Mark all iclogs IOERROR. LOG_LOCK is held by the caller. + */ +STATIC int +xlog_state_ioerror( + xlog_t *log) +{ + xlog_in_core_t *iclog, *ic; + + iclog = log->l_iclog; + if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Mark all the incore logs IOERROR. + * From now on, no log flushes will result. + */ + ic = iclog; + do { + ic->ic_state = XLOG_STATE_IOERROR; + ic = ic->ic_next; + } while (ic != iclog); + return (0); + } + /* + * Return non-zero, if state transition has already happened. + */ + return (1); +} + +/* + * This is called from xfs_force_shutdown, when we're forcibly + * shutting down the filesystem, typically because of an IO error. + * Our main objectives here are to make sure that: + * a. the filesystem gets marked 'SHUTDOWN' for all interested + * parties to find out, 'atomically'. + * b. those who're sleeping on log reservations, pinned objects and + * other resources get woken up, and be told the bad news. + * c. nothing new gets queued up after (a) and (b) are done. + * d. if !logerror, flush the iclogs to disk, then seal them off + * for business. + */ +int +xfs_log_force_umount( + struct xfs_mount *mp, + int logerror) +{ + xlog_ticket_t *tic; + xlog_t *log; + int retval; + SPLDECL(s); + SPLDECL(s2); + + log = mp->m_log; + + /* + * If this happens during log recovery, don't worry about + * locking; the log isn't open for business yet. + */ + if (!log || + log->l_flags & XLOG_ACTIVE_RECOVERY) { + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + XFS_BUF_DONE(mp->m_sb_bp); + return (0); + } + + /* + * Somebody could've already done the hard work for us. + * No need to get locks for this. + */ + if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + return (1); + } + retval = 0; + /* + * We must hold both the GRANT lock and the LOG lock, + * before we mark the filesystem SHUTDOWN and wake + * everybody up to tell the bad news. + */ + s = GRANT_LOCK(log); + s2 = LOG_LOCK(log); + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + XFS_BUF_DONE(mp->m_sb_bp); + /* + * This flag is sort of redundant because of the mount flag, but + * it's good to maintain the separation between the log and the rest + * of XFS. + */ + log->l_flags |= XLOG_IO_ERROR; + + /* + * If we hit a log error, we want to mark all the iclogs IOERROR + * while we're still holding the loglock. + */ + if (logerror) + retval = xlog_state_ioerror(log); + LOG_UNLOCK(log, s2); + + /* + * We don't want anybody waiting for log reservations + * after this. That means we have to wake up everybody + * queued up on reserve_headq as well as write_headq. + * In addition, we make sure in xlog_{re}grant_log_space + * that we don't enqueue anything once the SHUTDOWN flag + * is set, and this action is protected by the GRANTLOCK. + */ + if ((tic = log->l_reserve_headq)) { + do { + sv_signal(&tic->t_sema); + tic = tic->t_next; + } while (tic != log->l_reserve_headq); + } + + if ((tic = log->l_write_headq)) { + do { + sv_signal(&tic->t_sema); + tic = tic->t_next; + } while (tic != log->l_write_headq); + } + GRANT_UNLOCK(log, s); + + if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(!logerror); + /* + * Force the incore logs to disk before shutting the + * log down completely. + */ + xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC); + s2 = LOG_LOCK(log); + retval = xlog_state_ioerror(log); + LOG_UNLOCK(log, s2); + } + /* + * Wake up everybody waiting on xfs_log_force. + * Callback all log item committed functions as if the + * log writes were completed. + */ + xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + +#ifdef XFSERRORDEBUG + { + xlog_in_core_t *iclog; + + s = LOG_LOCK(log); + iclog = log->l_iclog; + do { + ASSERT(iclog->ic_callback == 0); + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + LOG_UNLOCK(log, s); + } +#endif + /* return non-zero if log IOERROR transition had already happened */ + return (retval); +} + +int +xlog_iclogs_empty(xlog_t *log) +{ + xlog_in_core_t *iclog; + + iclog = log->l_iclog; + do { + /* endianness does not matter here, zero is zero in + * any language. + */ + if (iclog->ic_header.h_num_logops) + return(0); + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + return(1); +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h new file mode 100644 index 00000000000..0db122ddda3 --- /dev/null +++ b/fs/xfs/xfs_log.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LOG_H__ +#define __XFS_LOG_H__ + +/* get lsn fields */ + +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((uint *)&(lsn))[0]) + +#ifdef __KERNEL__ +/* + * By comparing each compnent, we don't have to worry about extra + * endian issues in treating two 32 bit numbers as one 64 bit number + */ +static +#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96)) +__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */ +#else +__inline__ +#endif +xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +{ + if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) + return (CYCLE_LSN(lsn1)l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +/* + * set lsns + */ + +#define ASSIGN_ANY_LSN_HOST(lsn,cycle,block) \ + { \ + (lsn) = ((xfs_lsn_t)(cycle)<<32)|(block); \ + } +#define ASSIGN_ANY_LSN_DISK(lsn,cycle,block) \ + { \ + INT_SET(((uint *)&(lsn))[0], ARCH_CONVERT, (cycle)); \ + INT_SET(((uint *)&(lsn))[1], ARCH_CONVERT, (block)); \ + } +#define ASSIGN_LSN(lsn,log) \ + ASSIGN_ANY_LSN_DISK(lsn,(log)->l_curr_cycle,(log)->l_curr_block); + +#define XLOG_SET(f,b) (((f) & (b)) == (b)) + +#define GET_CYCLE(ptr, arch) \ + (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \ + INT_GET(*((uint *)(ptr)+1), arch) : \ + INT_GET(*(uint *)(ptr), arch) \ + ) + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + + +#ifdef __KERNEL__ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. + */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define GET_CLIENT_ID(i,arch) \ + ((i) & 0xff) +#else +#define GET_CLIENT_ID(i,arch) \ + ((i) >> 24) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_SUB_SPACE) +void xlog_grant_sub_space(struct log *log, int bytes, int type); +#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \ + xlog_grant_sub_space(log,bytes,type) +#else +#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \ + { \ + if (type == 'w') { \ + (log)->l_grant_write_bytes -= (bytes); \ + if ((log)->l_grant_write_bytes < 0) { \ + (log)->l_grant_write_bytes += (log)->l_logsize; \ + (log)->l_grant_write_cycle--; \ + } \ + } else { \ + (log)->l_grant_reserve_bytes -= (bytes); \ + if ((log)->l_grant_reserve_bytes < 0) { \ + (log)->l_grant_reserve_bytes += (log)->l_logsize;\ + (log)->l_grant_reserve_cycle--; \ + } \ + } \ + } +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_ADD_SPACE) +void xlog_grant_add_space(struct log *log, int bytes, int type); +#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \ + xlog_grant_add_space(log,bytes,type) +#else +#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \ + { \ + if (type == 'w') { \ + (log)->l_grant_write_bytes += (bytes); \ + if ((log)->l_grant_write_bytes > (log)->l_logsize) { \ + (log)->l_grant_write_bytes -= (log)->l_logsize; \ + (log)->l_grant_write_cycle++; \ + } \ + } else { \ + (log)->l_grant_reserve_bytes += (bytes); \ + if ((log)->l_grant_reserve_bytes > (log)->l_logsize) { \ + (log)->l_grant_reserve_bytes -= (log)->l_logsize;\ + (log)->l_grant_reserve_cycle++; \ + } \ + } \ + } +#endif +#define XLOG_INS_TICKETQ(q,tic) \ + { \ + if (q) { \ + (tic)->t_next = (q); \ + (tic)->t_prev = (q)->t_prev; \ + (q)->t_prev->t_next = (tic); \ + (q)->t_prev = (tic); \ + } else { \ + (tic)->t_prev = (tic)->t_next = (tic); \ + (q) = (tic); \ + } \ + (tic)->t_flags |= XLOG_TIC_IN_Q; \ + } +#define XLOG_DEL_TICKETQ(q,tic) \ + { \ + if ((tic) == (tic)->t_next) { \ + (q) = NULL; \ + } else { \ + (q) = (tic)->t_next; \ + (tic)->t_next->t_prev = (tic)->t_prev; \ + (tic)->t_prev->t_next = (tic)->t_next; \ + } \ + (tic)->t_next = (tic)->t_prev = NULL; \ + (tic)->t_flags &= ~XLOG_TIC_IN_Q; \ + } + + +#define GRANT_LOCK(log) mutex_spinlock(&(log)->l_grant_lock) +#define GRANT_UNLOCK(log, s) mutex_spinunlock(&(log)->l_grant_lock, s) +#define LOG_LOCK(log) mutex_spinlock(&(log)->l_icloglock) +#define LOG_UNLOCK(log, s) mutex_spinunlock(&(log)->l_icloglock, s) + +#define xlog_panic(args...) cmn_err(CE_PANIC, ## args) +#define xlog_exit(args...) cmn_err(CE_PANIC, ## args) +#define xlog_warn(args...) cmn_err(CE_WARN, ## args) + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ +#endif /* __KERNEL__ */ + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. + * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ +#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \ + XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \ + XLOG_UNMOUNT_TRANS) + +#ifdef __KERNEL__ +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_IN_Q 0x4 +#endif /* __KERNEL__ */ + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* + * Flags for log structure + */ +#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +typedef __uint32_t xlog_tid_t; + + +#ifdef __KERNEL__ +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). + * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +typedef struct xlog_ticket { + sv_t t_sema; /* sleep on this semaphore :20 */ + struct xlog_ticket *t_next; /* : 4 */ + struct xlog_ticket *t_prev; /* : 4 */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + __uint8_t t_ocnt; /* original count : 1 */ + __uint8_t t_cnt; /* current count : 1 */ + __uint8_t t_clientid; /* who does this belong to; : 1 */ + __uint8_t t_flags; /* properties of reservation : 1 */ +} xlog_ticket_t; +#endif + + +typedef struct xlog_op_header { + xlog_tid_t oh_tid; /* transaction id of operation : 4 b */ + int oh_len; /* bytes in data region : 4 b */ + __uint8_t oh_clientid; /* who sent me this : 1 b */ + __uint8_t oh_flags; /* : 1 b */ + ushort oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define XLOG_FMT XLOG_FMT_LINUX_LE +#else +#if __BYTE_ORDER == __BIG_ENDIAN +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#error unknown byte order +#endif +#endif + +typedef struct xlog_rec_header { + uint h_magicno; /* log record (LR) identifier : 4 */ + uint h_cycle; /* write cycle of log : 4 */ + int h_version; /* LR version : 4 */ + int h_len; /* len in bytes; should be 64-bit aligned: 4 */ + xfs_lsn_t h_lsn; /* lsn of this LR : 8 */ + xfs_lsn_t h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + uint h_chksum; /* may not be used; non-zero if used : 4 */ + int h_prev_block; /* block number to previous LR : 4 */ + int h_num_logops; /* number of log operations in this LR : 4 */ + uint h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + int h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + int h_size; /* iclog size : 4 */ +} xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + uint xh_cycle; /* write cycle of log : 4 */ + uint xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +#ifdef __KERNEL__ +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + */ +typedef struct xlog_iclog_fields { + sv_t ic_forcesema; + sv_t ic_writesema; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct log *ic_log; + xfs_log_callback_t *ic_callback; + xfs_log_callback_t **ic_callback_tail; +#ifdef XFS_LOG_TRACE + struct ktrace *ic_trace; +#endif + int ic_size; + int ic_offset; + int ic_refcnt; + int ic_bwritecnt; + ushort_t ic_state; + char *ic_datap; /* pointer to iclog data */ +} xlog_iclog_fields_t; + +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +typedef struct xlog_in_core { + xlog_iclog_fields_t hic_fields; + xlog_in_core_2_t *hic_data; +} xlog_in_core_t; + +/* + * Defines to save our code from this glop. + */ +#define ic_forcesema hic_fields.ic_forcesema +#define ic_writesema hic_fields.ic_writesema +#define ic_next hic_fields.ic_next +#define ic_prev hic_fields.ic_prev +#define ic_bp hic_fields.ic_bp +#define ic_log hic_fields.ic_log +#define ic_callback hic_fields.ic_callback +#define ic_callback_tail hic_fields.ic_callback_tail +#define ic_trace hic_fields.ic_trace +#define ic_size hic_fields.ic_size +#define ic_offset hic_fields.ic_offset +#define ic_refcnt hic_fields.ic_refcnt +#define ic_bwritecnt hic_fields.ic_bwritecnt +#define ic_state hic_fields.ic_state +#define ic_datap hic_fields.ic_datap +#define ic_header hic_data->hic_header + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. + */ +typedef struct log { + /* The following block of fields are changed while holding icloglock */ + sema_t l_flushsema; /* iclog flushing semaphore */ + int l_flushcnt; /* # of procs waiting on this + * sema */ + int l_ticket_cnt; /* free ticket count */ + int l_ticket_tcnt; /* total ticket count */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_ticket_t *l_freelist; /* free list of tickets */ + xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */ + xlog_ticket_t *l_tail; /* free list of tickets */ + xlog_in_core_t *l_iclog; /* head log queue */ + lock_t l_icloglock; /* grab to change iclog state */ + xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed + * buffers */ + xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + + /* The following field are used for debugging; need to hold icloglock */ + char *l_iclog_bak[XLOG_MAX_ICLOGS]; + + /* The following block of fields are changed while holding grant_lock */ + lock_t l_grant_lock; + xlog_ticket_t *l_reserve_headq; + xlog_ticket_t *l_write_headq; + int l_grant_reserve_cycle; + int l_grant_reserve_bytes; + int l_grant_write_cycle; + int l_grant_write_bytes; + + /* The following fields don't need locking */ +#ifdef XFS_LOG_TRACE + struct ktrace *l_trace; + struct ktrace *l_grant_trace; +#endif + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct xfs_buf_cancel **l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectbb_log; /* log2 of sector size in BBs */ + uint l_sectbb_mask; /* sector size (in BBs) + * alignment mask */ +} xlog_t; + + +/* common routines */ +extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +extern int xlog_find_head(xlog_t *log, xfs_daddr_t *head_blk); +extern int xlog_find_tail(xlog_t *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk, + int readonly); +extern int xlog_recover(xlog_t *log, int readonly); +extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); +extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); +extern void xlog_recover_process_iunlinks(xlog_t *log); + +extern struct xfs_buf *xlog_get_bp(xlog_t *, int); +extern void xlog_put_bp(struct xfs_buf *); +extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); +extern xfs_caddr_t xlog_align(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); + +/* iclog tracing */ +#define XLOG_TRACE_GRAB_FLUSH 1 +#define XLOG_TRACE_REL_FLUSH 2 +#define XLOG_TRACE_SLEEP_FLUSH 3 +#define XLOG_TRACE_WAKE_FLUSH 4 + +#endif /* __KERNEL__ */ + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c new file mode 100644 index 00000000000..9824b5bf0ec --- /dev/null +++ b/fs/xfs/xfs_log_recover.c @@ -0,0 +1,4098 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_trans.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_imap.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_ialloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_alloc_btree.h" +#include "xfs_log_recover.h" +#include "xfs_extfree_item.h" +#include "xfs_trans_priv.h" +#include "xfs_bit.h" +#include "xfs_quota.h" +#include "xfs_rw.h" + +STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); +STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, + xlog_recover_item_t *item); +#if defined(DEBUG) +STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int); +#else +#define xlog_recover_check_summary(log) +#define xlog_recover_check_ail(mp, lip, gen) +#endif + + +/* + * Sector aligned buffer routines for buffer create/read/write/access + */ + +#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ + ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ + ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) +#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) + +xfs_buf_t * +xlog_get_bp( + xlog_t *log, + int num_bblks) +{ + ASSERT(num_bblks > 0); + + if (log->l_sectbb_log) { + if (num_bblks > 1) + num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); + } + return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); +} + +void +xlog_put_bp( + xfs_buf_t *bp) +{ + xfs_buf_free(bp); +} + + +/* + * nbblks should be uint, but oh well. Just want to catch that 32-bit length. + */ +int +xlog_bread( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + int error; + + if (log->l_sectbb_log) { + blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); + nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); + } + + ASSERT(nbblks > 0); + ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(bp); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + + xfsbdstrat(log->l_mp, bp); + if ((error = xfs_iowait(bp))) + xfs_ioerror_alert("xlog_bread", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; +} + +/* + * Write out the buffer at the given block for the given number of blocks. + * The buffer is kept locked across the write and is returned locked. + * This can only be used for synchronous log writes. + */ +int +xlog_bwrite( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + int error; + + if (log->l_sectbb_log) { + blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); + nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); + } + + ASSERT(nbblks > 0); + ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_BUSY(bp); + XFS_BUF_HOLD(bp); + XFS_BUF_PSEMA(bp, PRIBIO); + XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + + if ((error = xfs_bwrite(log->l_mp, bp))) + xfs_ioerror_alert("xlog_bwrite", log->l_mp, + bp, XFS_BUF_ADDR(bp)); + return error; +} + +xfs_caddr_t +xlog_align( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + xfs_caddr_t ptr; + + if (!log->l_sectbb_log) + return XFS_BUF_PTR(bp); + + ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); + ASSERT(XFS_BUF_SIZE(bp) >= + BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); + return ptr; +} + +#ifdef DEBUG +/* + * dump debug superblock and log record information + */ +STATIC void +xlog_header_check_dump( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + int b; + + printk("%s: SB : uuid = ", __FUNCTION__); + for (b = 0; b < 16; b++) + printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]); + printk(", fmt = %d\n", XLOG_FMT); + printk(" log : uuid = "); + for (b = 0; b < 16; b++) + printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]); + printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT)); +} +#else +#define xlog_header_check_dump(mp, head) +#endif + +/* + * check log record header for recovery + */ +STATIC int +xlog_header_check_recover( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM); + + /* + * IRIX doesn't write the h_fmt field and leaves it zeroed + * (XLOG_FMT_UNKNOWN). This stops us from trying to recover + * a dirty log created in IRIX. + */ + if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) { + xlog_warn( + "XFS: dirty log written in incompatible format - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(1)", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xlog_warn( + "XFS: dirty log entry has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(2)", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * read the head block of the log and check the header + */ +STATIC int +xlog_header_check_mount( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM); + + if (uuid_is_nil(&head->h_fs_uuid)) { + /* + * IRIX doesn't write the h_fs_uuid or h_fmt fields. If + * h_fs_uuid is nil, we assume this log was last mounted + * by IRIX and continue. + */ + xlog_warn("XFS: nil uuid in log - IRIX style log"); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xlog_warn("XFS: log has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_mount", + XFS_ERRLEVEL_HIGH, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +STATIC void +xlog_recover_iodone( + struct xfs_buf *bp) +{ + xfs_mount_t *mp; + + ASSERT(XFS_BUF_FSPRIVATE(bp, void *)); + + if (XFS_BUF_GETERROR(bp)) { + /* + * We're not going to bother about retrying + * this during recovery. One strike! + */ + mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *); + xfs_ioerror_alert("xlog_recover_iodone", + mp, bp, XFS_BUF_ADDR(bp)); + xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR); + } + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_biodone(bp); +} + +/* + * This routine finds (to an approximation) the first block in the physical + * log which contains the given cycle. It uses a binary search algorithm. + * Note that the algorithm can not be perfect because the disk will not + * necessarily be perfect. + */ +int +xlog_find_cycle_start( + xlog_t *log, + xfs_buf_t *bp, + xfs_daddr_t first_blk, + xfs_daddr_t *last_blk, + uint cycle) +{ + xfs_caddr_t offset; + xfs_daddr_t mid_blk; + uint mid_cycle; + int error; + + mid_blk = BLK_AVG(first_blk, *last_blk); + while (mid_blk != first_blk && mid_blk != *last_blk) { + if ((error = xlog_bread(log, mid_blk, 1, bp))) + return error; + offset = xlog_align(log, mid_blk, 1, bp); + mid_cycle = GET_CYCLE(offset, ARCH_CONVERT); + if (mid_cycle == cycle) { + *last_blk = mid_blk; + /* last_half_cycle == mid_cycle */ + } else { + first_blk = mid_blk; + /* first_half_cycle == mid_cycle */ + } + mid_blk = BLK_AVG(first_blk, *last_blk); + } + ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || + (mid_blk == *last_blk && mid_blk-1 == first_blk)); + + return 0; +} + +/* + * Check that the range of blocks does not contain the cycle number + * given. The scan needs to occur from front to back and the ptr into the + * region must be updated since a later routine will need to perform another + * test. If the region is completely good, we end up returning the same + * last block number. + * + * Set blkno to -1 if we encounter no errors. This is an invalid block number + * since we don't ever expect logs to get this large. + */ +STATIC int +xlog_find_verify_cycle( + xlog_t *log, + xfs_daddr_t start_blk, + int nbblks, + uint stop_on_cycle_no, + xfs_daddr_t *new_blk) +{ + xfs_daddr_t i, j; + uint cycle; + xfs_buf_t *bp; + xfs_daddr_t bufblks; + xfs_caddr_t buf = NULL; + int error = 0; + + bufblks = 1 << ffs(nbblks); + + while (!(bp = xlog_get_bp(log, bufblks))) { + /* can't get enough memory to do everything in one big buffer */ + bufblks >>= 1; + if (bufblks <= log->l_sectbb_log) + return ENOMEM; + } + + for (i = start_blk; i < start_blk + nbblks; i += bufblks) { + int bcount; + + bcount = min(bufblks, (start_blk + nbblks - i)); + + if ((error = xlog_bread(log, i, bcount, bp))) + goto out; + + buf = xlog_align(log, i, bcount, bp); + for (j = 0; j < bcount; j++) { + cycle = GET_CYCLE(buf, ARCH_CONVERT); + if (cycle == stop_on_cycle_no) { + *new_blk = i+j; + goto out; + } + + buf += BBSIZE; + } + } + + *new_blk = -1; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Potentially backup over partial log record write. + * + * In the typical case, last_blk is the number of the block directly after + * a good log record. Therefore, we subtract one to get the block number + * of the last block in the given buffer. extra_bblks contains the number + * of blocks we would have read on a previous read. This happens when the + * last log record is split over the end of the physical log. + * + * extra_bblks is the number of blocks potentially verified on a previous + * call to this routine. + */ +STATIC int +xlog_find_verify_log_record( + xlog_t *log, + xfs_daddr_t start_blk, + xfs_daddr_t *last_blk, + int extra_bblks) +{ + xfs_daddr_t i; + xfs_buf_t *bp; + xfs_caddr_t offset = NULL; + xlog_rec_header_t *head = NULL; + int error = 0; + int smallmem = 0; + int num_blks = *last_blk - start_blk; + int xhdrs; + + ASSERT(start_blk != 0 || *last_blk != start_blk); + + if (!(bp = xlog_get_bp(log, num_blks))) { + if (!(bp = xlog_get_bp(log, 1))) + return ENOMEM; + smallmem = 1; + } else { + if ((error = xlog_bread(log, start_blk, num_blks, bp))) + goto out; + offset = xlog_align(log, start_blk, num_blks, bp); + offset += ((num_blks - 1) << BBSHIFT); + } + + for (i = (*last_blk) - 1; i >= 0; i--) { + if (i < start_blk) { + /* valid log record not found */ + xlog_warn( + "XFS: Log inconsistent (didn't find previous header)"); + ASSERT(0); + error = XFS_ERROR(EIO); + goto out; + } + + if (smallmem) { + if ((error = xlog_bread(log, i, 1, bp))) + goto out; + offset = xlog_align(log, i, 1, bp); + } + + head = (xlog_rec_header_t *)offset; + + if (XLOG_HEADER_MAGIC_NUM == + INT_GET(head->h_magicno, ARCH_CONVERT)) + break; + + if (!smallmem) + offset -= BBSIZE; + } + + /* + * We hit the beginning of the physical log & still no header. Return + * to caller. If caller can handle a return of -1, then this routine + * will be called again for the end of the physical log. + */ + if (i == -1) { + error = -1; + goto out; + } + + /* + * We have the final block of the good log (the first block + * of the log record _before_ the head. So we check the uuid. + */ + if ((error = xlog_header_check_mount(log->l_mp, head))) + goto out; + + /* + * We may have found a log record header before we expected one. + * last_blk will be the 1st block # with a given cycle #. We may end + * up reading an entire log record. In this case, we don't want to + * reset last_blk. Only when last_blk points in the middle of a log + * record do we update last_blk. + */ + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + uint h_size = INT_GET(head->h_size, ARCH_CONVERT); + + xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + } else { + xhdrs = 1; + } + + if (*last_blk - i + extra_bblks + != BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs) + *last_blk = i; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Head is defined to be the point of the log where the next log write + * write could go. This means that incomplete LR writes at the end are + * eliminated when calculating the head. We aren't guaranteed that previous + * LR have complete transactions. We only know that a cycle number of + * current cycle number -1 won't be present in the log if we start writing + * from our current block number. + * + * last_blk contains the block number of the first block with a given + * cycle number. + * + * Return: zero if normal, non-zero if error. + */ +int +xlog_find_head( + xlog_t *log, + xfs_daddr_t *return_head_blk) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; + int num_scan_bblks; + uint first_half_cycle, last_half_cycle; + uint stop_on_cycle; + int error, log_bbnum = log->l_logBBsize; + + /* Is the end of the log device zeroed? */ + if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { + *return_head_blk = first_blk; + + /* Is the whole lot zeroed? */ + if (!first_blk) { + /* Linux XFS shouldn't generate totally zeroed logs - + * mkfs etc write a dummy unmount record to a fresh + * log so we can store the uuid in there + */ + xlog_warn("XFS: totally zeroed log"); + } + + return 0; + } else if (error) { + xlog_warn("XFS: empty log check failed"); + return error; + } + + first_blk = 0; /* get cycle # of 1st block */ + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + if ((error = xlog_bread(log, 0, 1, bp))) + goto bp_err; + offset = xlog_align(log, 0, 1, bp); + first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT); + + last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ + if ((error = xlog_bread(log, last_blk, 1, bp))) + goto bp_err; + offset = xlog_align(log, last_blk, 1, bp); + last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT); + ASSERT(last_half_cycle != 0); + + /* + * If the 1st half cycle number is equal to the last half cycle number, + * then the entire log is stamped with the same cycle number. In this + * case, head_blk can't be set to zero (which makes sense). The below + * math doesn't work out properly with head_blk equal to zero. Instead, + * we set it to log_bbnum which is an invalid block number, but this + * value makes the math correct. If head_blk doesn't changed through + * all the tests below, *head_blk is set to zero at the very end rather + * than log_bbnum. In a sense, log_bbnum and zero are the same block + * in a circular file. + */ + if (first_half_cycle == last_half_cycle) { + /* + * In this case we believe that the entire log should have + * cycle number last_half_cycle. We need to scan backwards + * from the end verifying that there are no holes still + * containing last_half_cycle - 1. If we find such a hole, + * then the start of that hole will be the new head. The + * simple case looks like + * x | x ... | x - 1 | x + * Another case that fits this picture would be + * x | x + 1 | x ... | x + * In this case the head really is somwhere at the end of the + * log, as one of the latest writes at the beginning was + * incomplete. + * One more case is + * x | x + 1 | x ... | x - 1 | x + * This is really the combination of the above two cases, and + * the head has to end up at the start of the x-1 hole at the + * end of the log. + * + * In the 256k log case, we will read from the beginning to the + * end of the log and search for cycle numbers equal to x-1. + * We don't worry about the x+1 blocks that we encounter, + * because we know that they cannot be the head since the log + * started with x. + */ + head_blk = log_bbnum; + stop_on_cycle = last_half_cycle - 1; + } else { + /* + * In this case we want to find the first block with cycle + * number matching last_half_cycle. We expect the log to be + * some variation on + * x + 1 ... | x ... + * The first block with cycle number x (last_half_cycle) will + * be where the new head belongs. First we do a binary search + * for the first occurrence of last_half_cycle. The binary + * search may not be totally accurate, so then we scan back + * from there looking for occurrences of last_half_cycle before + * us. If that backwards scan wraps around the beginning of + * the log, then we look for occurrences of last_half_cycle - 1 + * at the end of the log. The cases we're looking for look + * like + * x + 1 ... | x | x + 1 | x ... + * ^ binary search stopped here + * or + * x + 1 ... | x ... | x - 1 | x + * <---------> less than scan distance + */ + stop_on_cycle = last_half_cycle; + if ((error = xlog_find_cycle_start(log, bp, first_blk, + &head_blk, last_half_cycle))) + goto bp_err; + } + + /* + * Now validate the answer. Scan back some number of maximum possible + * blocks and make sure each one has the expected cycle number. The + * maximum is determined by the total possible amount of buffering + * in the in-core log. The following number can be made tighter if + * we actually look at the block size of the filesystem. + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + /* + * We are guaranteed that the entire check can be performed + * in one buffer. + */ + start_blk = head_blk - num_scan_bblks; + if ((error = xlog_find_verify_cycle(log, + start_blk, num_scan_bblks, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } else { /* need to read 2 parts of log */ + /* + * We are going to scan backwards in the log in two parts. + * First we scan the physical end of the log. In this part + * of the log, we are looking for blocks with cycle number + * last_half_cycle - 1. + * If we find one, then we know that the log starts there, as + * we've found a hole that didn't get written in going around + * the end of the physical log. The simple case for this is + * x + 1 ... | x ... | x - 1 | x + * <---------> less than scan distance + * If all of the blocks at the end of the log have cycle number + * last_half_cycle, then we check the blocks at the start of + * the log looking for occurrences of last_half_cycle. If we + * find one, then our current estimate for the location of the + * first occurrence of last_half_cycle is wrong and we move + * back to the hole we've found. This case looks like + * x + 1 ... | x | x + 1 | x ... + * ^ binary search stopped here + * Another case we need to handle that only occurs in 256k + * logs is + * x + 1 ... | x ... | x+1 | x ... + * ^ binary search stops here + * In a 256k log, the scan at the end of the log will see the + * x + 1 blocks. We need to skip past those since that is + * certainly not the head of the log. By searching for + * last_half_cycle-1 we accomplish that. + */ + start_blk = log_bbnum - num_scan_bblks + head_blk; + ASSERT(head_blk <= INT_MAX && + (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + if ((error = xlog_find_verify_cycle(log, start_blk, + num_scan_bblks - (int)head_blk, + (stop_on_cycle - 1), &new_blk))) + goto bp_err; + if (new_blk != -1) { + head_blk = new_blk; + goto bad_blk; + } + + /* + * Scan beginning of log now. The last part of the physical + * log is good. This scan needs to verify that it doesn't find + * the last_half_cycle. + */ + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_cycle(log, + start_blk, (int)head_blk, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } + + bad_blk: + /* + * Now we need to make sure head_blk is not pointing to a block in + * the middle of a log record. + */ + num_scan_bblks = XLOG_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ + + /* start ptr at last block ptr before head_blk */ + if ((error = xlog_find_verify_log_record(log, start_blk, + &head_blk, 0)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + } else { + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_log_record(log, start_blk, + &head_blk, 0)) == -1) { + /* We hit the beginning of the log during our search */ + start_blk = log_bbnum - num_scan_bblks + head_blk; + new_blk = log_bbnum; + ASSERT(start_blk <= INT_MAX && + (xfs_daddr_t) log_bbnum-start_blk >= 0); + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_log_record(log, + start_blk, &new_blk, + (int)head_blk)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + if (new_blk != log_bbnum) + head_blk = new_blk; + } else if (error) + goto bp_err; + } + + xlog_put_bp(bp); + if (head_blk == log_bbnum) + *return_head_blk = 0; + else + *return_head_blk = head_blk; + /* + * When returning here, we have a good block number. Bad block + * means that during a previous crash, we didn't have a clean break + * from cycle number N to cycle number N-1. In this case, we need + * to find the first block with cycle number N-1. + */ + return 0; + + bp_err: + xlog_put_bp(bp); + + if (error) + xlog_warn("XFS: failed to find log head"); + return error; +} + +/* + * Find the sync block number or the tail of the log. + * + * This will be the block number of the last record to have its + * associated buffers synced to disk. Every log record header has + * a sync lsn embedded in it. LSNs hold block numbers, so it is easy + * to get a sync block number. The only concern is to figure out which + * log record header to believe. + * + * The following algorithm uses the log record header with the largest + * lsn. The entire log record does not need to be valid. We only care + * that the header is valid. + * + * We could speed up search by using current head_blk buffer, but it is not + * available. + */ +int +xlog_find_tail( + xlog_t *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk, + int readonly) +{ + xlog_rec_header_t *rhead; + xlog_op_header_t *op_head; + xfs_caddr_t offset = NULL; + xfs_buf_t *bp; + int error, i, found; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + xfs_lsn_t tail_lsn; + int hblks; + + found = 0; + + /* + * Find previous log record + */ + if ((error = xlog_find_head(log, head_blk))) + return error; + + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + if (*head_blk == 0) { /* special case */ + if ((error = xlog_bread(log, 0, 1, bp))) + goto bread_err; + offset = xlog_align(log, 0, 1, bp); + if (GET_CYCLE(offset, ARCH_CONVERT) == 0) { + *tail_blk = 0; + /* leave all other log inited values alone */ + goto exit; + } + } + + /* + * Search backwards looking for log record header block + */ + ASSERT(*head_blk < INT_MAX); + for (i = (int)(*head_blk) - 1; i >= 0; i--) { + if ((error = xlog_bread(log, i, 1, bp))) + goto bread_err; + offset = xlog_align(log, i, 1, bp); + if (XLOG_HEADER_MAGIC_NUM == + INT_GET(*(uint *)offset, ARCH_CONVERT)) { + found = 1; + break; + } + } + /* + * If we haven't found the log record header block, start looking + * again from the end of the physical log. XXXmiken: There should be + * a check here to make sure we didn't search more than N blocks in + * the previous code. + */ + if (!found) { + for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { + if ((error = xlog_bread(log, i, 1, bp))) + goto bread_err; + offset = xlog_align(log, i, 1, bp); + if (XLOG_HEADER_MAGIC_NUM == + INT_GET(*(uint*)offset, ARCH_CONVERT)) { + found = 2; + break; + } + } + } + if (!found) { + xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); + ASSERT(0); + return XFS_ERROR(EIO); + } + + /* find blk_no of tail of log */ + rhead = (xlog_rec_header_t *)offset; + *tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT)); + + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = i; + log->l_curr_block = (int)*head_blk; + log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT); + if (found == 2) + log->l_curr_cycle++; + log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT); + log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT); + log->l_grant_reserve_cycle = log->l_curr_cycle; + log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); + log->l_grant_write_cycle = log->l_curr_cycle; + log->l_grant_write_bytes = BBTOB(log->l_curr_block); + + /* + * Look for unmount record. If we find it, then we know there + * was a clean unmount. Since 'i' could be the last block in + * the physical log, we convert to a log block before comparing + * to the head_blk. + * + * Save the current tail lsn to use to pass to + * xlog_clear_stale_blocks() below. We won't want to clear the + * unmount record if there is one, so we pass the lsn of the + * unmount record rather than the block after it. + */ + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + int h_size = INT_GET(rhead->h_size, ARCH_CONVERT); + int h_version = INT_GET(rhead->h_version, ARCH_CONVERT); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = (i + hblks + (int) + BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize; + tail_lsn = log->l_tail_lsn; + if (*head_blk == after_umount_blk && + INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) { + umount_data_blk = (i + hblks) % log->l_logBBsize; + if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { + goto bread_err; + } + offset = xlog_align(log, umount_data_blk, 1, bp); + op_head = (xlog_op_header_t *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written + * log records will point recovery to after the + * current unmount record. + */ + ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle, + after_umount_blk); + ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle, + after_umount_blk); + *tail_blk = after_umount_blk; + } + } + + /* + * Make sure that there are no blocks in front of the head + * with the same cycle number as the head. This can happen + * because we allow multiple outstanding log writes concurrently, + * and the later writes might make it out before earlier ones. + * + * We use the lsn from before modifying it so that we'll never + * overwrite the unmount record after a clean unmount. + * + * Do this only if we are going to recover the filesystem + * + * NOTE: This used to say "if (!readonly)" + * However on Linux, we can & do recover a read-only filesystem. + * We only skip recovery if NORECOVERY is specified on mount, + * in which case we would not be here. + * + * But... if the -device- itself is readonly, just skip this. + * We can't recover this device anyway, so it won't matter. + */ + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + error = xlog_clear_stale_blocks(log, tail_lsn); + } + +bread_err: +exit: + xlog_put_bp(bp); + + if (error) + xlog_warn("XFS: failed to locate log tail"); + return error; +} + +/* + * Is the log zeroed at all? + * + * The last binary search should be changed to perform an X block read + * once X becomes small enough. You can then search linearly through + * the X blocks. This will cut down on the number of reads we need to do. + * + * If the log is partially zeroed, this routine will pass back the blkno + * of the first block with cycle number 0. It won't have a complete LR + * preceding it. + * + * Return: + * 0 => the log is completely written to + * -1 => use *blk_no as the first block of the log + * >0 => error has occurred + */ +int +xlog_find_zeroed( + xlog_t *log, + xfs_daddr_t *blk_no) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + uint first_cycle, last_cycle; + xfs_daddr_t new_blk, last_blk, start_blk; + xfs_daddr_t num_scan_bblks; + int error, log_bbnum = log->l_logBBsize; + + /* check totally zeroed log */ + bp = xlog_get_bp(log, 1); + if (!bp) + return ENOMEM; + if ((error = xlog_bread(log, 0, 1, bp))) + goto bp_err; + offset = xlog_align(log, 0, 1, bp); + first_cycle = GET_CYCLE(offset, ARCH_CONVERT); + if (first_cycle == 0) { /* completely zeroed log */ + *blk_no = 0; + xlog_put_bp(bp); + return -1; + } + + /* check partially zeroed log */ + if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) + goto bp_err; + offset = xlog_align(log, log_bbnum-1, 1, bp); + last_cycle = GET_CYCLE(offset, ARCH_CONVERT); + if (last_cycle != 0) { /* log completely written to */ + xlog_put_bp(bp); + return 0; + } else if (first_cycle != 1) { + /* + * If the cycle of the last block is zero, the cycle of + * the first block must be 1. If it's not, maybe we're + * not looking at a log... Bail out. + */ + xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); + return XFS_ERROR(EINVAL); + } + + /* we have a partially zeroed log */ + last_blk = log_bbnum-1; + if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) + goto bp_err; + + /* + * Validate the answer. Because there is no way to guarantee that + * the entire log is made up of log records which are the same size, + * we scan over the defined maximum blocks. At this point, the maximum + * is not chosen to mean anything special. XXXmiken + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + ASSERT(num_scan_bblks <= INT_MAX); + + if (last_blk < num_scan_bblks) + num_scan_bblks = last_blk; + start_blk = last_blk - num_scan_bblks; + + /* + * We search for any instances of cycle number 0 that occur before + * our current estimate of the head. What we're trying to detect is + * 1 ... | 0 | 1 | 0... + * ^ binary search ends here + */ + if ((error = xlog_find_verify_cycle(log, start_blk, + (int)num_scan_bblks, 0, &new_blk))) + goto bp_err; + if (new_blk != -1) + last_blk = new_blk; + + /* + * Potentially backup over partial log record write. We don't need + * to search the end of the log because we know it is zero. + */ + if ((error = xlog_find_verify_log_record(log, start_blk, + &last_blk, 0)) == -1) { + error = XFS_ERROR(EIO); + goto bp_err; + } else if (error) + goto bp_err; + + *blk_no = last_blk; +bp_err: + xlog_put_bp(bp); + if (error) + return error; + return -1; +} + +/* + * These are simple subroutines used by xlog_clear_stale_blocks() below + * to initialize a buffer full of empty log record headers and write + * them into the log. + */ +STATIC void +xlog_add_record( + xlog_t *log, + xfs_caddr_t buf, + int cycle, + int block, + int tail_cycle, + int tail_block) +{ + xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + + memset(buf, 0, BBSIZE); + INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM); + INT_SET(recp->h_cycle, ARCH_CONVERT, cycle); + INT_SET(recp->h_version, ARCH_CONVERT, + XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1); + ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block); + ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block); + INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT); + memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); +} + +STATIC int +xlog_write_log_records( + xlog_t *log, + int cycle, + int start_block, + int blocks, + int tail_cycle, + int tail_block) +{ + xfs_caddr_t offset; + xfs_buf_t *bp; + int balign, ealign; + int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int end_block = start_block + blocks; + int bufblks; + int error = 0; + int i, j = 0; + + bufblks = 1 << ffs(blocks); + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks <= log->l_sectbb_log) + return ENOMEM; + } + + /* We may need to do a read at the start to fill in part of + * the buffer in the starting sector not covered by the first + * write below. + */ + balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + if (balign != start_block) { + if ((error = xlog_bread(log, start_block, 1, bp))) { + xlog_put_bp(bp); + return error; + } + j = start_block - balign; + } + + for (i = start_block; i < end_block; i += bufblks) { + int bcount, endcount; + + bcount = min(bufblks, end_block - start_block); + endcount = bcount - j; + + /* We may need to do a read at the end to fill in part of + * the buffer in the final sector not covered by the write. + * If this is the same sector as the above read, skip it. + */ + ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + if (j == 0 && (start_block + endcount > ealign)) { + offset = XFS_BUF_PTR(bp); + balign = BBTOB(ealign - start_block); + XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); + if ((error = xlog_bread(log, ealign, sectbb, bp))) + break; + XFS_BUF_SET_PTR(bp, offset, bufblks); + } + + offset = xlog_align(log, start_block, endcount, bp); + for (; j < endcount; j++) { + xlog_add_record(log, offset, cycle, i+j, + tail_cycle, tail_block); + offset += BBSIZE; + } + error = xlog_bwrite(log, start_block, endcount, bp); + if (error) + break; + start_block += endcount; + j = 0; + } + xlog_put_bp(bp); + return error; +} + +/* + * This routine is called to blow away any incomplete log writes out + * in front of the log head. We do this so that we won't become confused + * if we come up, write only a little bit more, and then crash again. + * If we leave the partial log records out there, this situation could + * cause us to think those partial writes are valid blocks since they + * have the current cycle number. We get rid of them by overwriting them + * with empty log records with the old cycle number rather than the + * current one. + * + * The tail lsn is passed in rather than taken from + * the log so that we will not write over the unmount record after a + * clean unmount in a 512 block log. Doing so would leave the log without + * any valid log records in it until a new one was written. If we crashed + * during that time we would not be able to recover. + */ +STATIC int +xlog_clear_stale_blocks( + xlog_t *log, + xfs_lsn_t tail_lsn) +{ + int tail_cycle, head_cycle; + int tail_block, head_block; + int tail_distance, max_distance; + int distance; + int error; + + tail_cycle = CYCLE_LSN(tail_lsn); + tail_block = BLOCK_LSN(tail_lsn); + head_cycle = log->l_curr_cycle; + head_block = log->l_curr_block; + + /* + * Figure out the distance between the new head of the log + * and the tail. We want to write over any blocks beyond the + * head that we may have written just before the crash, but + * we don't want to overwrite the tail of the log. + */ + if (head_cycle == tail_cycle) { + /* + * The tail is behind the head in the physical log, + * so the distance from the head to the tail is the + * distance from the head to the end of the log plus + * the distance from the beginning of the log to the + * tail. + */ + if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { + XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + tail_distance = tail_block + (log->l_logBBsize - head_block); + } else { + /* + * The head is behind the tail in the physical log, + * so the distance from the head to the tail is just + * the tail block minus the head block. + */ + if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ + XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + tail_distance = tail_block - head_block; + } + + /* + * If the head is right up against the tail, we can't clear + * anything. + */ + if (tail_distance <= 0) { + ASSERT(tail_distance == 0); + return 0; + } + + max_distance = XLOG_TOTAL_REC_SHIFT(log); + /* + * Take the smaller of the maximum amount of outstanding I/O + * we could have and the distance to the tail to clear out. + * We take the smaller so that we don't overwrite the tail and + * we don't waste all day writing from the head to the tail + * for no reason. + */ + max_distance = MIN(max_distance, tail_distance); + + if ((head_block + max_distance) <= log->l_logBBsize) { + /* + * We can stomp all the blocks we need to without + * wrapping around the end of the log. Just do it + * in a single write. Use the cycle number of the + * current cycle minus one so that the log will look like: + * n ... | n - 1 ... + */ + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, max_distance, tail_cycle, + tail_block); + if (error) + return error; + } else { + /* + * We need to wrap around the end of the physical log in + * order to clear all the blocks. Do it in two separate + * I/Os. The first write should be from the head to the + * end of the physical log, and it should use the current + * cycle number minus one just like above. + */ + distance = log->l_logBBsize - head_block; + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, distance, tail_cycle, + tail_block); + + if (error) + return error; + + /* + * Now write the blocks at the start of the physical log. + * This writes the remainder of the blocks we want to clear. + * It uses the current cycle number since we're now on the + * same cycle as the head so that we get: + * n ... n ... | n - 1 ... + * ^^^^^ blocks we're writing + */ + distance = max_distance - (log->l_logBBsize - head_block); + error = xlog_write_log_records(log, head_cycle, 0, distance, + tail_cycle, tail_block); + if (error) + return error; + } + + return 0; +} + +/****************************************************************************** + * + * Log recover routines + * + ****************************************************************************** + */ + +STATIC xlog_recover_t * +xlog_recover_find_tid( + xlog_recover_t *q, + xlog_tid_t tid) +{ + xlog_recover_t *p = q; + + while (p != NULL) { + if (p->r_log_tid == tid) + break; + p = p->r_next; + } + return p; +} + +STATIC void +xlog_recover_put_hashq( + xlog_recover_t **q, + xlog_recover_t *trans) +{ + trans->r_next = *q; + *q = trans; +} + +STATIC void +xlog_recover_add_item( + xlog_recover_item_t **itemq) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + xlog_recover_insert_item_backq(itemq, item); +} + +STATIC int +xlog_recover_add_to_cont_trans( + xlog_recover_t *trans, + xfs_caddr_t dp, + int len) +{ + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + item = trans->r_itemq; + if (item == 0) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); /* d, s, l */ + return 0; + } + item = item->ri_prev; + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0); + memcpy(&ptr[old_len], dp, len); /* d, s, l */ + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. + */ +STATIC int +xlog_recover_add_to_trans( + xlog_recover_t *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + item = trans->r_itemq; + if (item == 0) { + ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC); + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); /* d, s, l */ + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + if (item->ri_prev->ri_total != 0 && + item->ri_prev->ri_total == item->ri_prev->ri_cnt) { + xlog_recover_add_item(&trans->r_itemq); + } + item = trans->r_itemq; + item = item->ri_prev; + + if (item->ri_total == 0) { /* first region to be added */ + item->ri_total = in_f->ilf_size; + ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); + item->ri_buf = kmem_zalloc((item->ri_total * + sizeof(xfs_log_iovec_t)), KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + return 0; +} + +STATIC void +xlog_recover_new_tid( + xlog_recover_t **q, + xlog_tid_t tid, + xfs_lsn_t lsn) +{ + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + xlog_recover_put_hashq(q, trans); +} + +STATIC int +xlog_recover_unlink_tid( + xlog_recover_t **q, + xlog_recover_t *trans) +{ + xlog_recover_t *tp; + int found = 0; + + ASSERT(trans != 0); + if (trans == *q) { + *q = (*q)->r_next; + } else { + tp = *q; + while (tp != 0) { + if (tp->r_next == trans) { + found = 1; + break; + } + tp = tp->r_next; + } + if (!found) { + xlog_warn( + "XFS: xlog_recover_unlink_tid: trans not found"); + ASSERT(0); + return XFS_ERROR(EIO); + } + tp->r_next = tp->r_next->r_next; + } + return 0; +} + +STATIC void +xlog_recover_insert_item_backq( + xlog_recover_item_t **q, + xlog_recover_item_t *item) +{ + if (*q == 0) { + item->ri_prev = item->ri_next = item; + *q = item; + } else { + item->ri_next = *q; + item->ri_prev = (*q)->ri_prev; + (*q)->ri_prev = item; + item->ri_prev->ri_next = item; + } +} + +STATIC void +xlog_recover_insert_item_frontq( + xlog_recover_item_t **q, + xlog_recover_item_t *item) +{ + xlog_recover_insert_item_backq(q, item); + *q = item; +} + +STATIC int +xlog_recover_reorder_trans( + xlog_t *log, + xlog_recover_t *trans) +{ + xlog_recover_item_t *first_item, *itemq, *itemq_next; + xfs_buf_log_format_t *buf_f; + xfs_buf_log_format_v1_t *obuf_f; + ushort flags = 0; + + first_item = itemq = trans->r_itemq; + trans->r_itemq = NULL; + do { + itemq_next = itemq->ri_next; + buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; + switch (ITEM_TYPE(itemq)) { + case XFS_LI_BUF: + flags = buf_f->blf_flags; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + flags = obuf_f->blf_flags; + break; + } + + switch (ITEM_TYPE(itemq)) { + case XFS_LI_BUF: + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + if (!(flags & XFS_BLI_CANCEL)) { + xlog_recover_insert_item_frontq(&trans->r_itemq, + itemq); + break; + } + case XFS_LI_INODE: + case XFS_LI_6_1_INODE: + case XFS_LI_5_3_INODE: + case XFS_LI_DQUOT: + case XFS_LI_QUOTAOFF: + case XFS_LI_EFD: + case XFS_LI_EFI: + xlog_recover_insert_item_backq(&trans->r_itemq, itemq); + break; + default: + xlog_warn( + "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); + ASSERT(0); + return XFS_ERROR(EIO); + } + itemq = itemq_next; + } while (first_item != itemq); + return 0; +} + +/* + * Build up the table of buf cancel records so that we don't replay + * cancelled data in the second pass. For buffer records that are + * not cancel records, there is nothing to do here so we just return. + * + * If we get a cancel record which is already in the table, this indicates + * that the buffer was cancelled multiple times. In order to ensure + * that during pass 2 we keep the record in the table until we reach its + * last occurrence in the log, we keep a reference count in the cancel + * record in the table to tell us how many times we expect to see this + * record during the second pass. + */ +STATIC void +xlog_recover_do_buffer_pass1( + xlog_t *log, + xfs_buf_log_format_t *buf_f) +{ + xfs_buf_cancel_t *bcp; + xfs_buf_cancel_t *nextp; + xfs_buf_cancel_t *prevp; + xfs_buf_cancel_t **bucket; + xfs_buf_log_format_v1_t *obuf_f; + xfs_daddr_t blkno = 0; + uint len = 0; + ushort flags = 0; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + len = buf_f->blf_len; + flags = buf_f->blf_flags; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + blkno = (xfs_daddr_t) obuf_f->blf_blkno; + len = obuf_f->blf_len; + flags = obuf_f->blf_flags; + break; + } + + /* + * If this isn't a cancel buffer item, then just return. + */ + if (!(flags & XFS_BLI_CANCEL)) + return; + + /* + * Insert an xfs_buf_cancel record into the hash table of + * them. If there is already an identical record, bump + * its reference count. + */ + bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % + XLOG_BC_TABLE_SIZE]; + /* + * If the hash bucket is empty then just insert a new record into + * the bucket. + */ + if (*bucket == NULL) { + bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), + KM_SLEEP); + bcp->bc_blkno = blkno; + bcp->bc_len = len; + bcp->bc_refcount = 1; + bcp->bc_next = NULL; + *bucket = bcp; + return; + } + + /* + * The hash bucket is not empty, so search for duplicates of our + * record. If we find one them just bump its refcount. If not + * then add us at the end of the list. + */ + prevp = NULL; + nextp = *bucket; + while (nextp != NULL) { + if (nextp->bc_blkno == blkno && nextp->bc_len == len) { + nextp->bc_refcount++; + return; + } + prevp = nextp; + nextp = nextp->bc_next; + } + ASSERT(prevp != NULL); + bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), + KM_SLEEP); + bcp->bc_blkno = blkno; + bcp->bc_len = len; + bcp->bc_refcount = 1; + bcp->bc_next = NULL; + prevp->bc_next = bcp; +} + +/* + * Check to see whether the buffer being recovered has a corresponding + * entry in the buffer cancel record table. If it does then return 1 + * so that it will be cancelled, otherwise return 0. If the buffer is + * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement + * the refcount on the entry in the table and remove it from the table + * if this is the last reference. + * + * We remove the cancel record from the table when we encounter its + * last occurrence in the log so that if the same buffer is re-used + * again after its last cancellation we actually replay the changes + * made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + xlog_t *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + xfs_buf_cancel_t *bcp; + xfs_buf_cancel_t *prevp; + xfs_buf_cancel_t **bucket; + + if (log->l_buf_cancel_table == NULL) { + /* + * There is nothing in the table built in pass one, + * so this buffer must not be cancelled. + */ + ASSERT(!(flags & XFS_BLI_CANCEL)); + return 0; + } + + bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % + XLOG_BC_TABLE_SIZE]; + bcp = *bucket; + if (bcp == NULL) { + /* + * There is no corresponding entry in the table built + * in pass one, so this buffer has not been cancelled. + */ + ASSERT(!(flags & XFS_BLI_CANCEL)); + return 0; + } + + /* + * Search for an entry in the buffer cancel table that + * matches our buffer. + */ + prevp = NULL; + while (bcp != NULL) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) { + /* + * We've go a match, so return 1 so that the + * recovery of this buffer is cancelled. + * If this buffer is actually a buffer cancel + * log item, then decrement the refcount on the + * one in the table and remove it if this is the + * last reference. + */ + if (flags & XFS_BLI_CANCEL) { + bcp->bc_refcount--; + if (bcp->bc_refcount == 0) { + if (prevp == NULL) { + *bucket = bcp->bc_next; + } else { + prevp->bc_next = bcp->bc_next; + } + kmem_free(bcp, + sizeof(xfs_buf_cancel_t)); + } + } + return 1; + } + prevp = bcp; + bcp = bcp->bc_next; + } + /* + * We didn't find a corresponding entry in the table, so + * return 0 so that the buffer is NOT cancelled. + */ + ASSERT(!(flags & XFS_BLI_CANCEL)); + return 0; +} + +STATIC int +xlog_recover_do_buffer_pass2( + xlog_t *log, + xfs_buf_log_format_t *buf_f) +{ + xfs_buf_log_format_v1_t *obuf_f; + xfs_daddr_t blkno = 0; + ushort flags = 0; + uint len = 0; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + flags = buf_f->blf_flags; + len = buf_f->blf_len; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + blkno = (xfs_daddr_t) obuf_f->blf_blkno; + flags = obuf_f->blf_flags; + len = (xfs_daddr_t) obuf_f->blf_len; + break; + } + + return xlog_check_buffer_cancelled(log, blkno, len, flags); +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, + * the only data which should be recovered is that which corresponds + * to the di_next_unlinked pointers in the on disk inode structures. + * The rest of the data for the inodes is always logged through the + * inodes themselves rather than the inode buffer and is recovered + * in xlog_recover_do_inode_trans(). + * + * The only time when buffers full of inodes are fully recovered is + * when the buffer is full of newly allocated inodes. In this case + * the buffer will not be marked as an inode buffer and so will be + * sent to xlog_recover_do_reg_buffer() below during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + xfs_mount_t *mp, + xlog_recover_item_t *item, + xfs_buf_t *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int item_index; + int bit; + int nbits; + int reg_buf_offset; + int reg_buf_bytes; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + xfs_buf_log_format_v1_t *obuf_f; + unsigned int *data_map = NULL; + unsigned int map_size = 0; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + data_map = buf_f->blf_data_map; + map_size = buf_f->blf_map_size; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + data_map = obuf_f->blf_data_map; + map_size = obuf_f->blf_map_size; + break; + } + /* + * Set the variables corresponding to the current region to + * 0 so that we'll initialize them on the first pass through + * the loop. + */ + reg_buf_offset = 0; + reg_buf_bytes = 0; + bit = 0; + nbits = 0; + item_index = 0; + inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(data_map, map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) { + return 0; + } + + nbits = xfs_contig_bits(data_map, map_size, + bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLI_SHIFT; + reg_buf_bytes = nbits << XFS_BLI_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) { + continue; + } + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = (xfs_agino_t *) + ((char *)(item->ri_buf[item_index].i_addr) + + (next_unlinked_offset - reg_buf_offset)); + if (unlikely(*logged_nextp == 0)) { + xfs_fs_cmn_err(CE_ALERT, mp, + "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", + item, bp); + XFS_ERROR_REPORT("xlog_recover_do_inode_buf", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, + next_unlinked_offset); + INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp); + } + + return 0; +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. + */ +/*ARGSUSED*/ +STATIC void +xlog_recover_do_reg_buffer( + xfs_mount_t *mp, + xlog_recover_item_t *item, + xfs_buf_t *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int bit; + int nbits; + xfs_buf_log_format_v1_t *obuf_f; + unsigned int *data_map = NULL; + unsigned int map_size = 0; + int error; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + data_map = buf_f->blf_data_map; + map_size = buf_f->blf_map_size; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + data_map = obuf_f->blf_data_map; + map_size = obuf_f->blf_map_size; + break; + } + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(data_map, map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(data_map, map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != 0); + ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); + ASSERT(XFS_BUF_COUNT(bp) >= + ((uint)bit << XFS_BLI_SHIFT)+(nbits<blf_flags & (XFS_BLI_UDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + error = xfs_qm_dqcheck((xfs_disk_dquot_t *) + item->ri_buf[i].i_addr, + -1, 0, XFS_QMOPT_DOWARN, + "dquot_buf_recover"); + } + if (!error) + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLI_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<ri_total); +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_qm_dqcheck( + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks. + */ + if (INT_GET(ddq->d_magic, ARCH_CONVERT) != XFS_DQUOT_MAGIC) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, + INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_MAGIC); + errs++; + } + if (INT_GET(ddq->d_version, ARCH_CONVERT) != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, + INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_VERSION); + errs++; + } + + if (INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_USER && + INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, INT_GET(ddq->d_flags, ARCH_CONVERT)); + errs++; + } + + if (id != -1 && id != INT_GET(ddq->d_id, ARCH_CONVERT)) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, INT_GET(ddq->d_id, ARCH_CONVERT)); + errs++; + } + + if (!errs && ddq->d_id) { + if (INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT) && + INT_GET(ddq->d_bcount, ARCH_CONVERT) >= + INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : Dquot ID 0x%x (0x%p) " + "BLK TIMER NOT STARTED", + str, (int) + INT_GET(ddq->d_id, ARCH_CONVERT), ddq); + errs++; + } + } + if (INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT) && + INT_GET(ddq->d_icount, ARCH_CONVERT) >= + INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : Dquot ID 0x%x (0x%p) " + "INODE TIMER NOT STARTED", + str, (int) + INT_GET(ddq->d_id, ARCH_CONVERT), ddq); + errs++; + } + } + if (INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT) && + INT_GET(ddq->d_rtbcount, ARCH_CONVERT) >= + INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_ALERT, + "%s : Dquot ID 0x%x (0x%p) " + "RTBLK TIMER NOT STARTED", + str, (int) + INT_GET(ddq->d_id, ARCH_CONVERT), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC); + INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION); + INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id); + INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type); + + return errs; +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + */ +STATIC void +xlog_recover_do_dquot_buffer( + xfs_mount_t *mp, + xlog_t *log, + xlog_recover_item_t *item, + xfs_buf_t *bp, + xfs_buf_log_format_t *buf_f) +{ + uint type; + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) { + return; + } + + type = 0; + if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_do_buffer_trans( + xlog_t *log, + xlog_recover_item_t *item, + int pass) +{ + xfs_buf_log_format_t *buf_f; + xfs_buf_log_format_v1_t *obuf_f; + xfs_mount_t *mp; + xfs_buf_t *bp; + int error; + int cancel; + xfs_daddr_t blkno; + int len; + ushort flags; + + buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; + + if (pass == XLOG_RECOVER_PASS1) { + /* + * In this pass we're only looking for buf items + * with the XFS_BLI_CANCEL bit set. + */ + xlog_recover_do_buffer_pass1(log, buf_f); + return 0; + } else { + /* + * In this pass we want to recover all the buffers + * which have not been cancelled and are not + * cancellation buffers themselves. The routine + * we call here will tell us whether or not to + * continue with the replay of this buffer. + */ + cancel = xlog_recover_do_buffer_pass2(log, buf_f); + if (cancel) { + return 0; + } + } + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + len = buf_f->blf_len; + flags = buf_f->blf_flags; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + blkno = obuf_f->blf_blkno; + len = obuf_f->blf_len; + flags = obuf_f->blf_flags; + break; + default: + xfs_fs_cmn_err(CE_ALERT, log->l_mp, + "xfs_log_recover: unknown buffer type 0x%x, dev %s", + buf_f->blf_type, XFS_BUFTARG_NAME(log->l_targ)); + XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + + mp = log->l_mp; + if (flags & XFS_BLI_INODE_BUF) { + bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, + XFS_BUF_LOCK); + } else { + bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); + } + if (XFS_BUF_ISERROR(bp)) { + xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, + bp, blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } + + error = 0; + if (flags & XFS_BLI_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + } else if (flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) { + xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + } + if (error) + return XFS_ERROR(error); + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) + * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. + */ + if (XFS_DINODE_MAGIC == + INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) && + (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + XFS_BUF_STALE(bp); + error = xfs_bwrite(mp, bp); + } else { + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || + XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); + XFS_BUF_SET_FSPRIVATE(bp, mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + } + + return (error); +} + +STATIC int +xlog_recover_do_inode_trans( + xlog_t *log, + xlog_recover_item_t *item, + int pass) +{ + xfs_inode_log_format_t *in_f; + xfs_mount_t *mp; + xfs_buf_t *bp; + xfs_imap_t imap; + xfs_dinode_t *dip; + xfs_ino_t ino; + int len; + xfs_caddr_t src; + xfs_caddr_t dest; + int error; + int attr_index; + uint fields; + xfs_dinode_core_t *dicp; + + if (pass == XLOG_RECOVER_PASS1) { + return 0; + } + + in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; + ino = in_f->ilf_ino; + mp = log->l_mp; + if (ITEM_TYPE(item) == XFS_LI_INODE) { + imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno; + imap.im_len = in_f->ilf_len; + imap.im_boffset = in_f->ilf_boffset; + } else { + /* + * It's an old inode format record. We don't know where + * its cluster is located on disk, and we can't allow + * xfs_imap() to figure it out because the inode btrees + * are not ready to be used. Therefore do not pass the + * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give + * us only the single block in which the inode lives + * rather than its cluster, so we must make sure to + * invalidate the buffer when we write it out below. + */ + imap.im_blkno = 0; + xfs_imap(log->l_mp, NULL, ino, &imap, 0); + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. + */ + if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) + return 0; + + bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, + XFS_BUF_LOCK); + if (XFS_BUF_ISERROR(bp)) { + xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, + bp, imap.im_blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } + error = 0; + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! + */ + if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) { + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", + dip, bp, ino); + XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr); + if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", + item, ino); + XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* Skip replay when the on disk inode is newer than the log one */ + if (dicp->di_flushiter < + INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT) + == DI_MAX_FLUSH) && + (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) { + /* do nothing */ + } else { + xfs_buf_relse(bp); + return 0; + } + } + /* Take the opportunity to reset the flush iteration count */ + dicp->di_flushiter = 0; + + if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + item, dip, bp, ino); + return XFS_ERROR(EFSCORRUPTED); + } + } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE) && + (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + item, dip, bp, ino); + return XFS_ERROR(EFSCORRUPTED); + } + } + if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + item, dip, bp, ino, + dicp->di_nextents + dicp->di_anextents, + dicp->di_nblocks); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", + item, dip, bp, ino, dicp->di_forkoff); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { + XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_buf_relse(bp); + xfs_fs_cmn_err(CE_ALERT, mp, + "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", + item->ri_buf[1].i_len, item); + return XFS_ERROR(EFSCORRUPTED); + } + + /* The core is in in-core format */ + xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, + (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { + memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), + item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), + item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); + } + + fields = in_f->ilf_fields; + switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { + case XFS_ILOG_DEV: + INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev); + + break; + case XFS_ILOG_UUID: + dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; + break; + } + + if (in_f->ilf_size == 2) + goto write_inode_buffer; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(&dip->di_u, src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, + &(dip->di_u.di_bmbt), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, + (xfs_bmdr_block_t*)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + ASSERT(0); + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + } + +write_inode_buffer: + if (ITEM_TYPE(item) == XFS_LI_INODE) { + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || + XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); + XFS_BUF_SET_FSPRIVATE(bp, mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + } else { + XFS_BUF_STALE(bp); + error = xfs_bwrite(mp, bp); + } + + return (error); +} + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. + */ +STATIC int +xlog_recover_do_quotaoff_trans( + xlog_t *log, + xlog_recover_item_t *item, + int pass) +{ + xfs_qoff_logformat_t *qoff_f; + + if (pass == XLOG_RECOVER_PASS2) { + return (0); + } + + qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return (0); +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_do_dquot_trans( + xlog_t *log, + xlog_recover_item_t *item, + int pass) +{ + xfs_mount_t *mp; + xfs_buf_t *bp; + struct xfs_disk_dquot *ddq, *recddq; + int error; + xfs_dq_logformat_t *dq_f; + uint type; + + if (pass == XLOG_RECOVER_PASS1) { + return 0; + } + mp = log->l_mp; + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) + return (0); + + recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; + ASSERT(recddq); + /* + * This type of quotas was turned off, so ignore this record. + */ + type = INT_GET(recddq->d_flags, ARCH_CONVERT) & + (XFS_DQ_USER | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return (0); + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; + ASSERT(dq_f); + if ((error = xfs_qm_dqcheck(recddq, + dq_f->qlf_id, + 0, XFS_QMOPT_DOWARN, + "xlog_recover_do_dquot_trans (log copy)"))) { + return XFS_ERROR(EIO); + } + ASSERT(dq_f->qlf_len == 1); + + error = xfs_read_buf(mp, mp->m_ddev_targp, + dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), + 0, &bp); + if (error) { + xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, + bp, dq_f->qlf_blkno); + return error; + } + ASSERT(bp); + ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * At least the magic num portion should be on disk because this + * was among a chunk of dquots created earlier, and we did some + * minimal initialization then. + */ + if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_do_dquot_trans")) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + + ASSERT(dq_f->qlf_size == 2); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || + XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); + XFS_BUF_SET_FSPRIVATE(bp, mp); + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); + + return (0); +} + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. + */ +STATIC void +xlog_recover_do_efi_trans( + xlog_t *log, + xlog_recover_item_t *item, + xfs_lsn_t lsn, + int pass) +{ + xfs_mount_t *mp; + xfs_efi_log_item_t *efip; + xfs_efi_log_format_t *efi_formatp; + SPLDECL(s); + + if (pass == XLOG_RECOVER_PASS1) { + return; + } + + efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == + (sizeof(xfs_efi_log_format_t) + + ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)))); + + mp = log->l_mp; + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + memcpy((char *)&(efip->efi_format), (char *)efi_formatp, + sizeof(xfs_efi_log_format_t) + + ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))); + efip->efi_next_extent = efi_formatp->efi_nextents; + efip->efi_flags |= XFS_EFI_COMMITTED; + + AIL_LOCK(mp,s); + /* + * xfs_trans_update_ail() drops the AIL lock. + */ + xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s); +} + + +/* + * This routine is called when an efd format structure is found in + * a committed transaction in the log. It's purpose is to cancel + * the corresponding efi if it was still in the log. To do this + * it searches the AIL for the efi with an id equal to that in the + * efd format structure. If we find it, we remove the efi from the + * AIL and free it. + */ +STATIC void +xlog_recover_do_efd_trans( + xlog_t *log, + xlog_recover_item_t *item, + int pass) +{ + xfs_mount_t *mp; + xfs_efd_log_format_t *efd_formatp; + xfs_efi_log_item_t *efip = NULL; + xfs_log_item_t *lip; + int gen; + int nexts; + __uint64_t efi_id; + SPLDECL(s); + + if (pass == XLOG_RECOVER_PASS1) { + return; + } + + efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == + (sizeof(xfs_efd_log_format_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t)))); + efi_id = efd_formatp->efd_efi_id; + + /* + * Search for the efi with the id in the efd format structure + * in the AIL. + */ + mp = log->l_mp; + AIL_LOCK(mp,s); + lip = xfs_trans_first_ail(mp, &gen); + while (lip != NULL) { + if (lip->li_type == XFS_LI_EFI) { + efip = (xfs_efi_log_item_t *)lip; + if (efip->efi_format.efi_id == efi_id) { + /* + * xfs_trans_delete_ail() drops the + * AIL lock. + */ + xfs_trans_delete_ail(mp, lip, s); + break; + } + } + lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + } + if (lip == NULL) { + AIL_UNLOCK(mp, s); + } + + /* + * If we found it, then free it up. If it wasn't there, it + * must have been overwritten in the log. Oh well. + */ + if (lip != NULL) { + nexts = efip->efi_format.efi_nextents; + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + kmem_free(lip, sizeof(xfs_efi_log_item_t) + + ((nexts - 1) * sizeof(xfs_extent_t))); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } + } +} + +/* + * Perform the transaction + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ +STATIC int +xlog_recover_do_trans( + xlog_t *log, + xlog_recover_t *trans, + int pass) +{ + int error = 0; + xlog_recover_item_t *item, *first_item; + + if ((error = xlog_recover_reorder_trans(log, trans))) + return error; + first_item = item = trans->r_itemq; + do { + /* + * we don't need to worry about the block number being + * truncated in > 1 TB buffers because in user-land, + * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so + * the blkno's will get through the user-mode buffer + * cache properly. The only bad case is o32 kernels + * where xfs_daddr_t is 32-bits but mount will warn us + * off a > 1 TB filesystem before we get here. + */ + if ((ITEM_TYPE(item) == XFS_LI_BUF) || + (ITEM_TYPE(item) == XFS_LI_6_1_BUF) || + (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) { + if ((error = xlog_recover_do_buffer_trans(log, item, + pass))) + break; + } else if ((ITEM_TYPE(item) == XFS_LI_INODE) || + (ITEM_TYPE(item) == XFS_LI_6_1_INODE) || + (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) { + if ((error = xlog_recover_do_inode_trans(log, item, + pass))) + break; + } else if (ITEM_TYPE(item) == XFS_LI_EFI) { + xlog_recover_do_efi_trans(log, item, trans->r_lsn, + pass); + } else if (ITEM_TYPE(item) == XFS_LI_EFD) { + xlog_recover_do_efd_trans(log, item, pass); + } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { + if ((error = xlog_recover_do_dquot_trans(log, item, + pass))) + break; + } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { + if ((error = xlog_recover_do_quotaoff_trans(log, item, + pass))) + break; + } else { + xlog_warn("XFS: xlog_recover_do_trans"); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + } + item = item->ri_next; + } while (first_item != item); + + return error; +} + +/* + * Free up any resources allocated by the transaction + * + * Remember that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + xlog_recover_t *trans) +{ + xlog_recover_item_t *first_item, *item, *free_item; + int i; + + item = first_item = trans->r_itemq; + do { + free_item = item; + item = item->ri_next; + /* Free the regions in the item. */ + for (i = 0; i < free_item->ri_cnt; i++) { + kmem_free(free_item->ri_buf[i].i_addr, + free_item->ri_buf[i].i_len); + } + /* Free the item itself */ + kmem_free(free_item->ri_buf, + (free_item->ri_total * sizeof(xfs_log_iovec_t))); + kmem_free(free_item, sizeof(xlog_recover_item_t)); + } while (first_item != item); + /* Free the transaction recover structure */ + kmem_free(trans, sizeof(xlog_recover_t)); +} + +STATIC int +xlog_recover_commit_trans( + xlog_t *log, + xlog_recover_t **q, + xlog_recover_t *trans, + int pass) +{ + int error; + + if ((error = xlog_recover_unlink_tid(q, trans))) + return error; + if ((error = xlog_recover_do_trans(log, trans, pass))) + return error; + xlog_recover_free_trans(trans); /* no error */ + return 0; +} + +STATIC int +xlog_recover_unmount_trans( + xlog_recover_t *trans) +{ + /* Do nothing now */ + xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); + return 0; +} + +/* + * There are two valid states of the r_state field. 0 indicates that the + * transaction structure is in a normal state. We have either seen the + * start of the transaction or the last operation we added was not a partial + * operation. If the last operation we added to the transaction was a + * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. + * + * NOTE: skip LRs with 0 data length. + */ +STATIC int +xlog_recover_process_data( + xlog_t *log, + xlog_recover_t *rhash[], + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + int pass) +{ + xfs_caddr_t lp; + int num_logops; + xlog_op_header_t *ohead; + xlog_recover_t *trans; + xlog_tid_t tid; + int error; + unsigned long hash; + uint flags; + + lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT); + num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT); + + /* check the log format matches our own - else we can't recover */ + if (xlog_header_check_recover(log->l_mp, rhead)) + return (XFS_ERROR(EIO)); + + while ((dp < lp) && num_logops) { + ASSERT(dp + sizeof(xlog_op_header_t) <= lp); + ohead = (xlog_op_header_t *)dp; + dp += sizeof(xlog_op_header_t); + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xlog_warn( + "XFS: xlog_recover_process_data: bad clientid"); + ASSERT(0); + return (XFS_ERROR(EIO)); + } + tid = INT_GET(ohead->oh_tid, ARCH_CONVERT); + hash = XLOG_RHASH(tid); + trans = xlog_recover_find_tid(rhash[hash], tid); + if (trans == NULL) { /* not found; add new tid */ + if (ohead->oh_flags & XLOG_START_TRANS) + xlog_recover_new_tid(&rhash[hash], tid, + INT_GET(rhead->h_lsn, ARCH_CONVERT)); + } else { + ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp); + flags = ohead->oh_flags & ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + switch (flags) { + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, + &rhash[hash], trans, pass); + break; + case XLOG_UNMOUNT_TRANS: + error = xlog_recover_unmount_trans(trans); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(trans, + dp, INT_GET(ohead->oh_len, + ARCH_CONVERT)); + break; + case XLOG_START_TRANS: + xlog_warn( + "XFS: xlog_recover_process_data: bad transaction"); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(trans, + dp, INT_GET(ohead->oh_len, + ARCH_CONVERT)); + break; + default: + xlog_warn( + "XFS: xlog_recover_process_data: bad flag"); + ASSERT(0); + error = XFS_ERROR(EIO); + break; + } + if (error) + return error; + } + dp += INT_GET(ohead->oh_len, ARCH_CONVERT); + num_logops--; + } + return 0; +} + +/* + * Process an extent free intent item that was recovered from + * the log. We need to free the extents that it describes. + */ +STATIC void +xlog_recover_process_efi( + xfs_mount_t *mp, + xfs_efi_log_item_t *efip) +{ + xfs_efd_log_item_t *efdp; + xfs_trans_t *tp; + int i; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + + ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + + /* + * First check the validity of the extents described by the + * EFI. If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if ((startblock_fsb == 0) || + (extp->ext_len == 0) || + (startblock_fsb >= mp->m_sb.sb_dblocks) || + (extp->ext_len >= mp->m_sb.sb_agblocks)) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + xfs_efi_release(efip, efip->efi_format.efi_nextents); + return; + } + } + + tp = xfs_trans_alloc(mp, 0); + xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + xfs_free_extent(tp, extp->ext_start, extp->ext_len); + xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, + extp->ext_len); + } + + efip->efi_flags |= XFS_EFI_RECOVERED; + xfs_trans_commit(tp, 0, NULL); +} + +/* + * Verify that once we've encountered something other than an EFI + * in the AIL that there are no more EFIs in the AIL. + */ +#if defined(DEBUG) +STATIC void +xlog_recover_check_ail( + xfs_mount_t *mp, + xfs_log_item_t *lip, + int gen) +{ + int orig_gen = gen; + + do { + ASSERT(lip->li_type != XFS_LI_EFI); + lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + /* + * The check will be bogus if we restart from the + * beginning of the AIL, so ASSERT that we don't. + * We never should since we're holding the AIL lock + * the entire time. + */ + ASSERT(gen == orig_gen); + } while (lip != NULL); +} +#endif /* DEBUG */ + +/* + * When this is called, all of the EFIs which did not have + * corresponding EFDs should be in the AIL. What we do now + * is free the extents associated with each one. + * + * Since we process the EFIs in normal transactions, they + * will be removed at some point after the commit. This prevents + * us from just walking down the list processing each one. + * We'll use a flag in the EFI to skip those that we've already + * processed and use the AIL iteration mechanism's generation + * count to try to speed this up at least a bit. + * + * When we start, we know that the EFIs are the only things in + * the AIL. As we process them, however, other items are added + * to the AIL. Since everything added to the AIL must come after + * everything already in the AIL, we stop processing as soon as + * we see something other than an EFI in the AIL. + */ +STATIC void +xlog_recover_process_efis( + xlog_t *log) +{ + xfs_log_item_t *lip; + xfs_efi_log_item_t *efip; + int gen; + xfs_mount_t *mp; + SPLDECL(s); + + mp = log->l_mp; + AIL_LOCK(mp,s); + + lip = xfs_trans_first_ail(mp, &gen); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + */ + if (lip->li_type != XFS_LI_EFI) { + xlog_recover_check_ail(mp, lip, gen); + break; + } + + /* + * Skip EFIs that we've already processed. + */ + efip = (xfs_efi_log_item_t *)lip; + if (efip->efi_flags & XFS_EFI_RECOVERED) { + lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + continue; + } + + AIL_UNLOCK(mp, s); + xlog_recover_process_efi(mp, efip); + AIL_LOCK(mp,s); + lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + } + AIL_UNLOCK(mp, s); +} + +/* + * This routine performs a transaction to null out a bad inode pointer + * in an agi unlinked inode hash bucket. + */ +STATIC void +xlog_recover_clear_agi_bucket( + xfs_mount_t *mp, + xfs_agnumber_t agno, + int bucket) +{ + xfs_trans_t *tp; + xfs_agi_t *agi; + xfs_buf_t *agibp; + int offset; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); + xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &agibp); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return; + } + + agi = XFS_BUF_TO_AGI(agibp); + if (INT_GET(agi->agi_magicnum, ARCH_CONVERT) != XFS_AGI_MAGIC) { + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return; + } + ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC); + + INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + + (void) xfs_trans_commit(tp, 0, NULL); +} + +/* + * xlog_iunlink_recover + * + * This is called during recovery to process any inodes which + * we unlinked but not freed when the system crashed. These + * inodes will be on the lists in the AGI blocks. What we do + * here is scan all the AGIs and fully truncate and free any + * inodes found on the lists. Each inode is removed from the + * lists when it has been fully truncated and is freed. The + * freeing of the inode and its removal from the list must be + * atomic. + */ +void +xlog_recover_process_iunlinks( + xlog_t *log) +{ + xfs_mount_t *mp; + xfs_agnumber_t agno; + xfs_agi_t *agi; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_dinode_t *dip; + xfs_inode_t *ip; + xfs_agino_t agino; + xfs_ino_t ino; + int bucket; + int error; + uint mp_dmevmask; + + mp = log->l_mp; + + /* + * Prevent any DMAPI event from being sent while in this function. + */ + mp_dmevmask = mp->m_dmevmask; + mp->m_dmevmask = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + /* + * Find the agi for this ag. + */ + agibp = xfs_buf_read(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (XFS_BUF_ISERROR(agibp)) { + xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", + log->l_mp, agibp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); + } + agi = XFS_BUF_TO_AGI(agibp); + ASSERT(XFS_AGI_MAGIC == + INT_GET(agi->agi_magicnum, ARCH_CONVERT)); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + + agino = INT_GET(agi->agi_unlinked[bucket], ARCH_CONVERT); + while (agino != NULLAGINO) { + + /* + * Release the agi buffer so that it can + * be acquired in the normal course of the + * transaction to truncate and free the inode. + */ + xfs_buf_relse(agibp); + + ino = XFS_AGINO_TO_INO(mp, agno, agino); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); + ASSERT(error || (ip != NULL)); + + if (!error) { + /* + * Get the on disk inode to find the + * next inode in the bucket. + */ + error = xfs_itobp(mp, NULL, ip, &dip, + &ibp, 0); + ASSERT(error || (dip != NULL)); + } + + if (!error) { + ASSERT(ip->i_d.di_nlink == 0); + + /* setup for the next pass */ + agino = INT_GET(dip->di_next_unlinked, + ARCH_CONVERT); + xfs_buf_relse(ibp); + /* + * Prevent any DMAPI event from + * being sent when the + * reference on the inode is + * dropped. + */ + ip->i_d.di_dmevmask = 0; + + /* + * If this is a new inode, handle + * it specially. Otherwise, + * just drop our reference to the + * inode. If there are no + * other references, this will + * send the inode to + * xfs_inactive() which will + * truncate the file and free + * the inode. + */ + if (ip->i_d.di_mode == 0) + xfs_iput_new(ip, 0); + else + VN_RELE(XFS_ITOV(ip)); + } else { + /* + * We can't read in the inode + * this bucket points to, or + * this inode is messed up. Just + * ditch this bucket of inodes. We + * will lose some inodes and space, + * but at least we won't hang. Call + * xlog_recover_clear_agi_bucket() + * to perform a transaction to clear + * the inode pointer in the bucket. + */ + xlog_recover_clear_agi_bucket(mp, agno, + bucket); + + agino = NULLAGINO; + } + + /* + * Reacquire the agibuffer and continue around + * the loop. + */ + agibp = xfs_buf_read(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, + XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (XFS_BUF_ISERROR(agibp)) { + xfs_ioerror_alert( + "xlog_recover_process_iunlinks(#2)", + log->l_mp, agibp, + XFS_AG_DADDR(mp, agno, + XFS_AGI_DADDR(mp))); + } + agi = XFS_BUF_TO_AGI(agibp); + ASSERT(XFS_AGI_MAGIC == INT_GET( + agi->agi_magicnum, ARCH_CONVERT)); + } + } + + /* + * Release the buffer for the current agi so we can + * go on to the next one. + */ + xfs_buf_relse(agibp); + } + + mp->m_dmevmask = mp_dmevmask; +} + + +#ifdef DEBUG +STATIC void +xlog_pack_data_checksum( + xlog_t *log, + xlog_in_core_t *iclog, + int size) +{ + int i; + uint *up; + uint chksum = 0; + + up = (uint *)iclog->ic_datap; + /* divide length by 4 to get # words */ + for (i = 0; i < (size >> 2); i++) { + chksum ^= INT_GET(*up, ARCH_CONVERT); + up++; + } + INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum); +} +#else +#define xlog_pack_data_checksum(log, iclog, size) +#endif + +/* + * Stamp cycle number in every block + */ +void +xlog_pack_data( + xlog_t *log, + xlog_in_core_t *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + uint cycle_lsn; + xfs_caddr_t dp; + xlog_in_core_2_t *xhdr; + + xlog_pack_data_checksum(log, iclog, size); + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + iclog->ic_header.h_cycle_data[i] = *(uint *)dp; + *(uint *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + xhdr = (xlog_in_core_2_t *)&iclog->ic_header; + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp; + *(uint *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) { + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } + } +} + +#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) +STATIC void +xlog_unpack_data_checksum( + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + xlog_t *log) +{ + uint *up = (uint *)dp; + uint chksum = 0; + int i; + + /* divide length by 4 to get # words */ + for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) { + chksum ^= INT_GET(*up, ARCH_CONVERT); + up++; + } + if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) { + if (rhead->h_chksum || + ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { + cmn_err(CE_DEBUG, + "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)", + INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum); + cmn_err(CE_DEBUG, +"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + cmn_err(CE_DEBUG, + "XFS: LogR this is a LogV2 filesystem"); + } + log->l_flags |= XLOG_CHKSUM_MISMATCH; + } + } +} +#else +#define xlog_unpack_data_checksum(rhead, dp, log) +#endif + +STATIC void +xlog_unpack_data( + xlog_rec_header_t *rhead, + xfs_caddr_t dp, + xlog_t *log) +{ + int i, j, k; + xlog_in_core_2_t *xhdr; + + for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(uint *)dp = *(uint *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } + + xlog_unpack_data_checksum(rhead, dp, log); +} + +STATIC int +xlog_valid_rec_header( + xlog_t *log, + xlog_rec_header_t *rhead, + xfs_daddr_t blkno) +{ + int hlen; + + if (unlikely( + (INT_GET(rhead->h_magicno, ARCH_CONVERT) != + XLOG_HEADER_MAGIC_NUM))) { + XFS_ERROR_REPORT("xlog_valid_rec_header(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely( + (!rhead->h_version || + (INT_GET(rhead->h_version, ARCH_CONVERT) & + (~XLOG_VERSION_OKBITS)) != 0))) { + xlog_warn("XFS: %s: unrecognised log version (%d).", + __FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT)); + return XFS_ERROR(EIO); + } + + /* LR body must have data or it wouldn't have been written */ + hlen = INT_GET(rhead->h_len, ARCH_CONVERT); + if (unlikely( hlen <= 0 || hlen > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { + XFS_ERROR_REPORT("xlog_valid_rec_header(3)", + XFS_ERRLEVEL_LOW, log->l_mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +/* + * Read the log from tail to head and process the log records found. + * Handle the two cases where the tail and head are in the same cycle + * and where the active portion of the log wraps around the end of + * the physical log separately. The pass parameter is passed through + * to the routines called to process the data and is not looked at + * here. + */ +STATIC int +xlog_do_recovery_pass( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int pass) +{ + xlog_rec_header_t *rhead; + xfs_daddr_t blk_no; + xfs_caddr_t bufaddr, offset; + xfs_buf_t *hbp, *dbp; + int error = 0, h_size; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; + xlog_recover_t *rhash[XLOG_RHASH_SIZE]; + + ASSERT(head_blk != tail_blk); + + /* + * Read the header of the tail block and get the iclog buffer size from + * h_size. Use this to tell how many sectors make up the log header. + */ + if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + /* + * When using variable length iclogs, read first sector of + * iclog header and extract the header size from it. Get a + * new hbp that is the correct size. + */ + hbp = xlog_get_bp(log, 1); + if (!hbp) + return ENOMEM; + if ((error = xlog_bread(log, tail_blk, 1, hbp))) + goto bread_err1; + offset = xlog_align(log, tail_blk, 1, hbp); + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, tail_blk); + if (error) + goto bread_err1; + h_size = INT_GET(rhead->h_size, ARCH_CONVERT); + if ((INT_GET(rhead->h_version, ARCH_CONVERT) + & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + xlog_put_bp(hbp); + hbp = xlog_get_bp(log, hblks); + } else { + hblks = 1; + } + } else { + ASSERT(log->l_sectbb_log == 0); + hblks = 1; + hbp = xlog_get_bp(log, 1); + h_size = XLOG_BIG_RECORD_BSIZE; + } + + if (!hbp) + return ENOMEM; + dbp = xlog_get_bp(log, BTOBB(h_size)); + if (!dbp) { + xlog_put_bp(hbp); + return ENOMEM; + } + + memset(rhash, 0, sizeof(rhash)); + if (tail_blk <= head_blk) { + for (blk_no = tail_blk; blk_no < head_blk; ) { + if ((error = xlog_bread(log, blk_no, hblks, hbp))) + goto bread_err2; + offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + + /* blocks in data section */ + bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + error = xlog_bread(log, blk_no + hblks, bblks, dbp); + if (error) + goto bread_err2; + offset = xlog_align(log, blk_no + hblks, bblks, dbp); + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, + rhash, rhead, offset, pass))) + goto bread_err2; + blk_no += bblks + hblks; + } + } else { + /* + * Perform recovery around the end of the physical log. + * When the head is not on the same cycle number as the tail, + * we can't do a sequential recovery as above. + */ + blk_no = tail_blk; + while (blk_no < log->l_logBBsize) { + /* + * Check for header wrapping around physical end-of-log + */ + offset = NULL; + split_hblks = 0; + wrapped_hblks = 0; + if (blk_no + hblks <= log->l_logBBsize) { + /* Read header in one read */ + error = xlog_bread(log, blk_no, hblks, hbp); + if (error) + goto bread_err2; + offset = xlog_align(log, blk_no, hblks, hbp); + } else { + /* This LR is split across physical log end */ + if (blk_no != log->l_logBBsize) { + /* some data before physical log end */ + ASSERT(blk_no <= INT_MAX); + split_hblks = log->l_logBBsize - (int)blk_no; + ASSERT(split_hblks > 0); + if ((error = xlog_bread(log, blk_no, + split_hblks, hbp))) + goto bread_err2; + offset = xlog_align(log, blk_no, + split_hblks, hbp); + } + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + bufaddr = XFS_BUF_PTR(hbp); + XFS_BUF_SET_PTR(hbp, + bufaddr + BBTOB(split_hblks), + BBTOB(hblks - split_hblks)); + wrapped_hblks = hblks - split_hblks; + error = xlog_bread(log, 0, wrapped_hblks, hbp); + if (error) + goto bread_err2; + XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); + if (!offset) + offset = xlog_align(log, 0, + wrapped_hblks, hbp); + } + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, + split_hblks ? blk_no : 0); + if (error) + goto bread_err2; + + bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + blk_no += hblks; + + /* Read in data for log record */ + if (blk_no + bblks <= log->l_logBBsize) { + error = xlog_bread(log, blk_no, bblks, dbp); + if (error) + goto bread_err2; + offset = xlog_align(log, blk_no, bblks, dbp); + } else { + /* This log record is split across the + * physical end of log */ + offset = NULL; + split_bblks = 0; + if (blk_no != log->l_logBBsize) { + /* some data is before the physical + * end of log */ + ASSERT(!wrapped_hblks); + ASSERT(blk_no <= INT_MAX); + split_bblks = + log->l_logBBsize - (int)blk_no; + ASSERT(split_bblks > 0); + if ((error = xlog_bread(log, blk_no, + split_bblks, dbp))) + goto bread_err2; + offset = xlog_align(log, blk_no, + split_bblks, dbp); + } + /* + * Note: this black magic still works with + * large sector sizes (non-512) only because: + * - we increased the buffer size originally + * by 1 sector giving us enough extra space + * for the second read; + * - the log start is guaranteed to be sector + * aligned; + * - we read the log end (LR header start) + * _first_, then the log start (LR header end) + * - order is important. + */ + bufaddr = XFS_BUF_PTR(dbp); + XFS_BUF_SET_PTR(dbp, + bufaddr + BBTOB(split_bblks), + BBTOB(bblks - split_bblks)); + if ((error = xlog_bread(log, wrapped_hblks, + bblks - split_bblks, dbp))) + goto bread_err2; + XFS_BUF_SET_PTR(dbp, bufaddr, h_size); + if (!offset) + offset = xlog_align(log, wrapped_hblks, + bblks - split_bblks, dbp); + } + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, + rhead, offset, pass))) + goto bread_err2; + blk_no += bblks; + } + + ASSERT(blk_no >= log->l_logBBsize); + blk_no -= log->l_logBBsize; + + /* read first part of physical log */ + while (blk_no < head_blk) { + if ((error = xlog_bread(log, blk_no, hblks, hbp))) + goto bread_err2; + offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; + bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) + goto bread_err2; + offset = xlog_align(log, blk_no+hblks, bblks, dbp); + xlog_unpack_data(rhead, offset, log); + if ((error = xlog_recover_process_data(log, rhash, + rhead, offset, pass))) + goto bread_err2; + blk_no += bblks + hblks; + } + } + + bread_err2: + xlog_put_bp(dbp); + bread_err1: + xlog_put_bp(hbp); + return error; +} + +/* + * Do the recovery of the log. We actually do this in two phases. + * The two passes are necessary in order to implement the function + * of cancelling a record written into the log. The first pass + * determines those things which have been cancelled, and the + * second pass replays log items normally except for those which + * have been cancelled. The handling of the replay and cancellations + * takes place in the log item type specific routines. + * + * The table of items which have cancel records in the log is allocated + * and freed at this level, since only here do we know when all of + * the log recovery has been completed. + */ +STATIC int +xlog_do_log_recovery( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error; + + ASSERT(head_blk != tail_blk); + + /* + * First do a pass to find all of the cancelled buf log items. + * Store them in the buf_cancel_table for use in the second pass. + */ + log->l_buf_cancel_table = + (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(xfs_buf_cancel_t*), + KM_SLEEP); + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS1); + if (error != 0) { + kmem_free(log->l_buf_cancel_table, + XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + log->l_buf_cancel_table = NULL; + return error; + } + /* + * Then do a second pass to actually recover the items in the log. + * When it is complete free the table of buf cancel items. + */ + error = xlog_do_recovery_pass(log, head_blk, tail_blk, + XLOG_RECOVER_PASS2); +#ifdef DEBUG + { + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(log->l_buf_cancel_table[i] == NULL); + } +#endif /* DEBUG */ + + kmem_free(log->l_buf_cancel_table, + XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + log->l_buf_cancel_table = NULL; + + return error; +} + +/* + * Do the actual recovery + */ +STATIC int +xlog_do_recover( + xlog_t *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + int error; + xfs_buf_t *bp; + xfs_sb_t *sbp; + + /* + * First replay the images in the log. + */ + error = xlog_do_log_recovery(log, head_blk, tail_blk); + if (error) { + return error; + } + + XFS_bflush(log->l_mp->m_ddev_targp); + + /* + * If IO errors happened during recovery, bail out. + */ + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + return (EIO); + } + + /* + * We now update the tail_lsn since much of the recovery has completed + * and there may be space available to use. If there were no extent + * or iunlinks, we can free up the entire log and set the tail_lsn to + * be the last_sync_lsn. This was set in xlog_find_tail to be the + * lsn of the last known good LR on disk. If there are extent frees + * or iunlinks they will have some entries in the AIL; so we look at + * the AIL to determine how to set the tail_lsn. + */ + xlog_assign_tail_lsn(log->l_mp); + + /* + * Now that we've finished replaying all buffer and inode + * updates, re-read in the superblock. + */ + bp = xfs_getsb(log->l_mp, 0); + XFS_BUF_UNDONE(bp); + XFS_BUF_READ(bp); + xfsbdstrat(log->l_mp, bp); + if ((error = xfs_iowait(bp))) { + xfs_ioerror_alert("xlog_do_recover", + log->l_mp, bp, XFS_BUF_ADDR(bp)); + ASSERT(0); + xfs_buf_relse(bp); + return error; + } + + /* Convert superblock from on-disk format */ + sbp = &log->l_mp->m_sb; + xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS); + ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); + ASSERT(XFS_SB_GOOD_VERSION(sbp)); + xfs_buf_relse(bp); + + xlog_recover_check_summary(log); + + /* Normal transactions can now occur */ + log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + return 0; +} + +/* + * Perform recovery and re-initialize some log variables in xlog_find_tail. + * + * Return error or zero. + */ +int +xlog_recover( + xlog_t *log, + int readonly) +{ + xfs_daddr_t head_blk, tail_blk; + int error; + + /* find the tail of the log */ + if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly))) + return error; + + if (tail_blk != head_blk) { + /* There used to be a comment here: + * + * disallow recovery on read-only mounts. note -- mount + * checks for ENOSPC and turns it into an intelligent + * error message. + * ...but this is no longer true. Now, unless you specify + * NORECOVERY (in which case this function would never be + * called), we just go ahead and recover. We do this all + * under the vfs layer, so we can get away with it unless + * the device itself is read-only, in which case we fail. + */ + if ((error = xfs_dev_is_read_only(log->l_mp, + "recovery required"))) { + return error; + } + + cmn_err(CE_NOTE, + "Starting XFS recovery on filesystem: %s (dev: %s)", + log->l_mp->m_fsname, XFS_BUFTARG_NAME(log->l_targ)); + + error = xlog_do_recover(log, head_blk, tail_blk); + log->l_flags |= XLOG_RECOVERY_NEEDED; + } + return error; +} + +/* + * In the first part of recovery we replay inodes and buffers and build + * up the list of extent free items which need to be processed. Here + * we process the extent free items and clean up the on disk unlinked + * inode lists. This is separated from the first part of recovery so + * that the root and real-time bitmap inodes can be read in from disk in + * between the two stages. This is necessary so that we can free space + * in the real-time portion of the file system. + */ +int +xlog_recover_finish( + xlog_t *log, + int mfsi_flags) +{ + /* + * Now we're ready to do the transactions needed for the + * rest of recovery. Start with completing all the extent + * free intent records and then process the unlinked inode + * lists. At this point, we essentially run in normal mode + * except that we're still performing recovery actions + * rather than accepting new requests. + */ + if (log->l_flags & XLOG_RECOVERY_NEEDED) { + xlog_recover_process_efis(log); + /* + * Sync the log to get all the EFIs out of the AIL. + * This isn't absolutely necessary, but it helps in + * case the unlink transactions would have problems + * pushing the EFIs out of the way. + */ + xfs_log_force(log->l_mp, (xfs_lsn_t)0, + (XFS_LOG_FORCE | XFS_LOG_SYNC)); + + if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { + xlog_recover_process_iunlinks(log); + } + + xlog_recover_check_summary(log); + + cmn_err(CE_NOTE, + "Ending XFS recovery on filesystem: %s (dev: %s)", + log->l_mp->m_fsname, XFS_BUFTARG_NAME(log->l_targ)); + log->l_flags &= ~XLOG_RECOVERY_NEEDED; + } else { + cmn_err(CE_DEBUG, + "!Ending clean XFS mount for filesystem: %s", + log->l_mp->m_fsname); + } + return 0; +} + + +#if defined(DEBUG) +/* + * Read all of the agf and agi counters and check that they + * are consistent with the superblock counters. + */ +void +xlog_recover_check_summary( + xlog_t *log) +{ + xfs_mount_t *mp; + xfs_agf_t *agfp; + xfs_agi_t *agip; + xfs_buf_t *agfbp; + xfs_buf_t *agibp; + xfs_daddr_t agfdaddr; + xfs_daddr_t agidaddr; + xfs_buf_t *sbbp; +#ifdef XFS_LOUD_RECOVERY + xfs_sb_t *sbp; +#endif + xfs_agnumber_t agno; + __uint64_t freeblks; + __uint64_t itotal; + __uint64_t ifree; + + mp = log->l_mp; + + freeblks = 0LL; + itotal = 0LL; + ifree = 0LL; + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); + agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, + XFS_FSS_TO_BB(mp, 1), 0); + if (XFS_BUF_ISERROR(agfbp)) { + xfs_ioerror_alert("xlog_recover_check_summary(agf)", + mp, agfbp, agfdaddr); + } + agfp = XFS_BUF_TO_AGF(agfbp); + ASSERT(XFS_AGF_MAGIC == + INT_GET(agfp->agf_magicnum, ARCH_CONVERT)); + ASSERT(XFS_AGF_GOOD_VERSION( + INT_GET(agfp->agf_versionnum, ARCH_CONVERT))); + ASSERT(INT_GET(agfp->agf_seqno, ARCH_CONVERT) == agno); + + freeblks += INT_GET(agfp->agf_freeblks, ARCH_CONVERT) + + INT_GET(agfp->agf_flcount, ARCH_CONVERT); + xfs_buf_relse(agfbp); + + agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); + agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, + XFS_FSS_TO_BB(mp, 1), 0); + if (XFS_BUF_ISERROR(agibp)) { + xfs_ioerror_alert("xlog_recover_check_summary(agi)", + mp, agibp, agidaddr); + } + agip = XFS_BUF_TO_AGI(agibp); + ASSERT(XFS_AGI_MAGIC == + INT_GET(agip->agi_magicnum, ARCH_CONVERT)); + ASSERT(XFS_AGI_GOOD_VERSION( + INT_GET(agip->agi_versionnum, ARCH_CONVERT))); + ASSERT(INT_GET(agip->agi_seqno, ARCH_CONVERT) == agno); + + itotal += INT_GET(agip->agi_count, ARCH_CONVERT); + ifree += INT_GET(agip->agi_freecount, ARCH_CONVERT); + xfs_buf_relse(agibp); + } + + sbbp = xfs_getsb(mp, 0); +#ifdef XFS_LOUD_RECOVERY + sbp = &mp->m_sb; + xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS); + cmn_err(CE_NOTE, + "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", + sbp->sb_icount, itotal); + cmn_err(CE_NOTE, + "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", + sbp->sb_ifree, ifree); + cmn_err(CE_NOTE, + "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", + sbp->sb_fdblocks, freeblks); +#if 0 + /* + * This is turned off until I account for the allocation + * btree blocks which live in free space. + */ + ASSERT(sbp->sb_icount == itotal); + ASSERT(sbp->sb_ifree == ifree); + ASSERT(sbp->sb_fdblocks == freeblks); +#endif +#endif + xfs_buf_relse(sbbp); +} +#endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h new file mode 100644 index 00000000000..42158b442b5 --- /dev/null +++ b/fs/xfs/xfs_log_recover.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LOG_RECOVER_H__ +#define __XFS_LOG_RECOVER_H__ + +/* + * Macros, structures, prototypes for internal log manager use. + */ + +#define XLOG_RHASH_BITS 4 +#define XLOG_RHASH_SIZE 16 +#define XLOG_RHASH_SHIFT 2 +#define XLOG_RHASH(tid) \ + ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) + + +/* + * item headers are in ri_buf[0]. Additional buffers follow. + */ +typedef struct xlog_recover_item { + struct xlog_recover_item *ri_next; + struct xlog_recover_item *ri_prev; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ +} xlog_recover_item_t; + +struct xlog_tid; +typedef struct xlog_recover { + struct xlog_recover *r_next; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + xlog_recover_item_t *r_itemq; /* q for items */ +} xlog_recover_t; + +#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) + +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_RECOVER_PASS1 1 +#define XLOG_RECOVER_PASS2 2 + +#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h new file mode 100644 index 00000000000..8d59aaffeb8 --- /dev/null +++ b/fs/xfs/xfs_mac.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_MAC_H__ +#define __XFS_MAC_H__ + +/* + * Mandatory Access Control + * + * Layout of a composite MAC label: + * ml_list contains the list of categories (MSEN) followed by the list of + * divisions (MINT). This is actually a header for the data structure which + * will have an ml_list with more than one element. + * + * ------------------------------- + * | ml_msen_type | ml_mint_type | + * ------------------------------- + * | ml_level | ml_grade | + * ------------------------------- + * | ml_catcount | + * ------------------------------- + * | ml_divcount | + * ------------------------------- + * | category 1 | + * | . . . | + * | category N | (where N = ml_catcount) + * ------------------------------- + * | division 1 | + * | . . . | + * | division M | (where M = ml_divcount) + * ------------------------------- + */ +#define XFS_MAC_MAX_SETS 250 +typedef struct xfs_mac_label { + __uint8_t ml_msen_type; /* MSEN label type */ + __uint8_t ml_mint_type; /* MINT label type */ + __uint8_t ml_level; /* Hierarchical level */ + __uint8_t ml_grade; /* Hierarchical grade */ + __uint16_t ml_catcount; /* Category count */ + __uint16_t ml_divcount; /* Division count */ + /* Category set, then Division set */ + __uint16_t ml_list[XFS_MAC_MAX_SETS]; +} xfs_mac_label_t; + +/* MSEN label type names. Choose an upper case ASCII character. */ +#define XFS_MSEN_ADMIN_LABEL 'A' /* Admin: lowm_ail_lock, "xfs_ail"); + spinlock_init(&mp->m_sb_lock, "xfs_sb"); + mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock"); + initnsema(&mp->m_growlock, 1, "xfs_grow"); + /* + * Initialize the AIL. + */ + xfs_trans_ail_init(mp); + + atomic_set(&mp->m_active_trans, 0); + + return mp; +} + +/* + * Free up the resources associated with a mount structure. Assume that + * the structure was initially zeroed, so we can tell which fields got + * initialized. + */ +void +xfs_mount_free( + xfs_mount_t *mp, + int remove_bhv) +{ + if (mp->m_ihash) + xfs_ihash_free(mp); + if (mp->m_chash) + xfs_chash_free(mp); + + if (mp->m_perag) { + int agno; + + for (agno = 0; agno < mp->m_maxagi; agno++) + if (mp->m_perag[agno].pagb_list) + kmem_free(mp->m_perag[agno].pagb_list, + sizeof(xfs_perag_busy_t) * + XFS_PAGB_NUM_SLOTS); + kmem_free(mp->m_perag, + sizeof(xfs_perag_t) * mp->m_sb.sb_agcount); + } + + AIL_LOCK_DESTROY(&mp->m_ail_lock); + spinlock_destroy(&mp->m_sb_lock); + mutex_destroy(&mp->m_ilock); + freesema(&mp->m_growlock); + if (mp->m_quotainfo) + XFS_QM_DONE(mp); + + if (mp->m_fsname != NULL) + kmem_free(mp->m_fsname, mp->m_fsname_len); + + if (remove_bhv) { + struct vfs *vfsp = XFS_MTOVFS(mp); + + bhv_remove_all_vfsops(vfsp, 0); + VFS_REMOVEBHV(vfsp, &mp->m_bhv); + } + + kmem_free(mp, sizeof(xfs_mount_t)); +} + + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp) +{ + /* + * If the log device and data device have the + * same device number, the log is internal. + * Consequently, the sb_logstart should be non-zero. If + * we have a zero sb_logstart in this case, we may be trying to mount + * a volume filesystem in a non-volume manner. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + cmn_err(CE_WARN, "XFS: bad magic number"); + return XFS_ERROR(EWRONGFS); + } + + if (!XFS_SB_GOOD_VERSION(sbp)) { + cmn_err(CE_WARN, "XFS: bad version"); + return XFS_ERROR(EWRONGFS); + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + cmn_err(CE_WARN, + "XFS: filesystem is marked as having an external log; " + "specify logdev on the\nmount command line."); + XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)", + XFS_ERRLEVEL_HIGH, mp, sbp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + cmn_err(CE_WARN, + "XFS: filesystem is marked as having an internal log; " + "don't specify logdev on\nthe mount command line."); + XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)", + XFS_ERRLEVEL_HIGH, mp, sbp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * More sanity checking. These were stolen directly from + * xfs_repair. + */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + sbp->sb_imax_pct > 100)) { + cmn_err(CE_WARN, "XFS: SB sanity check 1 failed"); + XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)", + XFS_ERRLEVEL_LOW, mp, sbp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Sanity check AG count, size fields against data size field + */ + if (unlikely( + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > + (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || + sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * + sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { + cmn_err(CE_WARN, "XFS: SB sanity check 2 failed"); + XFS_ERROR_REPORT("xfs_mount_validate_sb(4)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); + +#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ + if (unlikely( + (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX || + (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) { +#else /* Limited by UINT_MAX of sectors */ + if (unlikely( + (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || + (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { +#endif + cmn_err(CE_WARN, + "XFS: File system is too large to be mounted on this system."); + return XFS_ERROR(E2BIG); + } + + if (unlikely(sbp->sb_inprogress)) { + cmn_err(CE_WARN, "XFS: file system busy"); + XFS_ERROR_REPORT("xfs_mount_validate_sb(5)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + cmn_err(CE_WARN, + "XFS: Attempted to mount file system with blocksize %d bytes", + sbp->sb_blocksize); + cmn_err(CE_WARN, + "XFS: Only page-sized (%d) or less blocksizes currently work.", + PAGE_SIZE); + return XFS_ERROR(ENOSYS); + } + + return 0; +} + +xfs_agnumber_t +xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index, max_metadata; + xfs_perag_t *pag; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_sb_t *sbp = &mp->m_sb; + xfs_ino_t max_inum = XFS_MAXINUMBER_32; + + /* Check to see if the filesystem can overflow 32 bit inodes */ + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + + /* Clear the mount flag if no inode can overflow 32 bits + * on this filesystem, or if specifically requested.. + */ + if ((mp->m_flags & XFS_MOUNT_32BITINOOPT) && ino > max_inum) { + mp->m_flags |= XFS_MOUNT_32BITINODES; + } else { + mp->m_flags &= ~XFS_MOUNT_32BITINODES; + } + + /* If we can overflow then setup the ag headers accordingly */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + /* Calculate how much should be reserved for inodes to + * meet the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, mp->m_ialloc_blks); + max_metadata = icount; + } else { + max_metadata = agcount; + } + for (index = 0; index < agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + if (ino > max_inum) { + index++; + break; + } + + /* This ag is prefered for inodes */ + pag = &mp->m_perag[index]; + pag->pagi_inodeok = 1; + if (index < max_metadata) + pag->pagf_metadata = 1; + } + } else { + /* Setup default behavior for smaller filesystems */ + for (index = 0; index < agcount; index++) { + pag = &mp->m_perag[index]; + pag->pagi_inodeok = 1; + } + } + return index; +} + +/* + * xfs_xlatesb + * + * data - on disk version of sb + * sb - a superblock + * dir - conversion direction: <0 - convert sb to buf + * >0 - convert buf to sb + * fields - which fields to copy (bitmask) + */ +void +xfs_xlatesb( + void *data, + xfs_sb_t *sb, + int dir, + __int64_t fields) +{ + xfs_caddr_t buf_ptr; + xfs_caddr_t mem_ptr; + xfs_sb_field_t f; + int first; + int size; + + ASSERT(dir); + ASSERT(fields); + + if (!fields) + return; + + buf_ptr = (xfs_caddr_t)data; + mem_ptr = (xfs_caddr_t)sb; + + while (fields) { + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + first = xfs_sb_info[f].offset; + size = xfs_sb_info[f + 1].offset - first; + + ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + + if (size == 1 || xfs_sb_info[f].type == 1) { + if (dir > 0) { + memcpy(mem_ptr + first, buf_ptr + first, size); + } else { + memcpy(buf_ptr + first, mem_ptr + first, size); + } + } else { + switch (size) { + case 2: + INT_XLATE(*(__uint16_t*)(buf_ptr+first), + *(__uint16_t*)(mem_ptr+first), + dir, ARCH_CONVERT); + break; + case 4: + INT_XLATE(*(__uint32_t*)(buf_ptr+first), + *(__uint32_t*)(mem_ptr+first), + dir, ARCH_CONVERT); + break; + case 8: + INT_XLATE(*(__uint64_t*)(buf_ptr+first), + *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT); + break; + default: + ASSERT(0); + } + } + + fields &= ~(1LL << f); + } +} + +/* + * xfs_readsb + * + * Does the initial read of the superblock. + */ +int +xfs_readsb(xfs_mount_t *mp) +{ + unsigned int sector_size; + unsigned int extra_flags; + xfs_buf_t *bp; + xfs_sb_t *sbp; + int error; + + ASSERT(mp->m_sb_bp == NULL); + ASSERT(mp->m_ddev_targp != NULL); + + /* + * Allocate a (locked) buffer to hold the superblock. + * This will be kept around at all times to optimize + * access to the superblock. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; + + bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), extra_flags); + if (!bp || XFS_BUF_ISERROR(bp)) { + cmn_err(CE_WARN, "XFS: SB read failed"); + error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; + goto fail; + } + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + + /* + * Initialize the mount structure from the superblock. + * But first do some basic consistency checking. + */ + sbp = XFS_BUF_TO_SBP(bp); + xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS); + + error = xfs_mount_validate_sb(mp, &(mp->m_sb)); + if (error) { + cmn_err(CE_WARN, "XFS: SB validate failed"); + goto fail; + } + + /* + * We must be able to do sector-sized and sector-aligned IO. + */ + if (sector_size > mp->m_sb.sb_sectsize) { + cmn_err(CE_WARN, + "XFS: device supports only %u byte sectors (not %u)", + sector_size, mp->m_sb.sb_sectsize); + error = ENOSYS; + goto fail; + } + + /* + * If device sector size is smaller than the superblock size, + * re-read the superblock so the buffer is correctly sized. + */ + if (sector_size < mp->m_sb.sb_sectsize) { + XFS_BUF_UNMANAGE(bp); + xfs_buf_relse(bp); + sector_size = mp->m_sb.sb_sectsize; + bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), extra_flags); + if (!bp || XFS_BUF_ISERROR(bp)) { + cmn_err(CE_WARN, "XFS: SB re-read failed"); + error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; + goto fail; + } + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + } + + mp->m_sb_bp = bp; + xfs_buf_relse(bp); + ASSERT(XFS_BUF_VALUSEMA(bp) > 0); + return 0; + + fail: + if (bp) { + XFS_BUF_UNMANAGE(bp); + xfs_buf_relse(bp); + } + return error; +} + + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +void +xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) +{ + int i; + + mp->m_agfrotor = mp->m_agirotor = 0; + spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock"); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_litino = sbp->sb_inodesize - + ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t)); + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + INIT_LIST_HEAD(&mp->m_del_inodes); + + /* + * Setup for attributes, in case they get created. + * This value is for inodes getting attributes for the first time, + * the per-inode value is for old attribute values. + */ + ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048); + switch (sbp->sb_inodesize) { + case 256: + mp->m_attroffset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(2); + break; + case 512: + case 1024: + case 2048: + mp->m_attroffset = XFS_BMDR_SPACE_CALC(12); + break; + default: + ASSERT(0); + } + ASSERT(mp->m_attroffset < XFS_LITINO(mp)); + + for (i = 0; i < 2; i++) { + mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, + xfs_alloc, i == 0); + mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, + xfs_alloc, i == 0); + } + for (i = 0; i < 2; i++) { + mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, + xfs_bmbt, i == 0); + mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, + xfs_bmbt, i == 0); + } + for (i = 0; i < 2; i++) { + mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, + xfs_inobt, i == 0); + mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, + xfs_inobt, i == 0); + } + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} +/* + * xfs_mountfs + * + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + vfs_t *vfsp, + xfs_mount_t *mp, + int mfsi_flags) +{ + xfs_buf_t *bp; + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + vnode_t *rvp = NULL; + int readio_log, writeio_log; + xfs_daddr_t d; + __uint64_t ret64; + __int64_t update_flags; + uint quotamount, quotaflags; + int agno; + int uuid_mounted = 0; + int error = 0; + + if (mp->m_sb_bp == NULL) { + if ((error = xfs_readsb(mp))) { + return (error); + } + } + xfs_mount_common(mp, sbp); + + /* + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. + */ + update_flags = 0LL; + if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + if (mp->m_flags & XFS_MOUNT_RETERR) { + cmn_err(CE_WARN, + "XFS: alignment check 1 failed"); + error = XFS_ERROR(EINVAL); + goto error1; + } + mp->m_dalign = mp->m_swidth = 0; + } else { + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { + if (mp->m_flags & XFS_MOUNT_RETERR) { + error = XFS_ERROR(EINVAL); + goto error1; + } + xfs_fs_cmn_err(CE_WARN, mp, +"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", + mp->m_dalign, mp->m_swidth, + sbp->sb_agblocks); + + mp->m_dalign = 0; + mp->m_swidth = 0; + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + } else { + if (mp->m_flags & XFS_MOUNT_RETERR) { + xfs_fs_cmn_err(CE_WARN, mp, +"stripe alignment turned off: sunit(%d) less than bsize(%d)", + mp->m_dalign, + mp->m_blockmask +1); + error = XFS_ERROR(EINVAL); + goto error1; + } + mp->m_swidth = 0; + } + } + + /* + * Update superblock with new values + * and log changes + */ + if (XFS_SB_VERSION_HASDALIGN(sbp)) { + if (sbp->sb_unit != mp->m_dalign) { + sbp->sb_unit = mp->m_dalign; + update_flags |= XFS_SB_UNIT; + } + if (sbp->sb_width != mp->m_swidth) { + sbp->sb_width = mp->m_swidth; + update_flags |= XFS_SB_WIDTH; + } + } + } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && + XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) { + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; + } + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + if (sbp->sb_imax_pct) { + __uint64_t icount; + + /* Make sure the maximum inode count is a multiple of the + * units we allocate inodes in. + */ + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + do_div(icount, mp->m_ialloc_blks); + mp->m_maxicount = (icount * mp->m_ialloc_blks) << + sbp->sb_inopblog; + } else + mp->m_maxicount = 0; + + mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); + + /* + * XFS uses the uuid from the superblock as the unique + * identifier for fsid. We can not use the uuid from the volume + * since a single partition filesystem is identical to a single + * partition volume/filesystem. + */ + if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && + (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { + if (xfs_uuid_mount(mp)) { + error = XFS_ERROR(EINVAL); + goto error1; + } + uuid_mounted=1; + ret64 = uuid_hash64(&sbp->sb_uuid); + memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64)); + } + + /* + * Set the default minimum read and write sizes unless + * already specified in a mount option. + * We use smaller I/O sizes when the file system + * is being used for NFS service (wsync mount option). + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + if (mp->m_flags & XFS_MOUNT_WSYNC) { + readio_log = XFS_WSYNC_READIO_LOG; + writeio_log = XFS_WSYNC_WRITEIO_LOG; + } else { + readio_log = XFS_READIO_LOG_LARGE; + writeio_log = XFS_WRITEIO_LOG_LARGE; + } + } else { + readio_log = mp->m_readio_log; + writeio_log = mp->m_writeio_log; + } + + /* + * Set the number of readahead buffers to use based on + * physical memory size. + */ + if (xfs_physmem <= 4096) /* <= 16MB */ + mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB; + else if (xfs_physmem <= 8192) /* <= 32MB */ + mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB; + else + mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32; + if (sbp->sb_blocklog > readio_log) { + mp->m_readio_log = sbp->sb_blocklog; + } else { + mp->m_readio_log = readio_log; + } + mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); + if (sbp->sb_blocklog > writeio_log) { + mp->m_writeio_log = sbp->sb_blocklog; + } else { + mp->m_writeio_log = writeio_log; + } + mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); + + /* + * Set the inode cluster size based on the physical memory + * size. This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + */ + if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */ + mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE; + } else { + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + } + /* + * Set whether we're using inode alignment. + */ + if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; + /* + * If we are using stripe alignment, check whether + * the stripe unit is a multiple of the inode alignment + */ + if (mp->m_dalign && mp->m_inoalign_mask && + !(mp->m_dalign & mp->m_inoalign_mask)) + mp->m_sinoalign = mp->m_dalign; + else + mp->m_sinoalign = 0; + /* + * Check that the data (and log if separate) are an ok size. + */ + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { + cmn_err(CE_WARN, "XFS: size check 1 failed"); + error = XFS_ERROR(E2BIG); + goto error1; + } + error = xfs_read_buf(mp, mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, &bp); + if (!error) { + xfs_buf_relse(bp); + } else { + cmn_err(CE_WARN, "XFS: size check 2 failed"); + if (error == ENOSPC) { + error = XFS_ERROR(E2BIG); + } + goto error1; + } + + if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && + mp->m_logdev_targp != mp->m_ddev_targp) { + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + cmn_err(CE_WARN, "XFS: size check 3 failed"); + error = XFS_ERROR(E2BIG); + goto error1; + } + error = xfs_read_buf(mp, mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp); + if (!error) { + xfs_buf_relse(bp); + } else { + cmn_err(CE_WARN, "XFS: size check 3 failed"); + if (error == ENOSPC) { + error = XFS_ERROR(E2BIG); + } + goto error1; + } + } + + /* + * Initialize realtime fields in the mount structure + */ + if ((error = xfs_rtmount_init(mp))) { + cmn_err(CE_WARN, "XFS: RT mount failed"); + goto error1; + } + + /* + * For client case we are done now + */ + if (mfsi_flags & XFS_MFSI_CLIENT) { + return(0); + } + + /* + * Copies the low order bits of the timestamp and the randomly + * set "sequence" number out of a UUID. + */ + uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + + /* + * The vfs structure needs to have a file system independent + * way of checking for the invariant file system ID. Since it + * can't look at mount structures it has a pointer to the data + * in the mount structure. + * + * File systems that don't support user level file handles (i.e. + * all of them except for XFS) will leave vfs_altfsid as NULL. + */ + vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid; + mp->m_dmevmask = 0; /* not persistent; set after each mount */ + + /* + * Select the right directory manager. + */ + mp->m_dirops = + XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ? + xfsv2_dirops : + xfsv1_dirops; + + /* + * Initialize directory manager's entries. + */ + XFS_DIR_MOUNT(mp); + + /* + * Initialize the attribute manager's entries. + */ + mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + + /* + * Initialize the precomputed transaction reservations values. + */ + xfs_trans_init(mp); + + /* + * Allocate and initialize the inode hash table for this + * file system. + */ + xfs_ihash_init(mp); + xfs_chash_init(mp); + + /* + * Allocate and initialize the per-ag data. + */ + init_rwsem(&mp->m_peraglock); + mp->m_perag = + kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); + + mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); + + /* + * log's mount-time initialization. Perform 1st part recovery if needed + */ + if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + cmn_err(CE_WARN, "XFS: log mount failed"); + goto error2; + } + } else { /* No log has been defined */ + cmn_err(CE_WARN, "XFS: no log defined"); + XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto error2; + } + + /* + * Get and sanity-check the root inode. + * Save the pointer to it in the mount structure. + */ + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); + if (error) { + cmn_err(CE_WARN, "XFS: failed to read root inode"); + goto error3; + } + + ASSERT(rip != NULL); + rvp = XFS_ITOV(rip); + + if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + cmn_err(CE_WARN, "XFS: corrupted root inode"); + prdev("Root inode %llu is not a directory", + mp->m_ddev_targp, (unsigned long long)rip->i_ino); + xfs_iunlock(rip, XFS_ILOCK_EXCL); + XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, + mp); + error = XFS_ERROR(EFSCORRUPTED); + goto error4; + } + mp->m_rootip = rip; /* save it */ + + xfs_iunlock(rip, XFS_ILOCK_EXCL); + + /* + * Initialize realtime inode pointers in the mount structure + */ + if ((error = xfs_rtmount_inodes(mp))) { + /* + * Free up the root inode. + */ + cmn_err(CE_WARN, "XFS: failed to read RT inodes"); + goto error4; + } + + /* + * If fs is not mounted readonly, then update the superblock + * unit and width changes. + */ + if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY)) + xfs_mount_log_sbunit(mp, update_flags); + + /* + * Initialise the XFS quota management subsystem for this mount + */ + if ((error = XFS_QM_INIT(mp, "amount, "aflags))) + goto error4; + + /* + * Finish recovering the file system. This part needed to be + * delayed until after the root and real-time bitmap inodes + * were consistently read in. + */ + error = xfs_log_mount_finish(mp, mfsi_flags); + if (error) { + cmn_err(CE_WARN, "XFS: log mount finish failed"); + goto error4; + } + + /* + * Complete the quota initialisation, post-log-replay component. + */ + if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags))) + goto error4; + + return 0; + + error4: + /* + * Free up the root inode. + */ + VN_RELE(rvp); + error3: + xfs_log_unmount_dealloc(mp); + error2: + xfs_ihash_free(mp); + xfs_chash_free(mp); + for (agno = 0; agno < sbp->sb_agcount; agno++) + if (mp->m_perag[agno].pagb_list) + kmem_free(mp->m_perag[agno].pagb_list, + sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS); + kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t)); + mp->m_perag = NULL; + /* FALLTHROUGH */ + error1: + if (uuid_mounted) + xfs_uuid_unmount(mp); + xfs_freesb(mp); + return error; +} + +/* + * xfs_unmountfs + * + * This flushes out the inodes,dquots and the superblock, unmounts the + * log and makes sure that incore structures are freed. + */ +int +xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) +{ + struct vfs *vfsp = XFS_MTOVFS(mp); +#if defined(DEBUG) || defined(INDUCE_IO_ERROR) + int64_t fsid; +#endif + + xfs_iflush_all(mp, XFS_FLUSH_ALL); + + XFS_QM_DQPURGEALL(mp, + XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING); + + /* + * Flush out the log synchronously so that we know for sure + * that nothing is pinned. This is important because bflush() + * will skip pinned buffers. + */ + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + + xfs_binval(mp->m_ddev_targp); + if (mp->m_rtdev_targp) { + xfs_binval(mp->m_rtdev_targp); + } + + xfs_unmountfs_writesb(mp); + + xfs_unmountfs_wait(mp); /* wait for async bufs */ + + xfs_log_unmount(mp); /* Done! No more fs ops. */ + + xfs_freesb(mp); + + /* + * All inodes from this mount point should be freed. + */ + ASSERT(mp->m_inodes == NULL); + + /* + * We may have bufs that are in the process of getting written still. + * We must wait for the I/O completion of those. The sync flag here + * does a two pass iteration thru the bufcache. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */ + } + + xfs_unmountfs_close(mp, cr); + if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) + xfs_uuid_unmount(mp); + +#if defined(DEBUG) || defined(INDUCE_IO_ERROR) + /* + * clear all error tags on this filesystem + */ + memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t)); + xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0); +#endif + XFS_IODONE(vfsp); + xfs_mount_free(mp, 1); + return 0; +} + +void +xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) +{ + if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp, 1); + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp, 1); + xfs_free_buftarg(mp->m_ddev_targp, 0); +} + +void +xfs_unmountfs_wait(xfs_mount_t *mp) +{ + if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp) + xfs_wait_buftarg(mp->m_rtdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); +} + +int +xfs_unmountfs_writesb(xfs_mount_t *mp) +{ + xfs_buf_t *sbp; + xfs_sb_t *sb; + int error = 0; + + /* + * skip superblock write if fs is read-only, or + * if we are doing a forced umount. + */ + sbp = xfs_getsb(mp, 0); + if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY || + XFS_FORCED_SHUTDOWN(mp))) { + /* + * mark shared-readonly if desired + */ + sb = XFS_BUF_TO_SBP(sbp); + if (mp->m_mk_sharedro) { + if (!(sb->sb_flags & XFS_SBF_READONLY)) + sb->sb_flags |= XFS_SBF_READONLY; + if (!XFS_SB_VERSION_HASSHARED(sb)) + XFS_SB_VERSION_ADDSHARED(sb); + xfs_fs_cmn_err(CE_NOTE, mp, + "Unmounting, marking shared read-only"); + } + XFS_BUF_UNDONE(sbp); + XFS_BUF_UNREAD(sbp); + XFS_BUF_UNDELAYWRITE(sbp); + XFS_BUF_WRITE(sbp); + XFS_BUF_UNASYNC(sbp); + ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); + xfsbdstrat(mp, sbp); + /* Nevermind errors we might get here. */ + error = xfs_iowait(sbp); + if (error) + xfs_ioerror_alert("xfs_unmountfs_writesb", + mp, sbp, XFS_BUF_ADDR(sbp)); + if (error && mp->m_mk_sharedro) + xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); + } + xfs_buf_relse(sbp); + return (error); +} + +/* + * xfs_mod_sb() can be used to copy arbitrary changes to the + * in-core superblock into the superblock buffer to be logged. + * It does not provide the higher level of locking that is + * needed to protect the in-core superblock from concurrent + * access. + */ +void +xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +{ + xfs_buf_t *bp; + int first; + int last; + xfs_mount_t *mp; + xfs_sb_t *sbp; + xfs_sb_field_t f; + + ASSERT(fields); + if (!fields) + return; + mp = tp->t_mountp; + bp = xfs_trans_getsb(tp, mp, 0); + sbp = XFS_BUF_TO_SBP(bp); + first = sizeof(xfs_sb_t); + last = 0; + + /* translate/copy */ + + xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields); + + /* find modified range */ + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; + + xfs_trans_log_buf(tp, bp, first, last); +} + +/* + * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply + * a delta to a specified field in the in-core superblock. Simply + * switch on the field indicated and apply the delta to that field. + * Fields are not allowed to dip below zero, so if the delta would + * do this do not apply it and return EINVAL. + * + * The SB_LOCK must be held when this routine is called. + */ +STATIC int +xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, + int delta, int rsvd) +{ + int scounter; /* short counter for 32 bit fields */ + long long lcounter; /* long counter for 64 bit fields */ + long long res_used, rem; + + /* + * With the in-core superblock spin lock held, switch + * on the indicated field. Apply the delta to the + * proper field. If the fields value would dip below + * 0, then do not apply the delta and return EINVAL. + */ + switch (field) { + case XFS_SBS_ICOUNT: + lcounter = (long long)mp->m_sb.sb_icount; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_icount = lcounter; + return (0); + case XFS_SBS_IFREE: + lcounter = (long long)mp->m_sb.sb_ifree; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_ifree = lcounter; + return (0); + case XFS_SBS_FDBLOCKS: + + lcounter = (long long)mp->m_sb.sb_fdblocks; + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (delta > 0) { /* Putting blocks back */ + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + rem = delta - res_used; + mp->m_resblks_avail = mp->m_resblks; + lcounter += rem; + } + } else { /* Taking blocks away */ + + lcounter += delta; + + /* + * If were out of blocks, use any available reserved blocks if + * were allowed to. + */ + + if (lcounter < 0) { + if (rsvd) { + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter < 0) { + return (XFS_ERROR(ENOSPC)); + } + mp->m_resblks_avail = lcounter; + return (0); + } else { /* not reserved */ + return (XFS_ERROR(ENOSPC)); + } + } + } + + mp->m_sb.sb_fdblocks = lcounter; + return (0); + case XFS_SBS_FREXTENTS: + lcounter = (long long)mp->m_sb.sb_frextents; + lcounter += delta; + if (lcounter < 0) { + return (XFS_ERROR(ENOSPC)); + } + mp->m_sb.sb_frextents = lcounter; + return (0); + case XFS_SBS_DBLOCKS: + lcounter = (long long)mp->m_sb.sb_dblocks; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_dblocks = lcounter; + return (0); + case XFS_SBS_AGCOUNT: + scounter = mp->m_sb.sb_agcount; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_agcount = scounter; + return (0); + case XFS_SBS_IMAX_PCT: + scounter = mp->m_sb.sb_imax_pct; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_imax_pct = scounter; + return (0); + case XFS_SBS_REXTSIZE: + scounter = mp->m_sb.sb_rextsize; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_rextsize = scounter; + return (0); + case XFS_SBS_RBMBLOCKS: + scounter = mp->m_sb.sb_rbmblocks; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_rbmblocks = scounter; + return (0); + case XFS_SBS_RBLOCKS: + lcounter = (long long)mp->m_sb.sb_rblocks; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_rblocks = lcounter; + return (0); + case XFS_SBS_REXTENTS: + lcounter = (long long)mp->m_sb.sb_rextents; + lcounter += delta; + if (lcounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_rextents = lcounter; + return (0); + case XFS_SBS_REXTSLOG: + scounter = mp->m_sb.sb_rextslog; + scounter += delta; + if (scounter < 0) { + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } + mp->m_sb.sb_rextslog = scounter; + return (0); + default: + ASSERT(0); + return (XFS_ERROR(EINVAL)); + } +} + +/* + * xfs_mod_incore_sb() is used to change a field in the in-core + * superblock structure by the specified delta. This modification + * is protected by the SB_LOCK. Just use the xfs_mod_incore_sb_unlocked() + * routine to do the work. + */ +int +xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd) +{ + unsigned long s; + int status; + + s = XFS_SB_LOCK(mp); + status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + XFS_SB_UNLOCK(mp, s); + return (status); +} + +/* + * xfs_mod_incore_sb_batch() is used to change more than one field + * in the in-core superblock structure at a time. This modification + * is protected by a lock internal to this module. The fields and + * changes to those fields are specified in the array of xfs_mod_sb + * structures passed in. + * + * Either all of the specified deltas will be applied or none of + * them will. If any modified field dips below 0, then all modifications + * will be backed out and EINVAL will be returned. + */ +int +xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) +{ + unsigned long s; + int status=0; + xfs_mod_sb_t *msbp; + + /* + * Loop through the array of mod structures and apply each + * individually. If any fail, then back out all those + * which have already been applied. Do all of this within + * the scope of the SB_LOCK so that all of the changes will + * be atomic. + */ + s = XFS_SB_LOCK(mp); + msbp = &msb[0]; + for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { + /* + * Apply the delta at index n. If it fails, break + * from the loop so we'll fall into the undo loop + * below. + */ + status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + msbp->msb_delta, rsvd); + if (status != 0) { + break; + } + } + + /* + * If we didn't complete the loop above, then back out + * any changes made to the superblock. If you add code + * between the loop above and here, make sure that you + * preserve the value of status. Loop back until + * we step below the beginning of the array. Make sure + * we don't touch anything back there. + */ + if (status != 0) { + msbp--; + while (msbp >= msb) { + status = xfs_mod_incore_sb_unlocked(mp, + msbp->msb_field, -(msbp->msb_delta), rsvd); + ASSERT(status == 0); + msbp--; + } + } + XFS_SB_UNLOCK(mp, s); + return (status); +} + +/* + * xfs_getsb() is called to obtain the buffer for the superblock. + * The buffer is returned locked and read in from disk. + * The buffer should be released with a call to xfs_brelse(). + * + * If the flags parameter is BUF_TRYLOCK, then we'll only return + * the superblock buffer if it can be locked without sleeping. + * If it can't then we'll return NULL. + */ +xfs_buf_t * +xfs_getsb( + xfs_mount_t *mp, + int flags) +{ + xfs_buf_t *bp; + + ASSERT(mp->m_sb_bp != NULL); + bp = mp->m_sb_bp; + if (flags & XFS_BUF_TRYLOCK) { + if (!XFS_BUF_CPSEMA(bp)) { + return NULL; + } + } else { + XFS_BUF_PSEMA(bp, PRIBIO); + } + XFS_BUF_HOLD(bp); + ASSERT(XFS_BUF_ISDONE(bp)); + return (bp); +} + +/* + * Used to free the superblock along various error paths. + */ +void +xfs_freesb( + xfs_mount_t *mp) +{ + xfs_buf_t *bp; + + /* + * Use xfs_getsb() so that the buffer will be locked + * when we call xfs_buf_relse(). + */ + bp = xfs_getsb(mp, 0); + XFS_BUF_UNMANAGE(bp); + xfs_buf_relse(bp); + mp->m_sb_bp = NULL; +} + +/* + * See if the UUID is unique among mounted XFS filesystems. + * Mount fails if UUID is nil or a FS with the same UUID is already mounted. + */ +STATIC int +xfs_uuid_mount( + xfs_mount_t *mp) +{ + if (uuid_is_nil(&mp->m_sb.sb_uuid)) { + cmn_err(CE_WARN, + "XFS: Filesystem %s has nil UUID - can't mount", + mp->m_fsname); + return -1; + } + if (!uuid_table_insert(&mp->m_sb.sb_uuid)) { + cmn_err(CE_WARN, + "XFS: Filesystem %s has duplicate UUID - can't mount", + mp->m_fsname); + return -1; + } + return 0; +} + +/* + * Remove filesystem from the UUID table. + */ +STATIC void +xfs_uuid_unmount( + xfs_mount_t *mp) +{ + uuid_table_remove(&mp->m_sb.sb_uuid); +} + +/* + * Used to log changes to the superblock unit and width fields which could + * be altered by the mount options. Only the first superblock is updated. + */ +STATIC void +xfs_mount_log_sbunit( + xfs_mount_t *mp, + __int64_t fields) +{ + xfs_trans_t *tp; + + ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID)); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); + if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT)) { + xfs_trans_cancel(tp, 0); + return; + } + xfs_mod_sb(tp, fields); + xfs_trans_commit(tp, 0, NULL); +} diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h new file mode 100644 index 00000000000..5fc6201dd8e --- /dev/null +++ b/fs/xfs/xfs_mount.h @@ -0,0 +1,573 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_MOUNT_H__ +#define __XFS_MOUNT_H__ + + +typedef struct xfs_trans_reservations { + uint tr_write; /* extent alloc trans */ + uint tr_itruncate; /* truncate trans */ + uint tr_rename; /* rename trans */ + uint tr_link; /* link trans */ + uint tr_remove; /* unlink trans */ + uint tr_symlink; /* symlink trans */ + uint tr_create; /* create trans */ + uint tr_mkdir; /* mkdir trans */ + uint tr_ifree; /* inode free trans */ + uint tr_ichange; /* inode update trans */ + uint tr_growdata; /* fs data section grow trans */ + uint tr_swrite; /* sync write inode trans */ + uint tr_addafork; /* cvt inode to attributed trans */ + uint tr_writeid; /* write setuid/setgid file */ + uint tr_attrinval; /* attr fork buffer invalidation */ + uint tr_attrset; /* set/create an attribute */ + uint tr_attrrm; /* remove an attribute */ + uint tr_clearagi; /* clear bad agi unlinked ino bucket */ + uint tr_growrtalloc; /* grow realtime allocations */ + uint tr_growrtzero; /* grow realtime zeroing */ + uint tr_growrtfree; /* grow realtime freeing */ +} xfs_trans_reservations_t; + + +#ifndef __KERNEL__ +/* + * Moved here from xfs_ag.h to avoid reordering header files + */ +#define XFS_DADDR_TO_AGNO(mp,d) \ + ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) +#define XFS_DADDR_TO_AGBNO(mp,d) \ + ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) +#else +struct cred; +struct log; +struct vfs; +struct vnode; +struct xfs_mount_args; +struct xfs_ihash; +struct xfs_chash; +struct xfs_inode; +struct xfs_perag; +struct xfs_iocore; +struct xfs_bmbt_irec; +struct xfs_bmap_free; + +#define AIL_LOCK_T lock_t +#define AIL_LOCKINIT(x,y) spinlock_init(x,y) +#define AIL_LOCK_DESTROY(x) spinlock_destroy(x) +#define AIL_LOCK(mp,s) s=mutex_spinlock(&(mp)->m_ail_lock) +#define AIL_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_ail_lock, s) + + +/* + * Prototypes and functions for the Data Migration subsystem. + */ + +typedef int (*xfs_send_data_t)(int, struct vnode *, + xfs_off_t, size_t, int, vrwlock_t *); +typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); +typedef int (*xfs_send_destroy_t)(struct vnode *, dm_right_t); +typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct vfs *, + struct vnode *, + dm_right_t, struct vnode *, dm_right_t, + char *, char *, mode_t, int, int); +typedef void (*xfs_send_unmount_t)(struct vfs *, struct vnode *, + dm_right_t, mode_t, int, int); + +typedef struct xfs_dmops { + xfs_send_data_t xfs_send_data; + xfs_send_mmap_t xfs_send_mmap; + xfs_send_destroy_t xfs_send_destroy; + xfs_send_namesp_t xfs_send_namesp; + xfs_send_unmount_t xfs_send_unmount; +} xfs_dmops_t; + +#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \ + (*(mp)->m_dm_ops.xfs_send_data)(ev,vp,off,len,fl,lock) +#define XFS_SEND_MMAP(mp, vma,fl) \ + (*(mp)->m_dm_ops.xfs_send_mmap)(vma,fl) +#define XFS_SEND_DESTROY(mp, vp,right) \ + (*(mp)->m_dm_ops.xfs_send_destroy)(vp,right) +#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ + (*(mp)->m_dm_ops.xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) +#define XFS_SEND_PREUNMOUNT(mp, vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ + (*(mp)->m_dm_ops.xfs_send_namesp)(DM_EVENT_PREUNMOUNT,vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) +#define XFS_SEND_UNMOUNT(mp, vfsp,vp,right,mode,rval,fl) \ + (*(mp)->m_dm_ops.xfs_send_unmount)(vfsp,vp,right,mode,rval,fl) + + +/* + * Prototypes and functions for the Quota Management subsystem. + */ + +struct xfs_dquot; +struct xfs_dqtrxops; +struct xfs_quotainfo; + +typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); +typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); +typedef int (*xfs_qmunmount_t)(struct xfs_mount *); +typedef void (*xfs_qmdone_t)(struct xfs_mount *); +typedef void (*xfs_dqrele_t)(struct xfs_dquot *); +typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); +typedef void (*xfs_dqdetach_t)(struct xfs_inode *); +typedef int (*xfs_dqpurgeall_t)(struct xfs_mount *, uint); +typedef int (*xfs_dqvopalloc_t)(struct xfs_mount *, + struct xfs_inode *, uid_t, gid_t, uint, + struct xfs_dquot **, struct xfs_dquot **); +typedef void (*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *); +typedef int (*xfs_dqvoprename_t)(struct xfs_inode **); +typedef struct xfs_dquot * (*xfs_dqvopchown_t)( + struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot **, struct xfs_dquot *); +typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, uint); + +typedef struct xfs_qmops { + xfs_qminit_t xfs_qminit; + xfs_qmdone_t xfs_qmdone; + xfs_qmmount_t xfs_qmmount; + xfs_qmunmount_t xfs_qmunmount; + xfs_dqrele_t xfs_dqrele; + xfs_dqattach_t xfs_dqattach; + xfs_dqdetach_t xfs_dqdetach; + xfs_dqpurgeall_t xfs_dqpurgeall; + xfs_dqvopalloc_t xfs_dqvopalloc; + xfs_dqvopcreate_t xfs_dqvopcreate; + xfs_dqvoprename_t xfs_dqvoprename; + xfs_dqvopchown_t xfs_dqvopchown; + xfs_dqvopchownresv_t xfs_dqvopchownresv; + struct xfs_dqtrxops *xfs_dqtrxops; +} xfs_qmops_t; + +#define XFS_QM_INIT(mp, mnt, fl) \ + (*(mp)->m_qm_ops.xfs_qminit)(mp, mnt, fl) +#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ + (*(mp)->m_qm_ops.xfs_qmmount)(mp, mnt, fl, mfsi_flags) +#define XFS_QM_UNMOUNT(mp) \ + (*(mp)->m_qm_ops.xfs_qmunmount)(mp) +#define XFS_QM_DONE(mp) \ + (*(mp)->m_qm_ops.xfs_qmdone)(mp) +#define XFS_QM_DQRELE(mp, dq) \ + (*(mp)->m_qm_ops.xfs_dqrele)(dq) +#define XFS_QM_DQATTACH(mp, ip, fl) \ + (*(mp)->m_qm_ops.xfs_dqattach)(ip, fl) +#define XFS_QM_DQDETACH(mp, ip) \ + (*(mp)->m_qm_ops.xfs_dqdetach)(ip) +#define XFS_QM_DQPURGEALL(mp, fl) \ + (*(mp)->m_qm_ops.xfs_dqpurgeall)(mp, fl) +#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, fl, dq1, dq2) \ + (*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, fl, dq1, dq2) +#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \ + (*(mp)->m_qm_ops.xfs_dqvopcreate)(tp, ip, dq1, dq2) +#define XFS_QM_DQVOPRENAME(mp, ip) \ + (*(mp)->m_qm_ops.xfs_dqvoprename)(ip) +#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \ + (*(mp)->m_qm_ops.xfs_dqvopchown)(tp, ip, dqp, dq) +#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \ + (*(mp)->m_qm_ops.xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl) + + +/* + * Prototypes and functions for I/O core modularization. + */ + +typedef int (*xfs_ioinit_t)(struct vfs *, + struct xfs_mount_args *, int); +typedef int (*xfs_bmapi_t)(struct xfs_trans *, void *, + xfs_fileoff_t, xfs_filblks_t, int, + xfs_fsblock_t *, xfs_extlen_t, + struct xfs_bmbt_irec *, int *, + struct xfs_bmap_free *); +typedef int (*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *); +typedef int (*xfs_iomap_write_direct_t)( + void *, loff_t, size_t, int, + struct xfs_bmbt_irec *, int *, int); +typedef int (*xfs_iomap_write_delay_t)( + void *, loff_t, size_t, int, + struct xfs_bmbt_irec *, int *); +typedef int (*xfs_iomap_write_allocate_t)( + void *, struct xfs_bmbt_irec *, int *); +typedef int (*xfs_iomap_write_unwritten_t)( + void *, loff_t, size_t); +typedef uint (*xfs_lck_map_shared_t)(void *); +typedef void (*xfs_lock_t)(void *, uint); +typedef void (*xfs_lock_demote_t)(void *, uint); +typedef int (*xfs_lock_nowait_t)(void *, uint); +typedef void (*xfs_unlk_t)(void *, unsigned int); +typedef xfs_fsize_t (*xfs_size_t)(void *); +typedef xfs_fsize_t (*xfs_iodone_t)(struct vfs *); + +typedef struct xfs_ioops { + xfs_ioinit_t xfs_ioinit; + xfs_bmapi_t xfs_bmapi_func; + xfs_bmap_eof_t xfs_bmap_eof_func; + xfs_iomap_write_direct_t xfs_iomap_write_direct; + xfs_iomap_write_delay_t xfs_iomap_write_delay; + xfs_iomap_write_allocate_t xfs_iomap_write_allocate; + xfs_iomap_write_unwritten_t xfs_iomap_write_unwritten; + xfs_lock_t xfs_ilock; + xfs_lck_map_shared_t xfs_lck_map_shared; + xfs_lock_demote_t xfs_ilock_demote; + xfs_lock_nowait_t xfs_ilock_nowait; + xfs_unlk_t xfs_unlock; + xfs_size_t xfs_size_func; + xfs_iodone_t xfs_iodone; +} xfs_ioops_t; + +#define XFS_IOINIT(vfsp, args, flags) \ + (*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags) +#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist) \ + (*(mp)->m_io_ops.xfs_bmapi_func) \ + (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist) +#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \ + (*(mp)->m_io_ops.xfs_bmap_eof_func) \ + ((io)->io_obj, endoff, whichfork, eof) +#define XFS_IOMAP_WRITE_DIRECT(mp, io, offset, count, flags, mval, nmap, found)\ + (*(mp)->m_io_ops.xfs_iomap_write_direct) \ + ((io)->io_obj, offset, count, flags, mval, nmap, found) +#define XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, flags, mval, nmap) \ + (*(mp)->m_io_ops.xfs_iomap_write_delay) \ + ((io)->io_obj, offset, count, flags, mval, nmap) +#define XFS_IOMAP_WRITE_ALLOCATE(mp, io, mval, nmap) \ + (*(mp)->m_io_ops.xfs_iomap_write_allocate) \ + ((io)->io_obj, mval, nmap) +#define XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count) \ + (*(mp)->m_io_ops.xfs_iomap_write_unwritten) \ + ((io)->io_obj, offset, count) +#define XFS_LCK_MAP_SHARED(mp, io) \ + (*(mp)->m_io_ops.xfs_lck_map_shared)((io)->io_obj) +#define XFS_ILOCK(mp, io, mode) \ + (*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode) +#define XFS_ILOCK_NOWAIT(mp, io, mode) \ + (*(mp)->m_io_ops.xfs_ilock_nowait)((io)->io_obj, mode) +#define XFS_IUNLOCK(mp, io, mode) \ + (*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode) +#define XFS_ILOCK_DEMOTE(mp, io, mode) \ + (*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode) +#define XFS_SIZE(mp, io) \ + (*(mp)->m_io_ops.xfs_size_func)((io)->io_obj) +#define XFS_IODONE(vfsp) \ + (*(mp)->m_io_ops.xfs_iodone)(vfsp) + + +typedef struct xfs_mount { + bhv_desc_t m_bhv; /* vfs xfs behavior */ + xfs_tid_t m_tid; /* next unused tid for fs */ + AIL_LOCK_T m_ail_lock; /* fs AIL mutex */ + xfs_ail_entry_t m_ail; /* fs active log item list */ + uint m_ail_gen; /* fs AIL generation count */ + xfs_sb_t m_sb; /* copy of fs superblock */ + lock_t m_sb_lock; /* sb counter mutex */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ + char *m_fsname; /* filesystem name */ + int m_fsname_len; /* strlen of fs name */ + int m_bsize; /* fs logical block size */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + lock_t m_agirotor_lock;/* .. and lock protecting it */ + xfs_agnumber_t m_maxagi; /* highest inode alloc group */ + uint m_ihsize; /* size of next field */ + struct xfs_ihash *m_ihash; /* fs private inode hash table*/ + struct xfs_inode *m_inodes; /* active inode list */ + struct list_head m_del_inodes; /* inodes to reclaim */ + mutex_t m_ilock; /* inode list mutex */ + uint m_ireclaims; /* count of calls to reclaim*/ + uint m_readio_log; /* min read size log bytes */ + uint m_readio_blocks; /* min read size blocks */ + uint m_writeio_log; /* min write size log bytes */ + uint m_writeio_blocks; /* min write size blocks */ + struct log *m_log; /* log specific stuff */ + int m_logbufs; /* number of log buffers */ + int m_logbsize; /* size of each log buffer */ + uint m_rsumlevels; /* rt summary levels */ + uint m_rsumsize; /* size of rt summary, bytes */ + struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ + struct xfs_inode *m_rsumip; /* pointer to summary inode */ + struct xfs_inode *m_rootip; /* pointer to root directory */ + struct xfs_quotainfo *m_quotainfo; /* disk quota information */ + xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ + xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ + xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ +#define m_dev m_ddev_targp->pbr_dev + __uint8_t m_dircook_elog; /* log d-cookie entry bits */ + __uint8_t m_blkbit_log; /* blocklog + NBBY */ + __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ + __uint8_t m_agno_log; /* log #ag's */ + __uint8_t m_agino_log; /* #bits for agino in inum */ + __uint8_t m_nreadaheads; /* #readahead buffers */ + __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_blockmask; /* sb_blocksize-1 */ + uint m_blockwsize; /* sb_blocksize in words */ + uint m_blockwmask; /* blockwsize-1 */ + uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ + uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ + uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ + uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ + uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ + uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ + uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ + uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ + uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ + struct xfs_perag *m_perag; /* per-ag accounting info */ + struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + sema_t m_growlock; /* growfs mutex */ + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint m_dmevmask; /* DMI events for this FS */ + uint m_flags; /* global mount flags */ + uint m_attroffset; /* inode attribute offset */ + uint m_dir_node_ents; /* #entries in a dir danode */ + uint m_attr_node_ents; /* #entries in attr danode */ + int m_ialloc_inos; /* inodes in inode allocation */ + int m_ialloc_blks; /* blocks in inode allocation */ + int m_litino; /* size of inode union area */ + int m_inoalign_mask;/* mask sb_inoalignmt if used */ + uint m_qflags; /* quota status flags */ + xfs_trans_reservations_t m_reservations;/* precomputed res values */ + __uint64_t m_maxicount; /* maximum inode count */ + __uint64_t m_maxioffset; /* maximum inode offset */ + __uint64_t m_resblks; /* total reserved blocks */ + __uint64_t m_resblks_avail;/* available reserved blocks */ +#if XFS_BIG_INUMS + xfs_ino_t m_inoadd; /* add value for ino64_offset */ +#endif + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + int m_sinoalign; /* stripe unit inode alignmnt */ + int m_attr_magicpct;/* 37% of the blocksize */ + int m_dir_magicpct; /* 37% of the dir blocksize */ + __uint8_t m_mk_sharedro; /* mark shared ro on unmount */ + __uint8_t m_inode_quiesce;/* call quiesce on new inodes. + field governed by m_ilock */ + __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + __uint8_t m_dirversion; /* 1 or 2 */ + xfs_dirops_t m_dirops; /* table of dir funcs */ + int m_dirblksize; /* directory block sz--bytes */ + int m_dirblkfsbs; /* directory block sz--fsbs */ + xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ + xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ + xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ + uint m_chsize; /* size of next field */ + struct xfs_chash *m_chash; /* fs private inode per-cluster + * hash table */ + struct xfs_dmops m_dm_ops; /* vector of DMI ops */ + struct xfs_qmops m_qm_ops; /* vector of XQM ops */ + struct xfs_ioops m_io_ops; /* vector of I/O ops */ + atomic_t m_active_trans; /* number trans frozen */ +} xfs_mount_t; + +/* + * Flags for m_flags. + */ +#define XFS_MOUNT_WSYNC 0x00000001 /* for nfs - all metadata ops + must be synchronous except + for space allocations */ +#define XFS_MOUNT_INO64 0x00000002 + /* 0x00000004 -- currently unused */ + /* 0x00000008 -- currently unused */ +#define XFS_MOUNT_FS_SHUTDOWN 0x00000010 /* atomic stop of all filesystem + operations, typically for + disk errors in metadata */ +#define XFS_MOUNT_NOATIME 0x00000020 /* don't modify inode access + times on reads */ +#define XFS_MOUNT_RETERR 0x00000040 /* return alignment errors to + user */ +#define XFS_MOUNT_NOALIGN 0x00000080 /* turn off stripe alignment + allocations */ + /* 0x00000100 -- currently unused */ + /* 0x00000200 -- currently unused */ +#define XFS_MOUNT_NORECOVERY 0x00000400 /* no recovery - dirty fs */ +#define XFS_MOUNT_SHARED 0x00000800 /* shared mount */ +#define XFS_MOUNT_DFLT_IOSIZE 0x00001000 /* set default i/o size */ +#define XFS_MOUNT_OSYNCISOSYNC 0x00002000 /* o_sync is REALLY o_sync */ + /* osyncisdsync is now default*/ +#define XFS_MOUNT_32BITINODES 0x00004000 /* do not create inodes above + * 32 bits in size */ +#define XFS_MOUNT_32BITINOOPT 0x00008000 /* saved mount option state */ +#define XFS_MOUNT_NOUUID 0x00010000 /* ignore uuid during mount */ +#define XFS_MOUNT_NOLOGFLUSH 0x00020000 +#define XFS_MOUNT_IDELETE 0x00040000 /* delete empty inode clusters*/ +#define XFS_MOUNT_SWALLOC 0x00080000 /* turn on stripe width + * allocation */ +#define XFS_MOUNT_IHASHSIZE 0x00100000 /* inode hash table size */ +#define XFS_MOUNT_DIRSYNC 0x00200000 /* synchronous directory ops */ + +/* + * Default minimum read and write sizes. + */ +#define XFS_READIO_LOG_LARGE 16 +#define XFS_WRITEIO_LOG_LARGE 16 + +/* + * Max and min values for UIO and mount-option defined I/O sizes; + * min value can't be less than a page. Currently unused. + */ +#define XFS_MAX_IO_LOG 16 /* 64K */ +#define XFS_MIN_IO_LOG PAGE_SHIFT + +/* + * Synchronous read and write sizes. This should be + * better for NFSv2 wsync filesystems. + */ +#define XFS_WSYNC_READIO_LOG 15 /* 32K */ +#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */ + +#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) + +#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +#define xfs_force_shutdown(m,f) \ + VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__) + +/* + * Flags sent to xfs_force_shutdown. + */ +#define XFS_METADATA_IO_ERROR 0x1 +#define XFS_LOG_IO_ERROR 0x2 +#define XFS_FORCE_UMOUNT 0x4 +#define XFS_CORRUPT_INCORE 0x8 /* Corrupt in-memory data structures */ +#define XFS_SHUTDOWN_REMOTE_REQ 0x10 /* Shutdown came from remote cell */ + +/* + * xflags for xfs_syncsub + */ +#define XFS_XSYNC_RELOC 0x01 + +/* + * Flags for xfs_mountfs + */ +#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ +#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ +#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ + /* log recovery */ +#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ + +/* + * Macros for getting from mount to vfs and back. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MTOVFS) +struct vfs *xfs_mtovfs(xfs_mount_t *mp); +#define XFS_MTOVFS(mp) xfs_mtovfs(mp) +#else +#define XFS_MTOVFS(mp) (bhvtovfs(&(mp)->m_bhv)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOM) +xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp); +#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp) +#else +#define XFS_BHVTOM(bdp) ((xfs_mount_t *)BHV_PDATA(bdp)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_VFSTOM) +xfs_mount_t *xfs_vfstom(vfs_t *vfs); +#define XFS_VFSTOM(vfs) xfs_vfstom(vfs) +#else +#define XFS_VFSTOM(vfs) \ + (XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops))) +#endif + + +/* + * Moved here from xfs_ag.h to avoid reordering header files + */ + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGNO) +xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d); +#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) +#else + +static inline xfs_agnumber_t XFS_DADDR_TO_AGNO(xfs_mount_t *mp, xfs_daddr_t d) +{ + d = XFS_BB_TO_FSBT(mp, d); + do_div(d, mp->m_sb.sb_agblocks); + return (xfs_agnumber_t) d; +} + +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGBNO) +xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d); +#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d) +#else + +static inline xfs_agblock_t XFS_DADDR_TO_AGBNO(xfs_mount_t *mp, xfs_daddr_t d) +{ + d = XFS_BB_TO_FSBT(mp, d); + return (xfs_agblock_t) do_div(d, mp->m_sb.sb_agblocks); +} + +#endif + +/* + * This structure is for use by the xfs_mod_incore_sb_batch() routine. + */ +typedef struct xfs_mod_sb { + xfs_sb_field_t msb_field; /* Field to modify, see below */ + int msb_delta; /* Change to make to specified field */ +} xfs_mod_sb_t; + +#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock), PINOD) +#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) +#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock) +#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s)) + +extern xfs_mount_t *xfs_mount_init(void); +extern void xfs_mod_sb(xfs_trans_t *, __int64_t); +extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); +extern int xfs_mountfs(struct vfs *, xfs_mount_t *mp, int); + +extern int xfs_unmountfs(xfs_mount_t *, struct cred *); +extern void xfs_unmountfs_wait(xfs_mount_t *); +extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); +extern int xfs_unmountfs_writesb(xfs_mount_t *); +extern int xfs_unmount_flush(xfs_mount_t *, int); +extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int); +extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, + uint, int); +extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); +extern int xfs_readsb(xfs_mount_t *mp); +extern void xfs_freesb(xfs_mount_t *); +extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); +extern int xfs_syncsub(xfs_mount_t *, int, int, int *); +extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t); +extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t); + +extern struct vfsops xfs_vfsops; +extern struct vnodeops xfs_vnodeops; + +extern struct xfs_dmops xfs_dmcore_stub; +extern struct xfs_qmops xfs_qmcore_stub; +extern struct xfs_ioops xfs_iocore_xfs; + +extern int xfs_init(void); +extern void xfs_cleanup(void); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c new file mode 100644 index 00000000000..4f40c92863d --- /dev/null +++ b/fs/xfs/xfs_qmops.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#include "xfs.h" + +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + + +STATIC struct xfs_dquot * +xfs_dqvopchown_default( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot **dqp, + struct xfs_dquot *dq) +{ + return NULL; +} + +xfs_qmops_t xfs_qmcore_stub = { + .xfs_qminit = (xfs_qminit_t) fs_noerr, + .xfs_qmdone = (xfs_qmdone_t) fs_noerr, + .xfs_qmmount = (xfs_qmmount_t) fs_noerr, + .xfs_qmunmount = (xfs_qmunmount_t) fs_noerr, + .xfs_dqrele = (xfs_dqrele_t) fs_noerr, + .xfs_dqattach = (xfs_dqattach_t) fs_noerr, + .xfs_dqdetach = (xfs_dqdetach_t) fs_noerr, + .xfs_dqpurgeall = (xfs_dqpurgeall_t) fs_noerr, + .xfs_dqvopalloc = (xfs_dqvopalloc_t) fs_noerr, + .xfs_dqvopcreate = (xfs_dqvopcreate_t) fs_noerr, + .xfs_dqvoprename = (xfs_dqvoprename_t) fs_noerr, + .xfs_dqvopchown = xfs_dqvopchown_default, + .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, +}; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h new file mode 100644 index 00000000000..703ec4efcb4 --- /dev/null +++ b/fs/xfs/xfs_quota.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_QUOTA_H__ +#define __XFS_QUOTA_H__ + +/* + * The ondisk form of a dquot structure. + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __int32_t xfs_dqid_t; + +/* + * Eventhough users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. + */ +typedef struct xfs_disk_dquot { +/*16*/ u_int16_t d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ +/*8 */ u_int8_t d_version; /* dquot version */ +/*8 */ u_int8_t d_flags; /* XFS_DQ_USER/PROJ/GROUP */ +/*32*/ xfs_dqid_t d_id; /* user,project,group id */ +/*64*/ xfs_qcnt_t d_blk_hardlimit;/* absolute limit on disk blks */ +/*64*/ xfs_qcnt_t d_blk_softlimit;/* preferred limit on disk blks */ +/*64*/ xfs_qcnt_t d_ino_hardlimit;/* maximum # allocated inodes */ +/*64*/ xfs_qcnt_t d_ino_softlimit;/* preferred inode limit */ +/*64*/ xfs_qcnt_t d_bcount; /* disk blocks owned by the user */ +/*64*/ xfs_qcnt_t d_icount; /* inodes owned by the user */ +/*32*/ __int32_t d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ +/*32*/ __int32_t d_btimer; /* similar to above; for disk blocks */ +/*16*/ xfs_qwarncnt_t d_iwarns; /* warnings issued wrt num inodes */ +/*16*/ xfs_qwarncnt_t d_bwarns; /* warnings issued wrt disk blocks */ +/*32*/ __int32_t d_pad0; /* 64 bit align */ +/*64*/ xfs_qcnt_t d_rtb_hardlimit;/* absolute limit on realtime blks */ +/*64*/ xfs_qcnt_t d_rtb_softlimit;/* preferred limit on RT disk blks */ +/*64*/ xfs_qcnt_t d_rtbcount; /* realtime blocks owned */ +/*32*/ __int32_t d_rtbtimer; /* similar to above; for RT disk blocks */ +/*16*/ xfs_qwarncnt_t d_rtbwarns; /* warnings issued wrt RT disk blocks */ +/*16*/ __uint16_t d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[32]; /* filling for posterity */ +} xfs_dqblk_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +/* #define XFS_DQ_PROJ 0x0002 -- project quota (IRIX) */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ +#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ +#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ +#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */ +#define XFS_DQ_MARKER 0x0080 /* sentinel */ + +/* + * In the worst case, when both user and group quotas are on, + * we can have a max of three dquots changing in a single transaction. + */ +#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) + + +/* + * These are the structures used to lay out dquots and quotaoff + * records on the log. Quite similar to those of inodes. + */ + +/* + * log format struct for dquots. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp id number : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. + */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* (IRIX) project quota accounting ON */ +#define XFS_GQUOTA_ENFD 0x0010 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0020 /* quotacheck run on grp quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x0080 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x0100 /* gquotas are being turned off */ + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will be not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie. their values can and will change between versions) + */ +#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_GQUOTA 0x0000008 /* group dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ +#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ +#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */ +#define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. + */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqflush and dqflush_all. + */ +#define XFS_QMOPT_SYNC 0x1000000 +#define XFS_QMOPT_ASYNC 0x2000000 +#define XFS_QMOPT_DELWRI 0x4000000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x8000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL (XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +#ifdef __KERNEL__ +/* + * This check is done typically without holding the inode lock; + * that may seem racey, but it is harmless in the context that it is used. + * The inode cannot go inactive as long a reference is kept, and + * therefore if dquot(s) were attached, they'll stay consistent. + * If, for example, the ownership of the inode changes while + * we didn't have the inode locked, the appropriate dquot(s) will be + * attached atomically. + */ +#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ + (ip)->i_udquot == NULL) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + (ip)->i_gdquot == NULL)) + +#define XFS_QM_NEED_QUOTACHECK(mp) ((XFS_IS_UQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & \ + XFS_UQUOTA_CHKD) == 0) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & \ + XFS_GQUOTA_CHKD) == 0)) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD) +#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE) + + +/* + * The structure kept inside the xfs_trans_t keep track of dquot changes + * within a transaction and apply them later. + */ +typedef struct xfs_dqtrx { + struct xfs_dquot *qt_dquot; /* the dquot this refers to */ + ulong qt_blk_res; /* blks reserved on a dquot */ + ulong qt_blk_res_used; /* blks used from the reservation */ + ulong qt_ino_res; /* inode reserved on a dquot */ + ulong qt_ino_res_used; /* inodes used from the reservation */ + long qt_bcount_delta; /* dquot blk count changes */ + long qt_delbcnt_delta; /* delayed dquot blk count changes */ + long qt_icount_delta; /* dquot inode count changes */ + ulong qt_rtblk_res; /* # blks reserved on a dquot */ + ulong qt_rtblk_res_used;/* # blks used from reservation */ + long qt_rtbcount_delta;/* dquot realtime blk changes */ + long qt_delrtb_delta; /* delayed RT blk count changes */ +} xfs_dqtrx_t; + +/* + * Dquot transaction functions, used if quota is enabled. + */ +typedef void (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *); +typedef void (*qo_mod_dquot_byino_t)(struct xfs_trans *, + struct xfs_inode *, uint, long); +typedef void (*qo_free_dqinfo_t)(struct xfs_trans *); +typedef void (*qo_apply_dquot_deltas_t)(struct xfs_trans *); +typedef void (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *); +typedef int (*qo_reserve_quota_nblks_t)( + struct xfs_trans *, struct xfs_mount *, + struct xfs_inode *, long, long, uint); +typedef int (*qo_reserve_quota_bydquots_t)( + struct xfs_trans *, struct xfs_mount *, + struct xfs_dquot *, struct xfs_dquot *, + long, long, uint); +typedef struct xfs_dqtrxops { + qo_dup_dqinfo_t qo_dup_dqinfo; + qo_free_dqinfo_t qo_free_dqinfo; + qo_mod_dquot_byino_t qo_mod_dquot_byino; + qo_apply_dquot_deltas_t qo_apply_dquot_deltas; + qo_reserve_quota_nblks_t qo_reserve_quota_nblks; + qo_reserve_quota_bydquots_t qo_reserve_quota_bydquots; + qo_unreserve_and_mod_dquots_t qo_unreserve_and_mod_dquots; +} xfs_dqtrxops_t; + +#define XFS_DQTRXOP(mp, tp, op, args...) \ + ((mp)->m_qm_ops.xfs_dqtrxops ? \ + ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : 0) + +#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \ + ((mp)->m_qm_ops.xfs_dqtrxops ? \ + ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : (void)0) + +#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \ + XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp) +#define XFS_TRANS_FREE_DQINFO(mp, tp) \ + XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo) +#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \ + XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta) +#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \ + XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas) +#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \ + XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl) +#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \ + XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl) +#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \ + XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots) + +#define XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, nblks) \ + XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \ + XFS_QMOPT_RES_REGBLKS) +#define XFS_TRANS_RESERVE_BLKQUOTA_FORCE(mp, tp, ip, nblks) \ + XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \ + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES) +#define XFS_TRANS_UNRESERVE_BLKQUOTA(mp, tp, ip, nblks) \ + XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), 0, \ + XFS_QMOPT_RES_REGBLKS) +#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \ + XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \ + f | XFS_QMOPT_RES_REGBLKS) +#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \ + XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \ + f | XFS_QMOPT_RES_REGBLKS) + +extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); + +extern struct bhv_vfsops xfs_qmops; + +#endif /* __KERNEL__ */ + +#endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h new file mode 100644 index 00000000000..cd8ddfd35d6 --- /dev/null +++ b/fs/xfs/xfs_refcache.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_REFCACHE_H__ +#define __XFS_REFCACHE_H__ + +#ifdef HAVE_REFCACHE +/* + * Maximum size (in inodes) for the NFS reference cache + */ +#define XFS_REFCACHE_SIZE_MAX 512 + +struct xfs_inode; +struct xfs_mount; + +extern void xfs_refcache_insert(struct xfs_inode *); +extern void xfs_refcache_purge_ip(struct xfs_inode *); +extern void xfs_refcache_purge_mp(struct xfs_mount *); +extern void xfs_refcache_purge_some(struct xfs_mount *); +extern void xfs_refcache_resize(int); +extern void xfs_refcache_destroy(void); + +extern void xfs_refcache_iunlock(struct xfs_inode *, uint); + +#else + +#define xfs_refcache_insert(ip) do { } while (0) +#define xfs_refcache_purge_ip(ip) do { } while (0) +#define xfs_refcache_purge_mp(mp) do { } while (0) +#define xfs_refcache_purge_some(mp) do { } while (0) +#define xfs_refcache_resize(size) do { } while (0) +#define xfs_refcache_destroy() do { } while (0) + +#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags) + +#endif + +#endif /* __XFS_REFCACHE_H__ */ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c new file mode 100644 index 00000000000..cb13f9a1d45 --- /dev/null +++ b/fs/xfs/xfs_rename.c @@ -0,0 +1,673 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_refcache.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_da_btree.h" +#include "xfs_dir_leaf.h" + + +/* + * Given an array of up to 4 inode pointers, unlock the pointed to inodes. + * If there are fewer than 4 entries in the array, the empty entries will + * be at the end and will have NULL pointers in them. + */ +STATIC void +xfs_rename_unlock4( + xfs_inode_t **i_tab, + uint lock_mode) +{ + int i; + + xfs_iunlock(i_tab[0], lock_mode); + for (i = 1; i < 4; i++) { + if (i_tab[i] == NULL) { + break; + } + /* + * Watch out for duplicate entries in the table. + */ + if (i_tab[i] != i_tab[i-1]) { + xfs_iunlock(i_tab[i], lock_mode); + } + } +} + +#ifdef DEBUG +int xfs_rename_skip, xfs_rename_nskip; +#endif + +/* + * The following routine will acquire the locks required for a rename + * operation. The code understands the semantics of renames and will + * validate that name1 exists under dp1 & that name2 may or may not + * exist under dp2. + * + * We are renaming dp1/name1 to dp2/name2. + * + * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success. + */ +STATIC int +xfs_lock_for_rename( + xfs_inode_t *dp1, /* old (source) directory inode */ + xfs_inode_t *dp2, /* new (target) directory inode */ + vname_t *vname1,/* old entry name */ + vname_t *vname2,/* new entry name */ + xfs_inode_t **ipp1, /* inode of old entry */ + xfs_inode_t **ipp2, /* inode of new entry, if it + already exists, NULL otherwise. */ + xfs_inode_t **i_tab,/* array of inode returned, sorted */ + int *num_inodes) /* number of inodes in array */ +{ + xfs_inode_t *ip1, *ip2, *temp; + xfs_ino_t inum1, inum2; + int error; + int i, j; + uint lock_mode; + int diff_dirs = (dp1 != dp2); + + ip2 = NULL; + + /* + * First, find out the current inums of the entries so that we + * can determine the initial locking order. We'll have to + * sanity check stuff after all the locks have been acquired + * to see if we still have the right inodes, directories, etc. + */ + lock_mode = xfs_ilock_map_shared(dp1); + error = xfs_get_dir_entry(vname1, &ip1); + if (error) { + xfs_iunlock_map_shared(dp1, lock_mode); + return error; + } + + inum1 = ip1->i_ino; + + ASSERT(ip1); + ITRACE(ip1); + + /* + * Unlock dp1 and lock dp2 if they are different. + */ + + if (diff_dirs) { + xfs_iunlock_map_shared(dp1, lock_mode); + lock_mode = xfs_ilock_map_shared(dp2); + } + + error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lock_mode, + vname2, &inum2, &ip2); + if (error == ENOENT) { /* target does not need to exist. */ + inum2 = 0; + } else if (error) { + /* + * If dp2 and dp1 are the same, the next line unlocks dp1. + * Got it? + */ + xfs_iunlock_map_shared(dp2, lock_mode); + IRELE (ip1); + return error; + } else { + ITRACE(ip2); + } + + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i_tab[0] = dp1; + i_tab[1] = dp2; + i_tab[2] = ip1; + if (inum2 == 0) { + *num_inodes = 3; + i_tab[3] = NULL; + } else { + *num_inodes = 4; + i_tab[3] = ip2; + } + + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 4 elements to sort, so this is adequate.) + */ + for (i=0; i < *num_inodes; i++) { + for (j=1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } + } + + /* + * We have dp2 locked. If it isn't first, unlock it. + * If it is first, tell xfs_lock_inodes so it can skip it + * when locking. if dp1 == dp2, xfs_lock_inodes will skip both + * since they are equal. xfs_lock_inodes needs all these inodes + * so that it can unlock and retry if there might be a dead-lock + * potential with the log. + */ + + if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) { +#ifdef DEBUG + xfs_rename_skip++; +#endif + xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED); + } else { +#ifdef DEBUG + xfs_rename_nskip++; +#endif + xfs_iunlock_map_shared(dp2, lock_mode); + xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); + } + + /* + * Set the return value. Null out any unused entries in i_tab. + */ + *ipp1 = *ipp2 = NULL; + for (i=0; i < *num_inodes; i++) { + if (i_tab[i]->i_ino == inum1) { + *ipp1 = i_tab[i]; + } + if (i_tab[i]->i_ino == inum2) { + *ipp2 = i_tab[i]; + } + } + for (;i < 4; i++) { + i_tab[i] = NULL; + } + return 0; +} + + +int rename_which_error_return = 0; + +/* + * xfs_rename + */ +int +xfs_rename( + bhv_desc_t *src_dir_bdp, + vname_t *src_vname, + vnode_t *target_dir_vp, + vname_t *target_vname, + cred_t *credp) +{ + xfs_trans_t *tp; + xfs_inode_t *src_dp, *target_dp, *src_ip, *target_ip; + xfs_mount_t *mp; + int new_parent; /* moving to a new dir */ + int src_is_directory; /* src_name is a directory */ + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + xfs_inode_t *inodes[4]; + int target_ip_dropped = 0; /* dropped target_ip link? */ + vnode_t *src_dir_vp; + bhv_desc_t *target_dir_bdp; + int spaceres; + int target_link_zero = 0; + int num_inodes; + char *src_name = VNAME(src_vname); + char *target_name = VNAME(target_vname); + int src_namelen = VNAMELEN(src_vname); + int target_namelen = VNAMELEN(target_vname); + + src_dir_vp = BHV_TO_VNODE(src_dir_bdp); + vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address); + vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address); + + /* + * Find the XFS behavior descriptor for the target directory + * vnode since it was not handed to us. + */ + target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp), + &xfs_vnodeops); + if (target_dir_bdp == NULL) { + return XFS_ERROR(EXDEV); + } + + src_dp = XFS_BHVTOI(src_dir_bdp); + target_dp = XFS_BHVTOI(target_dir_bdp); + mp = src_dp->i_mount; + + if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) || + DM_EVENT_ENABLED(target_dir_vp->v_vfsp, + target_dp, DM_EVENT_RENAME)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, + src_dir_vp, DM_RIGHT_NULL, + target_dir_vp, DM_RIGHT_NULL, + src_name, target_name, + 0, 0, 0); + if (error) { + return error; + } + } + /* Return through std_return after this point. */ + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + * xfs_lock_for_rename() will return ENOENT if src_name + * does not exist in the source directory. + */ + tp = NULL; + error = xfs_lock_for_rename(src_dp, target_dp, src_vname, + target_vname, &src_ip, &target_ip, inodes, + &num_inodes); + + if (error) { + rename_which_error_return = __LINE__; + /* + * We have nothing locked, no inode references, and + * no transaction, so just get out. + */ + goto std_return; + } + + ASSERT(src_ip != NULL); + + if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + /* + * Check for link count overflow on target_dp + */ + if (target_ip == NULL && (src_dp != target_dp) && + target_dp->i_d.di_nlink >= XFS_MAXLINK) { + rename_which_error_return = __LINE__; + error = XFS_ERROR(EMLINK); + xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + goto rele_return; + } + } + + new_parent = (src_dp != target_dp); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + + /* + * Drop the locks on our inodes so that we can start the transaction. + */ + xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + + XFS_BMAP_INIT(&free_list, &first_block); + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen); + error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); + if (error == ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); + } + if (error) { + rename_which_error_return = __LINE__; + xfs_trans_cancel(tp, 0); + goto rele_return; + } + + /* + * Attach the dquots to the inodes + */ + if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) { + xfs_trans_cancel(tp, cancel_flags); + rename_which_error_return = __LINE__; + goto rele_return; + } + + /* + * Reacquire the inode locks we dropped above. + */ + xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. Note that we need to add a vnode reference to the + * directories since trans_commit & trans_cancel will decrement + * them when they unlock the inodes. Also, we need to be careful + * not to add an inode to the transaction more than once. + */ + VN_HOLD(src_dir_vp); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) { + VN_HOLD(target_dir_vp); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + } + if ((src_ip != src_dp) && (src_ip != target_dp)) { + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + } + if ((target_ip != NULL) && + (target_ip != src_ip) && + (target_ip != src_dp) && + (target_ip != target_dp)) { + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (spaceres == 0 && + (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name, + target_namelen))) { + rename_which_error_return = __LINE__; + goto error_return; + } + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name, + target_namelen, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error == ENOSPC) { + rename_which_error_return = __LINE__; + goto error_return; + } + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + } + } else { /* target_ip != NULL */ + + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + /* + * Make sure target dir is empty. + */ + if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = XFS_ERROR(EEXIST); + rename_which_error_return = __LINE__; + goto error_return; + } + } + + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name, + target_namelen, src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + target_ip_dropped = 1; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + } + + /* Do this test while we still hold the locks */ + target_link_zero = (target_ip)->i_d.di_nlink==0; + + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2, + target_dp->i_ino, &first_block, + &free_list, spaceres); + ASSERT(error != EEXIST); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + } else { + /* + * We always want to hit the ctime on the source inode. + * We do it in the if clause above for the 'new_parent && + * src_is_directory' case, and here we get all the other + * cases. This isn't strictly required by the standards + * since the source inode isn't really being changed, + * but old unix file systems did it and some incremental + * backup programs won't work without it. + */ + xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); + } + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + } + + error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen, + src_ip->i_ino, &first_block, &free_list, spaceres); + if (error) { + rename_which_error_return = __LINE__; + goto abort_return; + } + xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Update the generation counts on all the directory inodes + * that we're modifying. + */ + src_dp->i_gen++; + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + + if (new_parent) { + target_dp->i_gen++; + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + } + + /* + * If there was a target inode, take an extra reference on + * it here so that it doesn't go to xfs_inactive() from + * within the commit. + */ + if (target_ip != NULL) { + IHOLD(target_ip); + } + + /* + * If this is a synchronous mount, make sure that the + * rename transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + /* + * Take refs. for vop_link_removed calls below. No need to worry + * about directory refs. because the caller holds them. + * + * Do holds before the xfs_bmap_finish since it might rele them down + * to zero. + */ + + if (target_ip_dropped) + IHOLD(target_ip); + IHOLD(src_ip); + + error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + if (target_ip != NULL) { + IRELE(target_ip); + } + if (target_ip_dropped) { + IRELE(target_ip); + } + IRELE(src_ip); + goto std_return; + } + + /* + * trans_commit will unlock src_ip, target_ip & decrement + * the vnode references. + */ + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (target_ip != NULL) { + xfs_refcache_purge_ip(target_ip); + IRELE(target_ip); + } + /* + * Let interposed file systems know about removed links. + */ + if (target_ip_dropped) { + VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp, + target_link_zero); + IRELE(target_ip); + } + + FSC_NOTIFY_NAME_CHANGED(XFS_ITOV(src_ip)); + + IRELE(src_ip); + + /* Fall through to std_return with error = 0 or errno from + * xfs_trans_commit */ +std_return: + if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) || + DM_EVENT_ENABLED(target_dir_vp->v_vfsp, + target_dp, DM_EVENT_POSTRENAME)) { + (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, + src_dir_vp, DM_RIGHT_NULL, + target_dir_vp, DM_RIGHT_NULL, + src_name, target_name, + 0, error, 0); + } + return error; + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + /* FALLTHROUGH */ + error_return: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + + rele_return: + IRELE(src_ip); + if (target_ip != NULL) { + IRELE(target_ip); + } + goto std_return; +} diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c new file mode 100644 index 00000000000..2c37822d101 --- /dev/null +++ b/fs/xfs/xfs_rtalloc.c @@ -0,0 +1,2469 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Free realtime space allocation for XFS. + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_fsops.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_inode_item.h" +#include "xfs_trans_space.h" + + +/* + * Prototypes for internal functions. + */ + + +STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *); +STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int, + xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *); +STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, int, xfs_rtblock_t *, int *); +STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_rtblock_t, xfs_rtblock_t *); +STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_rtblock_t, xfs_rtblock_t *); +STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int, + xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *); +STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, + xfs_extlen_t, int); +STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, + xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *); + +/* + * Internal functions. + */ + +/* + * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set. + */ +STATIC int +xfs_lowbit32( + __uint32_t v) +{ + if (v) + return ffs(v) - 1; + return -1; +} + +/* + * Allocate space to the bitmap or summary file, and zero it, for growfs. + */ +STATIC int /* error */ +xfs_growfs_rt_alloc( + xfs_mount_t *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_ino_t ino) /* inode number (bitmap/summary) */ +{ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int cancelflags; /* flags for xfs_trans_cancel */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_inode_t *ip; /* pointer to incore inode */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + xfs_trans_t *tp; /* transaction pointer */ + + /* + * Allocate space to the file, as necessary. + */ + while (oblocks < nblocks) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + cancelflags = 0; + /* + * Reserve space & log for one extent added to the file. + */ + if ((error = xfs_trans_reserve(tp, resblks, + XFS_GROWRTALLOC_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_DEFAULT_PERM_LOG_COUNT))) + goto error_exit; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + /* + * Lock the inode. + */ + if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) + goto error_exit; + XFS_BMAP_INIT(&flist, &firstblock); + /* + * Allocate blocks to the bitmap file. + */ + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = XFS_ERROR(ENOSPC); + if (error) + goto error_exit; + /* + * Free any blocks freed up in the transaction, then commit. + */ + error = xfs_bmap_finish(&tp, &flist, firstblock, &committed); + if (error) + goto error_exit; + xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + /* + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. + */ + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); + /* + * Reserve log for one block zeroing. + */ + if ((error = xfs_trans_reserve(tp, 0, + XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) + goto error_exit; + /* + * Lock the bitmap inode. + */ + if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, + &ip))) + goto error_exit; + /* + * Get a buffer for the block. + */ + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = XFS_ERROR(EIO); + goto error_exit; + } + memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + /* + * Commit the transaction. + */ + xfs_trans_commit(tp, 0, NULL); + } + /* + * Go on to the next extent, if any. + */ + oblocks = map.br_startoff + map.br_blockcount; + } + return 0; +error_exit: + xfs_trans_cancel(tp, cancelflags); + return error; +} + +/* + * Attempt to allocate an extent minlen<=len<=maxlen starting from + * bitmap block bbno. If we don't get maxlen then use prod to trim + * the length, if given. Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_block( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_rtblock_t *nextp, /* out: next block to try */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_rtblock_t besti; /* best rtblock found so far */ + xfs_rtblock_t bestlen; /* best length found so far */ + xfs_rtblock_t end; /* last rtblock in chunk */ + int error; /* error value */ + xfs_rtblock_t i; /* current rtblock trying */ + xfs_rtblock_t next; /* next rtblock to try */ + int stat; /* status from internal calls */ + + /* + * Loop over all the extents starting in this bitmap block, + * looking for one that's long enough. + */ + for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, + end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + i <= end; + i++) { + /* + * See if there's a free extent of maxlen starting at i. + * If it's not so then next will contain the first non-free. + */ + error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); + if (error) { + return error; + } + if (stat) { + /* + * i for maxlen is all free, allocate and return that. + */ + error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, + rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = i; + return 0; + } + /* + * In the case where we have a variable-sized allocation + * request, figure out how big this free piece is, + * and if it's big enough for the minimum, and the best + * so far, remember it. + */ + if (minlen < maxlen) { + xfs_rtblock_t thislen; /* this extent size */ + + thislen = next - i; + if (thislen >= minlen && thislen > bestlen) { + besti = i; + bestlen = thislen; + } + } + /* + * If not done yet, find the start of the next free space. + */ + if (next < end) { + error = xfs_rtfind_forw(mp, tp, next, end, &i); + if (error) { + return error; + } + } else + break; + } + /* + * Searched the whole thing & didn't find a maxlen free extent. + */ + if (minlen < maxlen && besti != -1) { + xfs_extlen_t p; /* amount to trim length by */ + + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1 && (p = do_mod(bestlen, prod))) + bestlen -= p; + /* + * Allocate besti for bestlen & return that. + */ + error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); + if (error) { + return error; + } + *len = bestlen; + *rtblock = besti; + return 0; + } + /* + * Allocation failed. Set *nextp to the next block to try. + */ + *nextp = next; + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting at block + * bno. If we don't get maxlen then use prod to trim the length, if given. + * Returns error; returns starting block in *rtblock. + * The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_exact( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + xfs_extlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtblock_t next; /* next block to try (dummy) */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * Check if the range in question (for maxlen) is free. + */ + error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); + if (error) { + return error; + } + if (isfree) { + /* + * If it is, allocate it and return success. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; + } + /* + * If not, allocate what there is, if it's at least minlen. + */ + maxlen = next - bno; + if (maxlen < minlen) { + /* + * Failed, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + /* + * Trim off tail of extent, if prod is specified. + */ + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) { + /* + * Now we can't do it, return failure status. + */ + *rtblock = NULLRTBLOCK; + return 0; + } + } + /* + * Allocate what we can and return it. + */ + error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + if (error) { + return error; + } + *len = maxlen; + *rtblock = bno; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, starting as near + * to bno as possible. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_near( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int any; /* any useful extents from summary */ + xfs_rtblock_t bbno; /* bitmap block number */ + int error; /* error value */ + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtblock_t n; /* next block to try */ + xfs_rtblock_t r; /* result block */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * If the block number given is off the end, silently set it to + * the last block. + */ + if (bno >= mp->m_sb.sb_rextents) + bno = mp->m_sb.sb_rextents - 1; + /* + * Try the exact allocation first. + */ + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, + rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If the exact allocation worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + bbno = XFS_BITTOBLOCK(mp, bno); + i = 0; + log2len = xfs_highbit32(minlen); + /* + * Loop over all bitmap blocks (bbno + i is current block). + */ + for (;;) { + /* + * Get summary information of extents of all useful levels + * starting in this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, + bbno + i, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there are any useful extents starting here, try + * allocating one. + */ + if (any) { + /* + * On the positive side of the starting location. + */ + if (i >= 0) { + /* + * Try to allocate an extent starting in + * this block. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return it. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * On the negative side of the starting location. + */ + else { /* i < 0 */ + /* + * Loop backwards through the bitmap blocks from + * the starting point-1 up to where we are now. + * There should be an extent which ends in this + * bitmap block and is long enough. + */ + for (j = -1; j > i; j--) { + /* + * Grab the summary information for + * this bitmap block. + */ + error = xfs_rtany_summary(mp, tp, + log2len, mp->m_rsumlevels - 1, + bbno + j, rbpp, rsb, &any); + if (error) { + return error; + } + /* + * If there's no extent given in the + * summary that means the extent we + * found must carry over from an + * earlier block. If there is an + * extent given, we've already tried + * that allocation, don't do it again. + */ + if (any) + continue; + error = xfs_rtallocate_extent_block(mp, + tp, bbno + j, minlen, maxlen, + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + /* + * There weren't intervening bitmap blocks + * with a long enough extent, or the + * allocation didn't work for some reason + * (i.e. it's a little * too short). + * Try to allocate from the summary block + * that we found. + */ + error = xfs_rtallocate_extent_block(mp, tp, + bbno + i, minlen, maxlen, len, &n, rbpp, + rsb, prod, &r); + if (error) { + return error; + } + /* + * If it works, return the extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + } + } + /* + * Loop control. If we were on the positive side, and there's + * still more blocks on the negative side, go there. + */ + if (i > 0 && (int)bbno - i >= 0) + i = -i; + /* + * If positive, and no more negative, but there are more + * positive, go there. + */ + else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1) + i++; + /* + * If negative or 0 (just started), and there are positive + * blocks to go, go there. The 0 case moves to block 1. + */ + else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1) + i = 1 - i; + /* + * If negative or 0 and there are more negative blocks, + * go there. + */ + else if (i <= 0 && (int)bbno + i > 0) + i--; + /* + * Must be done. Return failure. + */ + else + break; + } + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate an extent of length minlen<=len<=maxlen, with no position + * specified. If we don't get maxlen then use prod to trim + * the length, if given. The lengths are all in rtextents. + */ +STATIC int /* error */ +xfs_rtallocate_extent_size( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + int i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtblock_t n; /* next block to be tried */ + xfs_rtblock_t r; /* result block number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + /* + * Loop over all the levels starting with maxlen. + * At each level, look at all the bitmap blocks, to see if there + * are extents starting there that are long enough (>= maxlen). + * Note, only on the initial level can the allocation fail if + * the summary says there's an extent. + */ + for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { + /* + * Loop over all the bitmap blocks. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, + maxlen, len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Didn't find any maxlen blocks. Try smaller ones, unless + * we're asking for a fixed size extent. + */ + if (minlen > --maxlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + /* + * Loop over sizes, from maxlen down to minlen. + * This time, when we do the allocations, allow smaller ones + * to succeed. + */ + for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { + /* + * Loop over all the bitmap blocks, try an allocation + * starting in that block. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary information for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * If nothing there, go on to next. + */ + if (!sum) + continue; + /* + * Try the allocation. Make sure the specified + * minlen/maxlen are in the possible range for + * this summary level. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, + XFS_RTMAX(minlen, 1 << l), + XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Got nothing, return failure. + */ + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtallocate_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block allocated > end */ + xfs_rtblock_t preblock; /* first block allocated < start */ + + end = start + len - 1; + /* + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); + return error; +} + +/* + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. + */ +STATIC int /* error */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ +{ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* + * Loop over logs of extent sizes. Order is irrelevant. + */ + for (log = low; log <= high; log++) { + /* + * Get one summary datum. + */ + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } + /* + * If there are any, return success. + */ + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. + */ +STATIC int /* error */ +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_daddr_t d; /* disk addr of block */ + int error; /* error value */ + xfs_fsblock_t fsb; /* fs block number for block */ + xfs_inode_t *ip; /* bitmap or summary inode */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + /* + * Map from the file offset (block) and inode number to the + * file system block. + */ + error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); + if (error) { + return error; + } + ASSERT(fsb != NULLFSBLOCK); + /* + * Convert to disk address for buffer cache. + */ + d = XFS_FSB_TO_DADDR(mp, fsb); + /* + * Read the buffer. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, 0, &bp); + if (error) { + return error; + } + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + *bpp = bp; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* out: 1 for allocated, 0 for not */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + + return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat); +} +#endif + +#ifdef DEBUG +/* + * Check whether the given block in the bitmap has the given value. + */ +STATIC int /* 1 for matches, 0 for not */ +xfs_rtcheck_bit( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* bit (block) to check */ + int val) /* 1 for free, 0 for allocated */ +{ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* pointer into the buffer */ + /* REFERENCED */ + int error; /* error value */ + xfs_rtword_t wdiff; /* difference between bit & expected */ + int word; /* word number in the buffer */ + xfs_rtword_t wval; /* word value from buffer */ + + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BITTOWORD(mp, start); + bit = (int)(start & (XFS_NBWORD - 1)); + wval = bufp[word]; + xfs_trans_brelse(tp, bp); + wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit); + return !wdiff; +} +#endif /* DEBUG */ + +#if 0 +/* + * Check that the given extent (block range) is free already. + */ +STATIC int /* error */ +xfs_rtcheck_free_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* out: 1 for free, 0 for not */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + + return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat); +} +#endif + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +STATIC int /* error */ +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +/* + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. + */ +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); + if (error) + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); + if (error) + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); + } + } + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +STATIC int /* error */ +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +STATIC int /* error */ +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +STATIC int /* error */ +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +STATIC int /* error */ +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information & copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sum = *sp; + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +STATIC int /* error */ +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask o frelevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number. + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Read and modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +STATIC int /* error */ +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information, modify and log it. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sp += delta; + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), + (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); + return 0; +} + +/* + * Visible (exported) functions. + */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_rt_t *in) /* growfs rt input struct */ +{ + xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_buf_t *bp; /* temporary buffer */ + int cancelflags; /* flags for xfs_trans_cancel */ + int error; /* error return value */ + xfs_inode_t *ip; /* bitmap inode, used as lock */ + xfs_mount_t *nmp; /* new (fake) mount structure */ + xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ + xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ + xfs_drtbno_t nrextents; /* new number of realtime extents */ + uint8_t nrextslog; /* new log2 of sb_rextents */ + xfs_extlen_t nrsumblocks; /* new number of summary blocks */ + uint nrsumlevels; /* new rt summary levels */ + uint nrsumsize; /* new size of rt summary, bytes */ + xfs_sb_t *nsbp; /* new superblock */ + xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ + xfs_extlen_t rsumblocks; /* current number of rt summary blks */ + xfs_sb_t *sbp; /* old superblock */ + xfs_fsblock_t sumbno; /* summary block number */ + xfs_trans_t *tp; /* transaction pointer */ + + sbp = &mp->m_sb; + /* + * Initial error checking. + */ + if (mp->m_rtdev_targp || mp->m_rbmip == NULL || + (nrblocks = in->newblocks) <= sbp->sb_rblocks || + (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) + return XFS_ERROR(EINVAL); + /* + * Read in the last block of the device, make sure it exists. + */ + error = xfs_read_buf(mp, mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, in->newblocks - 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp); + if (error) + return error; + ASSERT(bp); + xfs_buf_relse(bp); + /* + * Calculate new parameters. These are the final values to be reached. + */ + nrextents = nrblocks; + do_div(nrextents, in->extsize); + nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize); + nrextslog = xfs_highbit32(nrextents); + nrsumlevels = nrextslog + 1; + nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * New summary size can't be more than half the size of + * the log. This prevents us from getting a log overflow, + * since we'll log basically the whole summary file at once. + */ + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) + return XFS_ERROR(EINVAL); + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. + */ + rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size); + rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size); + /* + * Allocate space to the bitmap and summary files, as necessary. + */ + if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, + mp->m_sb.sb_rbmino))) + return error; + if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, + mp->m_sb.sb_rsumino))) + return error; + nmp = NULL; + /* + * Loop over the bitmap blocks. + * We will do everything one bitmap block at a time. + * Skip the current block if it is exactly full. + * This also deals with the case where there were no rtextents before. + */ + for (bmbno = sbp->sb_rbmblocks - + ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); + bmbno < nrbmblocks; + bmbno++) { + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + *nmp = *mp; + nsbp = &nmp->m_sb; + /* + * Calculate new sb and mount fields for this round. + */ + nsbp->sb_rextsize = in->extsize; + nsbp->sb_rbmblocks = bmbno + 1; + nsbp->sb_rblocks = + XFS_RTMIN(nrblocks, + nsbp->sb_rbmblocks * NBBY * + nsbp->sb_blocksize * nsbp->sb_rextsize); + nsbp->sb_rextents = nsbp->sb_rblocks; + do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; + nrsumsize = + (uint)sizeof(xfs_suminfo_t) * nrsumlevels * + nsbp->sb_rbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * Start a transaction, get the log reservation. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); + cancelflags = 0; + if ((error = xfs_trans_reserve(tp, 0, + XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) + goto error_exit; + /* + * Lock out other callers by grabbing the bitmap inode lock. + */ + if ((error = xfs_trans_iget(mp, tp, 0, mp->m_sb.sb_rbmino, + XFS_ILOCK_EXCL, &ip))) + goto error_exit; + ASSERT(ip == mp->m_rbmip); + /* + * Update the bitmap inode's size. + */ + mp->m_rbmip->i_d.di_size = + nsbp->sb_rbmblocks * nsbp->sb_blocksize; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + cancelflags |= XFS_TRANS_ABORT; + /* + * Get the summary inode into the transaction. + */ + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, + 0, XFS_ILOCK_EXCL, &ip))) + goto error_exit; + ASSERT(ip == mp->m_rsumip); + /* + * Update the summary inode's size. + */ + mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); + /* + * Copy summary data from old to new sizes. + * Do this when the real size (not block-aligned) changes. + */ + if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || + mp->m_rsumlevels != nmp->m_rsumlevels) { + error = xfs_rtcopy_summary(mp, nmp, tp); + if (error) + goto error_exit; + } + /* + * Update superblock fields. + */ + if (nsbp->sb_rextsize != sbp->sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nsbp->sb_rextsize - sbp->sb_rextsize); + if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nsbp->sb_rbmblocks - sbp->sb_rbmblocks); + if (nsbp->sb_rblocks != sbp->sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nsbp->sb_rblocks - sbp->sb_rblocks); + if (nsbp->sb_rextents != sbp->sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + if (nsbp->sb_rextslog != sbp->sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nsbp->sb_rextslog - sbp->sb_rextslog); + /* + * Free new extent. + */ + bp = NULL; + error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + if (error) + goto error_exit; + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + /* + * Free the fake mp structure. + */ + kmem_free(nmp, sizeof(*nmp)); + nmp = NULL; + /* + * Update mp values into the real mp structure. + */ + mp->m_rsumlevels = nrsumlevels; + mp->m_rsumsize = nrsumsize; + /* + * Commit the transaction. + */ + xfs_trans_commit(tp, 0, NULL); + } + return 0; + + /* + * Error paths come here. + */ +error_exit: + if (nmp) + kmem_free(nmp, sizeof(*nmp)); + xfs_trans_cancel(tp, cancelflags); + return error; +} + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + xfs_inode_t *ip; /* inode for bitmap file */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_rtblock_t r; /* result allocated block */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + ASSERT(minlen > 0 && minlen <= maxlen); + mp = tp->t_mountp; + /* + * If prod is set then figure out what to do to minlen and maxlen. + */ + if (prod > 1) { + xfs_extlen_t i; + + if ((i = maxlen % prod)) + maxlen -= i; + if ((i = minlen % prod)) + minlen += prod - i; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + } + /* + * Lock out other callers by grabbing the bitmap inode lock. + */ + error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + return error; + } + sumbp = NULL; + /* + * Allocate by size, or near another block, or exactly at some block. + */ + switch (type) { + case XFS_ALLOCTYPE_ANY_AG: + error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, + &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + default: + ASSERT(0); + } + if (error) { + return error; + } + /* + * If it worked, update the superblock. + */ + if (r != NULLRTBLOCK) { + long slen = (long)*len; + + ASSERT(*len >= minlen && *len <= maxlen); + if (wasdel) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); + else + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } + *rtblock = r; + return 0; +} + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_inode_t *ip; /* bitmap file inode */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + mp = tp->t_mountp; + /* + * Synchronize by locking the bitmap inode. + */ + error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + return error; + } +#if defined(__KERNEL__) && defined(DEBUG) + /* + * Check to see that this whole range is currently allocated. + */ + { + int stat; /* result from checking range */ + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat); + if (error) { + return error; + } + ASSERT(stat); + } +#endif + sumbp = NULL; + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&ip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + return 0; +} + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + xfs_buf_t *bp; /* buffer for last block of subvolume */ + xfs_daddr_t d; /* address of last block of subvolume */ + int error; /* error return value */ + xfs_sb_t *sbp; /* filesystem superblock copy in mount */ + + sbp = &mp->m_sb; + if (sbp->sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + cmn_err(CE_WARN, + "XFS: This filesystem has a realtime volume, use rtdev=device option"); + return XFS_ERROR(ENODEV); + } + mp->m_rsumlevels = sbp->sb_rextslog + 1; + mp->m_rsumsize = + (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * + sbp->sb_rbmblocks; + mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + mp->m_rbmip = mp->m_rsumip = NULL; + /* + * Check that the realtime section is an ok size. + */ + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { + cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", + (unsigned long long) XFS_BB_TO_FSB(mp, d), + (unsigned long long) mp->m_sb.sb_rblocks); + return XFS_ERROR(E2BIG); + } + error = xfs_read_buf(mp, mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp); + if (error) { + cmn_err(CE_WARN, + "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); + if (error == ENOSPC) + return XFS_ERROR(E2BIG); + return error; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + xfs_mount_t *mp) /* file system mount structure */ +{ + int error; /* error return value */ + xfs_sb_t *sbp; + + sbp = &mp->m_sb; + if (sbp->sb_rbmino == NULLFSINO) + return 0; + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0); + if (error) + return error; + ASSERT(mp->m_rbmip != NULL); + ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); + if (error) { + VN_RELE(XFS_ITOV(mp->m_rbmip)); + return error; + } + ASSERT(mp->m_rsumip != NULL); + return 0; +} + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ +{ + xfs_rtblock_t b; /* result block */ + int error; /* error return value */ + xfs_inode_t *ip; /* bitmap incore inode */ + int log2; /* log of sequence number */ + __uint64_t resid; /* residual after log removed */ + __uint64_t seq; /* sequence number of file creation */ + __uint64_t *seqp; /* pointer to seqno in inode */ + + error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); + if (error) + return error; + ASSERT(ip == mp->m_rbmip); + seqp = (__uint64_t *)&ip->i_d.di_atime; + if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { + ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *seqp = 0; + } + seq = *seqp; + if ((log2 = xfs_highbit64(seq)) == -1) + b = 0; + else { + resid = seq - (1ULL << log2); + b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> + (log2 + 1); + if (b >= mp->m_sb.sb_rextents) + b = do_mod(b, mp->m_sb.sb_rextents); + if (b + len > mp->m_sb.sb_rextents) + b = mp->m_sb.sb_rextents - len; + } + *seqp = seq + 1; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + *pick = b; + return 0; +} + +#ifdef DEBUG +/* + * Debug code: print out the value of a range in the bitmap. + */ +void +xfs_rtprint_range( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to print */ + xfs_extlen_t len) /* length to print */ +{ + xfs_extlen_t i; /* block number in the extent */ + + printk("%Ld: ", (long long)start); + for (i = 0; i < len; i++) + printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1)); + printk("\n"); +} + +/* + * Debug code: print the summary file. + */ +void +xfs_rtprint_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp) /* transaction pointer */ +{ + xfs_suminfo_t c; /* summary data */ + xfs_rtblock_t i; /* bitmap block number */ + int l; /* summary information level */ + int p; /* flag for printed anything */ + xfs_fsblock_t sb; /* summary block number */ + xfs_buf_t *sumbp; /* summary block buffer */ + + sumbp = NULL; + for (l = 0; l < mp->m_rsumlevels; l++) { + for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c); + if (c) { + if (!p) { + printk("%Ld-%Ld:", 1LL << l, + XFS_RTMIN((1LL << l) + + ((1LL << l) - 1LL), + mp->m_sb.sb_rextents)); + p = 1; + } + printk(" %Ld:%d", (long long)i, c); + } + } + if (p) + printk("\n"); + } + if (sumbp) + xfs_trans_brelse(tp, sumbp); +} +#endif /* DEBUG */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h new file mode 100644 index 00000000000..e2710264c05 --- /dev/null +++ b/fs/xfs/xfs_rtalloc.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_RTALLOC_H__ +#define __XFS_RTALLOC_H__ + +struct xfs_mount; +struct xfs_trans; + +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */ + +/* + * Constants for bit manipulations. + */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#if XFS_BIG_BLKNOS +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) +#else +#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) +#endif + + +#ifdef __KERNEL__ + +#ifdef CONFIG_XFS_RT +/* + * Function prototypes for exported functions. + */ + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock); /* out: start block allocated */ + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len); /* length of extent freed */ + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick); /* result rt extent */ + +/* + * Debug code: print out the value of a range in the bitmap. + */ +void +xfs_rtprint_range( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to print */ + xfs_extlen_t len); /* length to print */ + +/* + * Debug code: print the summary file. + */ +void +xfs_rtprint_summary( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp); /* transaction pointer */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + struct xfs_mount *mp, /* file system mount structure */ + xfs_growfs_rt_t *in); /* user supplied growfs struct */ + +#else +# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtfree_extent(t,b,l) (ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) +# define xfs_growfs_rt(mp,in) (ENOSYS) +# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +#endif /* CONFIG_XFS_RT */ + +#endif /* __KERNEL__ */ + +#endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c new file mode 100644 index 00000000000..d3ff7aef33b --- /dev/null +++ b/fs/xfs/xfs_rw.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_itable.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_acl.h" +#include "xfs_mac.h" +#include "xfs_error.h" +#include "xfs_buf_item.h" +#include "xfs_rw.h" + +/* + * This is a subroutine for xfs_write() and other writers (xfs_ioctl) + * which clears the setuid and setgid bits when a file is written. + */ +int +xfs_write_clear_setuid( + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_trans_t *tp; + int error; + + mp = ip->i_mount; + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + if ((error = xfs_trans_reserve(tp, 0, + XFS_WRITEID_LOG_RES(mp), + 0, 0, 0))) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + ip->i_d.di_mode &= ~S_ISUID; + + /* + * Note that we don't have to worry about mandatory + * file locking being disabled here because we only + * clear the S_ISGID bit if the Group execute bit is + * on, but if it was on then mandatory locking wouldn't + * have been enabled. + */ + if (ip->i_d.di_mode & S_IXGRP) { + ip->i_d.di_mode &= ~S_ISGID; + } + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Force a shutdown of the filesystem instantly while keeping + * the filesystem consistent. We don't do an unmount here; just shutdown + * the shop, make sure that absolutely nothing persistent happens to + * this filesystem after this point. + */ + +void +xfs_do_force_shutdown( + bhv_desc_t *bdp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + xfs_mount_t *mp; + + mp = XFS_BHVTOM(bdp); + logerror = flags & XFS_LOG_IO_ERROR; + + if (!(flags & XFS_FORCE_UMOUNT)) { + cmn_err(CE_NOTE, + "xfs_force_shutdown(%s,0x%x) called from line %d of file %s. Return address = 0x%p", + mp->m_fsname,flags,lnnum,fname,__return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations and tells + * them the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & XFS_CORRUPT_INCORE) { + xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, + "Corruption of in-memory data detected. Shutting down filesystem: %s", + mp->m_fsname); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) { + xfs_stack_trace(); + } + } else if (!(flags & XFS_FORCE_UMOUNT)) { + if (logerror) { + xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, + "Log I/O Error Detected. Shutting down filesystem: %s", + mp->m_fsname); + } else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) { + xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, + "I/O Error Detected. Shutting down filesystem: %s", + mp->m_fsname); + } + } + if (!(flags & XFS_FORCE_UMOUNT)) { + cmn_err(CE_ALERT, + "Please umount the filesystem, and rectify the problem(s)"); + } +} + + +/* + * Called when we want to stop a buffer from getting written or read. + * We attach the EIO error, muck with its flags, and call biodone + * so that the proper iodone callbacks get called. + */ +int +xfs_bioerror( + xfs_buf_t *bp) +{ + +#ifdef XFSERRORDEBUG + ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); +#endif + + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + */ + xfs_buftrace("XFS IOERROR", bp); + XFS_BUF_ERROR(bp, EIO); + /* + * We're calling biodone, so delete B_DONE flag. Either way + * we have to call the iodone callback, and calling biodone + * probably is the best way since it takes care of + * GRIO as well. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + xfs_biodone(bp); + + return (EIO); +} + +/* + * Same as xfs_bioerror, except that we are releasing the buffer + * here ourselves, and avoiding the biodone call. + * This is meant for userdata errors; metadata bufs come with + * iodone functions attached, so that we can track down errors. + */ +int +xfs_bioerror_relse( + xfs_buf_t *bp) +{ + int64_t fl; + + ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); + ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); + + xfs_buftrace("XFS IOERRELSE", bp); + fl = XFS_BUF_BFLAGS(bp); + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + * + * chunkhold expects B_DONE to be set, whether + * we actually finish the I/O or not. We don't want to + * change that interface. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_DONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_CLR_IODONE_FUNC(bp); + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + if (!(fl & XFS_B_ASYNC)) { + /* + * Mark b_error and B_ERROR _both_. + * Lot's of chunkcache code assumes that. + * There's no reason to mark error for + * ASYNC buffers. + */ + XFS_BUF_ERROR(bp, EIO); + XFS_BUF_V_IODONESEMA(bp); + } else { + xfs_buf_relse(bp); + } + return (EIO); +} +/* + * Prints out an ALERT message about I/O error. + */ +void +xfs_ioerror_alert( + char *func, + struct xfs_mount *mp, + xfs_buf_t *bp, + xfs_daddr_t blkno) +{ + cmn_err(CE_ALERT, + "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" + " (\"%s\") error %d buf count %u", + (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, + XFS_BUFTARG_NAME(bp->pb_target), + (__uint64_t)blkno, + func, + XFS_BUF_GETERROR(bp), + XFS_BUF_COUNT(bp)); +} + +/* + * This isn't an absolute requirement, but it is + * just a good idea to call xfs_read_buf instead of + * directly doing a read_buf call. For one, we shouldn't + * be doing this disk read if we are in SHUTDOWN state anyway, + * so this stops that from happening. Secondly, this does all + * the error checking stuff and the brelse if appropriate for + * the caller, so the code can be a little leaner. + */ + +int +xfs_read_buf( + struct xfs_mount *mp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + xfs_buf_t **bpp) +{ + xfs_buf_t *bp; + int error; + + if (flags) + bp = xfs_buf_read_flags(target, blkno, len, flags); + else + bp = xfs_buf_read(target, blkno, len, flags); + if (!bp) + return XFS_ERROR(EIO); + error = XFS_BUF_GETERROR(bp); + if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { + *bpp = bp; + } else { + *bpp = NULL; + if (error) { + xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); + } else { + error = XFS_ERROR(EIO); + } + if (bp) { + XFS_BUF_UNDONE(bp); + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_STALE(bp); + /* + * brelse clears B_ERROR and b_error + */ + xfs_buf_relse(bp); + } + } + return (error); +} + +/* + * Wrapper around bwrite() so that we can trap + * write errors, and act accordingly. + */ +int +xfs_bwrite( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + int error; + + /* + * XXXsup how does this work for quotas. + */ + XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); + XFS_BUF_SET_FSPRIVATE3(bp, mp); + XFS_BUF_WRITE(bp); + + if ((error = XFS_bwrite(bp))) { + ASSERT(mp); + /* + * Cannot put a buftrace here since if the buffer is not + * B_HOLD then we will brelse() the buffer before returning + * from bwrite and we could be tracing a buffer that has + * been reused. + */ + xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR); + } + return (error); +} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h new file mode 100644 index 00000000000..c8b10bf8f53 --- /dev/null +++ b/fs/xfs/xfs_rw.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_RW_H__ +#define __XFS_RW_H__ + +struct xfs_buf; +struct xfs_inode; +struct xfs_mount; + +/* + * Maximum count of bmaps used by read and write paths. + */ +#define XFS_MAX_RW_NBMAPS 4 + +/* + * Counts of readahead buffers to use based on physical memory size. + * None of these should be more than XFS_MAX_RW_NBMAPS. + */ +#define XFS_RW_NREADAHEAD_16MB 2 +#define XFS_RW_NREADAHEAD_32MB 3 +#define XFS_RW_NREADAHEAD_K32 4 +#define XFS_RW_NREADAHEAD_K64 4 + +/* + * Maximum size of a buffer that we\'ll map. Making this + * too big will degrade performance due to the number of + * pages which need to be gathered. Making it too small + * will prevent us from doing large I/O\'s to hardware that + * needs it. + * + * This is currently set to 512 KB. + */ +#define XFS_MAX_BMAP_LEN_BB 1024 +#define XFS_MAX_BMAP_LEN_BYTES 524288 + +/* + * Convert the given file system block to a disk block. + * We have to treat it differently based on whether the + * file is a real time file or not, because the bmap code + * does. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DB) +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb) +#else +#define XFS_FSB_TO_DB(ip,fsb) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))) +#endif + +#define XFS_FSB_TO_DB_IO(io,fsb) \ + (((io)->io_flags & XFS_IOCORE_RT) ? \ + XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((io)->io_mount, (fsb))) + +/* + * Prototypes for functions in xfs_rw.c. + */ + +int +xfs_write_clear_setuid( + struct xfs_inode *ip); + +int +xfs_bwrite( + struct xfs_mount *mp, + struct xfs_buf *bp); + +int +xfs_bioerror( + struct xfs_buf *b); + +int +xfs_bioerror_relse( + struct xfs_buf *b); + +int +xfs_read_buf( + struct xfs_mount *mp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + struct xfs_buf **bpp); + +void +xfs_ioerror_alert( + char *func, + struct xfs_mount *mp, + xfs_buf_t *bp, + xfs_daddr_t blkno); + + +/* + * Prototypes for functions in xfs_vnodeops.c. + */ + +int +xfs_rwlock( + bhv_desc_t *bdp, + vrwlock_t write_lock); + +void +xfs_rwunlock( + bhv_desc_t *bdp, + vrwlock_t write_lock); + +int +xfs_change_file_space( + bhv_desc_t *bdp, + int cmd, + xfs_flock64_t *bf, + xfs_off_t offset, + cred_t *credp, + int flags); + +int +xfs_set_dmattrs( + bhv_desc_t *bdp, + u_int evmask, + u_int16_t state, + cred_t *credp); + +#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h new file mode 100644 index 00000000000..ad090a834ce --- /dev/null +++ b/fs/xfs/xfs_sb.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SB_H__ +#define __XFS_SB_H__ + +/* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ + +struct xfs_buf; +struct xfs_mount; + +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_SASHFBITS 0xf000 +#define XFS_SB_VERSION_REALFBITS 0x0ff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 +#define XFS_SB_VERSION_OKSASHFBITS \ + (XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT) +#define XFS_SB_VERSION_OKREALFBITS \ + (XFS_SB_VERSION_ATTRBIT | \ + XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_QUOTABIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_DALIGNBIT | \ + XFS_SB_VERSION_SHAREDBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_SECTORBIT) +#define XFS_SB_VERSION_OKSASHBITS \ + (XFS_SB_VERSION_NUMBITS | \ + XFS_SB_VERSION_REALFBITS | \ + XFS_SB_VERSION_OKSASHFBITS) +#define XFS_SB_VERSION_OKREALBITS \ + (XFS_SB_VERSION_NUMBITS | \ + XFS_SB_VERSION_OKREALFBITS | \ + XFS_SB_VERSION_OKSASHFBITS) +#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na,sflag,morebits) \ + (((ia) || (dia) || (extflag) || (dirv2) || (na) || (sflag) || \ + (morebits)) ? \ + (XFS_SB_VERSION_4 | \ + ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \ + ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \ + ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \ + ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \ + ((na) ? XFS_SB_VERSION_LOGV2BIT : 0) | \ + ((sflag) ? XFS_SB_VERSION_SECTORBIT : 0) | \ + ((morebits) ? XFS_SB_VERSION_MOREBITSBIT : 0)) : \ + XFS_SB_VERSION_1) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. + */ +#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that + require changing + PROM and SASH */ + +#define XFS_SB_VERSION2_OKREALFBITS \ + (0) +#define XFS_SB_VERSION2_OKSASHFBITS \ + (0) +#define XFS_SB_VERSION2_OKREALBITS \ + (XFS_SB_VERSION2_OKREALFBITS | \ + XFS_SB_VERSION2_OKSASHFBITS ) + +/* + * mkfs macro to set up sb_features2 word + */ +#define XFS_SB_VERSION2_MKFS(xyz) \ + ((xyz) ? 0 : 0) + +typedef struct xfs_sb +{ + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_drfsbno_t sb_dblocks; /* number of data blocks */ + xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */ + xfs_drtbno_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_dfsbno_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additonal feature bits */ +} xfs_sb_t; + +/* + * Sequence number values for the fields. + */ +typedef enum { + XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, + XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, + XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, + XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, + XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, + XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, + XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, + XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, + XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, + XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, + XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, + XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, + XFS_SBS_FEATURES2, + XFS_SBS_FIELDCOUNT +} xfs_sb_field_t; + +/* + * Mask values, defined based on the xfs_sb_field_t values. + * Only define the ones we're using. + */ +#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) +#define XFS_SB_UUID XFS_SB_MVAL(UUID) +#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) +#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) +#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) +#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) +#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) +#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) +#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) +#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) +#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) +#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) +#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) +#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) +#define XFS_SB_MOD_BITS \ + (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ + XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ + XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH) + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_NUM) +int xfs_sb_version_num(xfs_sb_t *sbp); +#define XFS_SB_VERSION_NUM(sbp) xfs_sb_version_num(sbp) +#else +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_GOOD_VERSION) +int xfs_sb_good_version(xfs_sb_t *sbp); +#define XFS_SB_GOOD_VERSION(sbp) xfs_sb_good_version(sbp) +#else +#define XFS_SB_GOOD_VERSION_INT(sbp) \ + ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \ + ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + !(((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ + (((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ + ((sbp)->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) + +#ifdef __KERNEL__ +#define XFS_SB_GOOD_VERSION(sbp) \ + (XFS_SB_GOOD_VERSION_INT(sbp) && \ + (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN) )) +#else +/* + * extra 2 paren's here (( to unconfuse paren-matching editors + * like vi because XFS_SB_GOOD_VERSION_INT is a partial expression + * and the two XFS_SB_GOOD_VERSION's each 2 more close paren's to + * complete the expression. + */ +#define XFS_SB_GOOD_VERSION(sbp) \ + (XFS_SB_GOOD_VERSION_INT(sbp) && \ + (!((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ + (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)) )) +#endif /* __KERNEL__ */ +#endif + +#define XFS_SB_GOOD_SASH_VERSION(sbp) \ + ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \ + ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS))) + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TONEW) +unsigned xfs_sb_version_tonew(unsigned v); +#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v) +#else +#define XFS_SB_VERSION_TONEW(v) \ + ((((v) == XFS_SB_VERSION_1) ? \ + 0 : \ + (((v) == XFS_SB_VERSION_2) ? \ + XFS_SB_VERSION_ATTRBIT : \ + (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ + XFS_SB_VERSION_4) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TOOLD) +unsigned xfs_sb_version_toold(unsigned v); +#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v) +#else +#define XFS_SB_VERSION_TOOLD(v) \ + (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ + 0 : \ + (((v) & XFS_SB_VERSION_NLINKBIT) ? \ + XFS_SB_VERSION_3 : \ + (((v) & XFS_SB_VERSION_ATTRBIT) ? \ + XFS_SB_VERSION_2 : \ + XFS_SB_VERSION_1))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASATTR) +int xfs_sb_version_hasattr(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASATTR(sbp) xfs_sb_version_hasattr(sbp) +#else +#define XFS_SB_VERSION_HASATTR(sbp) \ + (((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ + ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDATTR) +void xfs_sb_version_addattr(xfs_sb_t *sbp); +#define XFS_SB_VERSION_ADDATTR(sbp) xfs_sb_version_addattr(sbp) +#else +#define XFS_SB_VERSION_ADDATTR(sbp) \ + ((sbp)->sb_versionnum = \ + (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ + XFS_SB_VERSION_2 : \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ + ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ + (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT)))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASNLINK) +int xfs_sb_version_hasnlink(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASNLINK(sbp) xfs_sb_version_hasnlink(sbp) +#else +#define XFS_SB_VERSION_HASNLINK(sbp) \ + (((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDNLINK) +void xfs_sb_version_addnlink(xfs_sb_t *sbp); +#define XFS_SB_VERSION_ADDNLINK(sbp) xfs_sb_version_addnlink(sbp) +#else +#define XFS_SB_VERSION_ADDNLINK(sbp) \ + ((sbp)->sb_versionnum = \ + ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ + XFS_SB_VERSION_3 : \ + ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASQUOTA) +int xfs_sb_version_hasquota(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASQUOTA(sbp) xfs_sb_version_hasquota(sbp) +#else +#define XFS_SB_VERSION_HASQUOTA(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDQUOTA) +void xfs_sb_version_addquota(xfs_sb_t *sbp); +#define XFS_SB_VERSION_ADDQUOTA(sbp) xfs_sb_version_addquota(sbp) +#else +#define XFS_SB_VERSION_ADDQUOTA(sbp) \ + ((sbp)->sb_versionnum = \ + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ + ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ + (XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \ + XFS_SB_VERSION_QUOTABIT))) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASALIGN) +int xfs_sb_version_hasalign(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASALIGN(sbp) xfs_sb_version_hasalign(sbp) +#else +#define XFS_SB_VERSION_HASALIGN(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBALIGN) +void xfs_sb_version_subalign(xfs_sb_t *sbp); +#define XFS_SB_VERSION_SUBALIGN(sbp) xfs_sb_version_subalign(sbp) +#else +#define XFS_SB_VERSION_SUBALIGN(sbp) \ + ((sbp)->sb_versionnum = \ + XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDALIGN) +int xfs_sb_version_hasdalign(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASDALIGN(sbp) xfs_sb_version_hasdalign(sbp) +#else +#define XFS_SB_VERSION_HASDALIGN(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDDALIGN) +int xfs_sb_version_adddalign(xfs_sb_t *sbp); +#define XFS_SB_VERSION_ADDDALIGN(sbp) xfs_sb_version_adddalign(sbp) +#else +#define XFS_SB_VERSION_ADDDALIGN(sbp) \ + ((sbp)->sb_versionnum = \ + ((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSHARED) +int xfs_sb_version_hasshared(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASSHARED(sbp) xfs_sb_version_hasshared(sbp) +#else +#define XFS_SB_VERSION_HASSHARED(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDSHARED) +int xfs_sb_version_addshared(xfs_sb_t *sbp); +#define XFS_SB_VERSION_ADDSHARED(sbp) xfs_sb_version_addshared(sbp) +#else +#define XFS_SB_VERSION_ADDSHARED(sbp) \ + ((sbp)->sb_versionnum = \ + ((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBSHARED) +int xfs_sb_version_subshared(xfs_sb_t *sbp); +#define XFS_SB_VERSION_SUBSHARED(sbp) xfs_sb_version_subshared(sbp) +#else +#define XFS_SB_VERSION_SUBSHARED(sbp) \ + ((sbp)->sb_versionnum = \ + ((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDIRV2) +int xfs_sb_version_hasdirv2(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASDIRV2(sbp) xfs_sb_version_hasdirv2(sbp) +#else +#define XFS_SB_VERSION_HASDIRV2(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASLOGV2) +int xfs_sb_version_haslogv2(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASLOGV2(sbp) xfs_sb_version_haslogv2(sbp) +#else +#define XFS_SB_VERSION_HASLOGV2(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT) +int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) xfs_sb_version_hasextflgbit(sbp) +#else +#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDEXTFLGBIT) +int xfs_sb_version_addextflgbit(xfs_sb_t *sbp); +#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) xfs_sb_version_addextflgbit(sbp) +#else +#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) \ + ((sbp)->sb_versionnum = \ + ((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBEXTFLGBIT) +int xfs_sb_version_subextflgbit(xfs_sb_t *sbp); +#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) xfs_sb_version_subextflgbit(sbp) +#else +#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) \ + ((sbp)->sb_versionnum = \ + ((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSECTOR) +int xfs_sb_version_hassector(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASSECTOR(sbp) xfs_sb_version_hassector(sbp) +#else +#define XFS_SB_VERSION_HASSECTOR(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASMOREBITSBIT) +int xfs_sb_version_hasmorebits(xfs_sb_t *sbp); +#define XFS_SB_VERSION_HASMOREBITS(sbp) xfs_sb_version_hasmorebits(sbp) +#else +#define XFS_SB_VERSION_HASMOREBITS(sbp) \ + ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT)) +#endif + +/* + * sb_features2 bit version macros. + * + * For example, for a bit defined as XFS_SB_VERSION2_YBIT, has a macro: + * + * SB_VERSION_HASYBIT(xfs_sb_t *sbp) + * ((XFS_SB_VERSION_HASMOREBITS(sbp) && + * ((sbp)->sb_versionnum & XFS_SB_VERSION2_YBIT) + */ + +/* + * end of superblock version macros + */ + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_BLOCK) +xfs_agblock_t xfs_sb_block(struct xfs_mount *mp); +#define XFS_SB_BLOCK(mp) xfs_sb_block(mp) +#else +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_HDR_BLOCK) +xfs_agblock_t xfs_hdr_block(struct xfs_mount *mp, xfs_daddr_t d); +#define XFS_HDR_BLOCK(mp,d) xfs_hdr_block(mp,d) +#else +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp,d))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_FSB) +xfs_fsblock_t xfs_daddr_to_fsb(struct xfs_mount *mp, xfs_daddr_t d); +#define XFS_DADDR_TO_FSB(mp,d) xfs_daddr_to_fsb(mp,d) +#else +#define XFS_DADDR_TO_FSB(mp,d) \ + XFS_AGB_TO_FSB(mp, XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DADDR) +xfs_daddr_t xfs_fsb_to_daddr(struct xfs_mount *mp, xfs_fsblock_t fsbno); +#define XFS_FSB_TO_DADDR(mp,fsbno) xfs_fsb_to_daddr(mp,fsbno) +#else +#define XFS_FSB_TO_DADDR(mp,fsbno) \ + XFS_AGB_TO_DADDR(mp, XFS_FSB_TO_AGNO(mp,fsbno), \ + XFS_FSB_TO_AGBNO(mp,fsbno)) +#endif + +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBP) +xfs_sb_t *xfs_buf_to_sbp(struct xfs_buf *bp); +#define XFS_BUF_TO_SBP(bp) xfs_buf_to_sbp(bp) +#else +#define XFS_BUF_TO_SBP(bp) ((xfs_sb_t *)XFS_BUF_PTR(bp)) +#endif + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) +#define XFS_BB_TO_FSS(mp,bb) \ + (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log) +#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log) + +/* + * File system sector to byte conversions. + */ +#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog) +#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) +#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1)) + +/* + * File system block to byte conversions. + */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +#endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c new file mode 100644 index 00000000000..3db0e220077 --- /dev/null +++ b/fs/xfs/xfs_trans.c @@ -0,0 +1,1315 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trans_priv.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" + + +STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); +STATIC uint xfs_trans_count_vecs(xfs_trans_t *); +STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *); +STATIC void xfs_trans_uncommit(xfs_trans_t *, uint); +STATIC void xfs_trans_committed(xfs_trans_t *, int); +STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int); +STATIC void xfs_trans_free(xfs_trans_t *); + +kmem_zone_t *xfs_trans_zone; + + +/* + * Initialize the precomputed transaction reservation values + * in the mount structure. + */ +void +xfs_trans_init( + xfs_mount_t *mp) +{ + xfs_trans_reservations_t *resp; + + resp = &(mp->m_reservations); + resp->tr_write = + (uint)(XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_itruncate = + (uint)(XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_rename = + (uint)(XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_link = (uint)XFS_CALC_LINK_LOG_RES(mp); + resp->tr_remove = + (uint)(XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_symlink = + (uint)(XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_create = + (uint)(XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_mkdir = + (uint)(XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_ifree = + (uint)(XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_ichange = + (uint)(XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_growdata = (uint)XFS_CALC_GROWDATA_LOG_RES(mp); + resp->tr_swrite = (uint)XFS_CALC_SWRITE_LOG_RES(mp); + resp->tr_writeid = (uint)XFS_CALC_WRITEID_LOG_RES(mp); + resp->tr_addafork = + (uint)(XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_attrinval = (uint)XFS_CALC_ATTRINVAL_LOG_RES(mp); + resp->tr_attrset = + (uint)(XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_attrrm = + (uint)(XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp)); + resp->tr_clearagi = (uint)XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); + resp->tr_growrtalloc = (uint)XFS_CALC_GROWRTALLOC_LOG_RES(mp); + resp->tr_growrtzero = (uint)XFS_CALC_GROWRTZERO_LOG_RES(mp); + resp->tr_growrtfree = (uint)XFS_CALC_GROWRTFREE_LOG_RES(mp); +} + +/* + * This routine is called to allocate a transaction structure. + * The type parameter indicates the type of the transaction. These + * are enumerated in xfs_trans.h. + * + * Dynamically allocate the transaction structure from the transaction + * zone, initialize it, and return it to the caller. + */ +xfs_trans_t * +xfs_trans_alloc( + xfs_mount_t *mp, + uint type) +{ + fs_check_frozen(XFS_MTOVFS(mp), SB_FREEZE_TRANS); + atomic_inc(&mp->m_active_trans); + + return (_xfs_trans_alloc(mp, type)); + +} + +xfs_trans_t * +_xfs_trans_alloc( + xfs_mount_t *mp, + uint type) +{ + xfs_trans_t *tp; + + ASSERT(xfs_trans_zone != NULL); + tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + + /* + * Initialize the transaction structure. + */ + tp->t_magic = XFS_TRANS_MAGIC; + tp->t_type = type; + tp->t_mountp = mp; + tp->t_items_free = XFS_LIC_NUM_SLOTS; + tp->t_busy_free = XFS_LBC_NUM_SLOTS; + XFS_LIC_INIT(&(tp->t_items)); + XFS_LBC_INIT(&(tp->t_busy)); + + return (tp); +} + +/* + * This is called to create a new transaction which will share the + * permanent log reservation of the given transaction. The remaining + * unused block and rt extent reservations are also inherited. This + * implies that the original transaction is no longer allowed to allocate + * blocks. Locks and log items, however, are no inherited. They must + * be added to the new transaction explicitly. + */ +xfs_trans_t * +xfs_trans_dup( + xfs_trans_t *tp) +{ + xfs_trans_t *ntp; + + ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + + /* + * Initialize the new transaction structure. + */ + ntp->t_magic = XFS_TRANS_MAGIC; + ntp->t_type = tp->t_type; + ntp->t_mountp = tp->t_mountp; + ntp->t_items_free = XFS_LIC_NUM_SLOTS; + ntp->t_busy_free = XFS_LBC_NUM_SLOTS; + XFS_LIC_INIT(&(ntp->t_items)); + XFS_LBC_INIT(&(ntp->t_busy)); + + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + +#if defined(XLOG_NOLOG) || defined(DEBUG) + ASSERT(!xlog_debug || tp->t_ticket != NULL); +#else + ASSERT(tp->t_ticket != NULL); +#endif + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_ticket = tp->t_ticket; + ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; + tp->t_blk_res = tp->t_blk_res_used; + ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; + tp->t_rtx_res = tp->t_rtx_res_used; + PFLAGS_DUP(&tp->t_pflags, &ntp->t_pflags); + + XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp); + + atomic_inc(&tp->t_mountp->m_active_trans); + return ntp; +} + +/* + * This is called to reserve free disk blocks and log space for the + * given transaction. This must be done before allocating any resources + * within the transaction. + * + * This will return ENOSPC if there are not enough blocks available. + * It will sleep waiting for available log space. + * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which + * is used by long running transactions. If any one of the reservations + * fails then they will all be backed out. + * + * This does not do quota reservations. That typically is done by the + * caller afterwards. + */ +int +xfs_trans_reserve( + xfs_trans_t *tp, + uint blocks, + uint logspace, + uint rtextents, + uint flags, + uint logcount) +{ + int log_flags; + int error; + int rsvd; + + error = 0; + rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* Mark this thread as being in a transaction */ + PFLAGS_SET_FSTRANS(&tp->t_pflags); + + /* + * Attempt to reserve the needed disk blocks by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (blocks > 0) { + error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, + -blocks, rsvd); + if (error != 0) { + PFLAGS_RESTORE_FSTRANS(&tp->t_pflags); + return (XFS_ERROR(ENOSPC)); + } + tp->t_blk_res += blocks; + } + + /* + * Reserve the log space needed for this transaction. + */ + if (logspace > 0) { + ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); + ASSERT((tp->t_log_count == 0) || + (tp->t_log_count == logcount)); + if (flags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_PERM_RESERV; + tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + } else { + ASSERT(tp->t_ticket == NULL); + ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + log_flags = 0; + } + + error = xfs_log_reserve(tp->t_mountp, logspace, logcount, + &tp->t_ticket, + XFS_TRANSACTION, log_flags); + if (error) { + goto undo_blocks; + } + tp->t_log_res = logspace; + tp->t_log_count = logcount; + } + + /* + * Attempt to reserve the needed realtime extents by decrementing + * the number needed from the number available. This will + * fail if the count would go below zero. + */ + if (rtextents > 0) { + error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, + -rtextents, rsvd); + if (error) { + error = XFS_ERROR(ENOSPC); + goto undo_log; + } + tp->t_rtx_res += rtextents; + } + + return 0; + + /* + * Error cases jump to one of these labels to undo any + * reservations which have already been performed. + */ +undo_log: + if (logspace > 0) { + if (flags & XFS_TRANS_PERM_LOG_RES) { + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + tp->t_ticket = NULL; + tp->t_log_res = 0; + tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; + } + +undo_blocks: + if (blocks > 0) { + (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, + blocks, rsvd); + tp->t_blk_res = 0; + } + + PFLAGS_RESTORE_FSTRANS(&tp->t_pflags); + + return (error); +} + + +/* + * This is called to set the a callback to be called when the given + * transaction is committed to disk. The transaction pointer and the + * argument pointer will be passed to the callback routine. + * + * Only one callback can be associated with any single transaction. + */ +void +xfs_trans_callback( + xfs_trans_t *tp, + xfs_trans_callback_t callback, + void *arg) +{ + ASSERT(tp->t_callback == NULL); + tp->t_callback = callback; + tp->t_callarg = arg; +} + + +/* + * Record the indicated change to the given field for application + * to the file system's superblock when the transaction commits. + * For now, just store the change in the transaction structure. + * + * Mark the transaction structure to indicate that the superblock + * needs to be updated before committing. + */ +void +xfs_trans_mod_sb( + xfs_trans_t *tp, + uint field, + long delta) +{ + + switch (field) { + case XFS_TRANS_SB_ICOUNT: + tp->t_icount_delta += delta; + break; + case XFS_TRANS_SB_IFREE: + tp->t_ifree_delta += delta; + break; + case XFS_TRANS_SB_FDBLOCKS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_blk_res_used += (uint)-delta; + ASSERT(tp->t_blk_res_used <= tp->t_blk_res); + } + tp->t_fdblocks_delta += delta; + break; + case XFS_TRANS_SB_RES_FDBLOCKS: + /* + * The allocation has already been applied to the + * in-core superblock's counter. This should only + * be applied to the on-disk superblock. + */ + ASSERT(delta < 0); + tp->t_res_fdblocks_delta += delta; + break; + case XFS_TRANS_SB_FREXTENTS: + /* + * Track the number of blocks allocated in the + * transaction. Make sure it does not exceed the + * number reserved. + */ + if (delta < 0) { + tp->t_rtx_res_used += (uint)-delta; + ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res); + } + tp->t_frextents_delta += delta; + break; + case XFS_TRANS_SB_RES_FREXTENTS: + /* + * The allocation has already been applied to the + * in-core superblocks's counter. This should only + * be applied to the on-disk superblock. + */ + ASSERT(delta < 0); + tp->t_res_frextents_delta += delta; + break; + case XFS_TRANS_SB_DBLOCKS: + ASSERT(delta > 0); + tp->t_dblocks_delta += delta; + break; + case XFS_TRANS_SB_AGCOUNT: + ASSERT(delta > 0); + tp->t_agcount_delta += delta; + break; + case XFS_TRANS_SB_IMAXPCT: + tp->t_imaxpct_delta += delta; + break; + case XFS_TRANS_SB_REXTSIZE: + tp->t_rextsize_delta += delta; + break; + case XFS_TRANS_SB_RBMBLOCKS: + tp->t_rbmblocks_delta += delta; + break; + case XFS_TRANS_SB_RBLOCKS: + tp->t_rblocks_delta += delta; + break; + case XFS_TRANS_SB_REXTENTS: + tp->t_rextents_delta += delta; + break; + case XFS_TRANS_SB_REXTSLOG: + tp->t_rextslog_delta += delta; + break; + default: + ASSERT(0); + return; + } + + tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY); +} + +/* + * xfs_trans_apply_sb_deltas() is called from the commit code + * to bring the superblock buffer into the current transaction + * and modify it as requested by earlier calls to xfs_trans_mod_sb(). + * + * For now we just look at each field allowed to change and change + * it if necessary. + */ +STATIC void +xfs_trans_apply_sb_deltas( + xfs_trans_t *tp) +{ + xfs_sb_t *sbp; + xfs_buf_t *bp; + int whole = 0; + + bp = xfs_trans_getsb(tp, tp->t_mountp, 0); + sbp = XFS_BUF_TO_SBP(bp); + + /* + * Check that superblock mods match the mods made to AGF counters. + */ + ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == + (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + + tp->t_ag_btree_delta)); + + if (tp->t_icount_delta != 0) { + INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta); + } + if (tp->t_ifree_delta != 0) { + INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta); + } + + if (tp->t_fdblocks_delta != 0) { + INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta); + } + if (tp->t_res_fdblocks_delta != 0) { + INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta); + } + + if (tp->t_frextents_delta != 0) { + INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta); + } + if (tp->t_res_frextents_delta != 0) { + INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_res_frextents_delta); + } + if (tp->t_dblocks_delta != 0) { + INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta); + whole = 1; + } + if (tp->t_agcount_delta != 0) { + INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta); + whole = 1; + } + if (tp->t_imaxpct_delta != 0) { + INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta); + whole = 1; + } + if (tp->t_rextsize_delta != 0) { + INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta); + whole = 1; + } + if (tp->t_rbmblocks_delta != 0) { + INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta); + whole = 1; + } + if (tp->t_rblocks_delta != 0) { + INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta); + whole = 1; + } + if (tp->t_rextents_delta != 0) { + INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta); + whole = 1; + } + if (tp->t_rextslog_delta != 0) { + INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta); + whole = 1; + } + + if (whole) + /* + * Log the whole thing, the fields are discontiguous. + */ + xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1); + else + /* + * Since all the modifiable fields are contiguous, we + * can get away with this. + */ + xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount), + offsetof(xfs_sb_t, sb_frextents) + + sizeof(sbp->sb_frextents) - 1); + + XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1; +} + +/* + * xfs_trans_unreserve_and_mod_sb() is called to release unused + * reservations and apply superblock counter changes to the in-core + * superblock. + * + * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). + */ +void +xfs_trans_unreserve_and_mod_sb( + xfs_trans_t *tp) +{ + xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ + xfs_mod_sb_t *msbp; + /* REFERENCED */ + int error; + int rsvd; + + msbp = msb; + rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + + /* + * Release any reserved blocks. Any that were allocated + * will be taken back again by fdblocks_delta below. + */ + if (tp->t_blk_res > 0) { + msbp->msb_field = XFS_SBS_FDBLOCKS; + msbp->msb_delta = tp->t_blk_res; + msbp++; + } + + /* + * Release any reserved real time extents . Any that were + * allocated will be taken back again by frextents_delta below. + */ + if (tp->t_rtx_res > 0) { + msbp->msb_field = XFS_SBS_FREXTENTS; + msbp->msb_delta = tp->t_rtx_res; + msbp++; + } + + /* + * Apply any superblock modifications to the in-core version. + * The t_res_fdblocks_delta and t_res_frextents_delta fields are + * explicity NOT applied to the in-core superblock. + * The idea is that that has already been done. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (tp->t_icount_delta != 0) { + msbp->msb_field = XFS_SBS_ICOUNT; + msbp->msb_delta = (int)tp->t_icount_delta; + msbp++; + } + if (tp->t_ifree_delta != 0) { + msbp->msb_field = XFS_SBS_IFREE; + msbp->msb_delta = (int)tp->t_ifree_delta; + msbp++; + } + if (tp->t_fdblocks_delta != 0) { + msbp->msb_field = XFS_SBS_FDBLOCKS; + msbp->msb_delta = (int)tp->t_fdblocks_delta; + msbp++; + } + if (tp->t_frextents_delta != 0) { + msbp->msb_field = XFS_SBS_FREXTENTS; + msbp->msb_delta = (int)tp->t_frextents_delta; + msbp++; + } + if (tp->t_dblocks_delta != 0) { + msbp->msb_field = XFS_SBS_DBLOCKS; + msbp->msb_delta = (int)tp->t_dblocks_delta; + msbp++; + } + if (tp->t_agcount_delta != 0) { + msbp->msb_field = XFS_SBS_AGCOUNT; + msbp->msb_delta = (int)tp->t_agcount_delta; + msbp++; + } + if (tp->t_imaxpct_delta != 0) { + msbp->msb_field = XFS_SBS_IMAX_PCT; + msbp->msb_delta = (int)tp->t_imaxpct_delta; + msbp++; + } + if (tp->t_rextsize_delta != 0) { + msbp->msb_field = XFS_SBS_REXTSIZE; + msbp->msb_delta = (int)tp->t_rextsize_delta; + msbp++; + } + if (tp->t_rbmblocks_delta != 0) { + msbp->msb_field = XFS_SBS_RBMBLOCKS; + msbp->msb_delta = (int)tp->t_rbmblocks_delta; + msbp++; + } + if (tp->t_rblocks_delta != 0) { + msbp->msb_field = XFS_SBS_RBLOCKS; + msbp->msb_delta = (int)tp->t_rblocks_delta; + msbp++; + } + if (tp->t_rextents_delta != 0) { + msbp->msb_field = XFS_SBS_REXTENTS; + msbp->msb_delta = (int)tp->t_rextents_delta; + msbp++; + } + if (tp->t_rextslog_delta != 0) { + msbp->msb_field = XFS_SBS_REXTSLOG; + msbp->msb_delta = (int)tp->t_rextslog_delta; + msbp++; + } + } + + /* + * If we need to change anything, do it. + */ + if (msbp > msb) { + error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, + (uint)(msbp - msb), rsvd); + ASSERT(error == 0); + } +} + + +/* + * xfs_trans_commit + * + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. + */ + /*ARGSUSED*/ +int +xfs_trans_commit( + xfs_trans_t *tp, + uint flags, + xfs_lsn_t *commit_lsn_p) +{ + xfs_log_iovec_t *log_vector; + int nvec; + xfs_mount_t *mp; + xfs_lsn_t commit_lsn; + /* REFERENCED */ + int error; + int log_flags; + int sync; +#define XFS_TRANS_LOGVEC_COUNT 16 + xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; +#if defined(XLOG_NOLOG) || defined(DEBUG) + static xfs_lsn_t trans_lsn = 1; +#endif + void *commit_iclog; + int shutdown; + + commit_lsn = -1; + + /* + * Determine whether this commit is releasing a permanent + * log reservation or not. + */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + mp = tp->t_mountp; + + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ +shut_us_down: + shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; + if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { + xfs_trans_unreserve_and_mod_sb(tp); + /* + * It is indeed possible for the transaction to be + * not dirty but the dqinfo portion to be. All that + * means is that we have some (non-persistent) quota + * reservations that need to be unreserved. + */ + XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, + NULL, log_flags); + if (commit_lsn == -1 && !shutdown) + shutdown = XFS_ERROR(EIO); + } + PFLAGS_RESTORE_FSTRANS(&tp->t_pflags); + xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0); + xfs_trans_free_busy(tp); + xfs_trans_free(tp); + XFS_STATS_INC(xs_trans_empty); + if (commit_lsn_p) + *commit_lsn_p = commit_lsn; + return (shutdown); + } +#if defined(XLOG_NOLOG) || defined(DEBUG) + ASSERT(!xlog_debug || tp->t_ticket != NULL); +#else + ASSERT(tp->t_ticket != NULL); +#endif + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + xfs_trans_apply_sb_deltas(tp); + } + XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp); + + /* + * Ask each log item how many log_vector entries it will + * need so we can figure out how many to allocate. + * Try to avoid the kmem_alloc() call in the common case + * by using a vector from the stack when it fits. + */ + nvec = xfs_trans_count_vecs(tp); + + if (nvec == 0) { + xfs_force_shutdown(mp, XFS_LOG_IO_ERROR); + goto shut_us_down; + } + + + if (nvec <= XFS_TRANS_LOGVEC_COUNT) { + log_vector = log_vector_fast; + } else { + log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * + sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + + /* + * Fill in the log_vector and pin the logged items, and + * then write the transaction to the log. + */ + xfs_trans_fill_vecs(tp, log_vector); + + /* + * Ignore errors here. xfs_log_done would do the right thing. + * We need to put the ticket, etc. away. + */ + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, + &(tp->t_lsn)); + +#if defined(XLOG_NOLOG) || defined(DEBUG) + if (xlog_debug) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, + &commit_iclog, log_flags); + } else { + commit_lsn = 0; + tp->t_lsn = trans_lsn++; + } +#else + /* + * This is the regular case. At this point (after the call finishes), + * the transaction is committed incore and could go out to disk at + * any time. However, all the items associated with the transaction + * are still locked and pinned in memory. + */ + commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); +#endif + + tp->t_commit_lsn = commit_lsn; + if (nvec > XFS_TRANS_LOGVEC_COUNT) { + kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t)); + } + + if (commit_lsn_p) + *commit_lsn_p = commit_lsn; + + /* + * If we got a log write error. Unpin the logitems that we + * had pinned, clean up, free trans structure, and return error. + */ + if (error || commit_lsn == -1) { + PFLAGS_RESTORE_FSTRANS(&tp->t_pflags); + xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); + return XFS_ERROR(EIO); + } + + /* + * Once the transaction has committed, unused + * reservations need to be released and changes to + * the superblock need to be reflected in the in-core + * version. Do that now. + */ + xfs_trans_unreserve_and_mod_sb(tp); + + sync = tp->t_flags & XFS_TRANS_SYNC; + + /* + * Tell the LM to call the transaction completion routine + * when the log write with LSN commit_lsn completes (e.g. + * when the transaction commit really hits the on-disk log). + * After this call we cannot reference tp, because the call + * can happen at any time and the call will free the transaction + * structure pointed to by tp. The only case where we call + * the completion routine (xfs_trans_committed) directly is + * if the log is turned off on a debug kernel or we're + * running in simulation mode (the log is explicitly turned + * off). + */ + tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; + tp->t_logcb.cb_arg = tp; + + /* + * We need to pass the iclog buffer which was used for the + * transaction commit record into this function, and attach + * the callback to it. The callback must be attached before + * the items are unlocked to avoid racing with other threads + * waiting for an item to unlock. + */ + shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); + + /* + * Mark this thread as no longer being in a transaction + */ + PFLAGS_RESTORE_FSTRANS(&tp->t_pflags); + + /* + * Once all the items of the transaction have been copied + * to the in core log and the callback is attached, the + * items can be unlocked. + * + * This will free descriptors pointing to items which were + * not logged since there is nothing more to do with them. + * For items which were logged, we will keep pointers to them + * so they can be unpinned after the transaction commits to disk. + * This will also stamp each modified meta-data item with + * the commit lsn of this transaction for dependency tracking + * purposes. + */ + xfs_trans_unlock_items(tp, commit_lsn); + + /* + * If we detected a log error earlier, finish committing + * the transaction now (unpin log items, etc). + * + * Order is critical here, to avoid using the transaction + * pointer after its been freed (by xfs_trans_committed + * either here now, or as a callback). We cannot do this + * step inside xfs_log_notify as was done earlier because + * of this issue. + */ + if (shutdown) + xfs_trans_committed(tp, XFS_LI_ABORTED); + + /* + * Now that the xfs_trans_committed callback has been attached, + * and the items are released we can finally allow the iclog to + * go to disk. + */ + error = xfs_log_release_iclog(mp, commit_iclog); + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. + */ + if (sync) { + if (!error) + error = xfs_log_force(mp, commit_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC); + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); + } + + return (error); +} + + +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +STATIC uint +xfs_trans_count_vecs( + xfs_trans_t *tp) +{ + int nvecs; + xfs_log_item_desc_t *lidp; + + nvecs = 1; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp != NULL); + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (lidp == NULL) + return 0; + + while (lidp != NULL) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + lidp = xfs_trans_next_item(tp, lidp); + } + + return nvecs; +} + +/* + * Called from the trans_commit code when we notice that + * the filesystem is in the middle of a forced shutdown. + */ +STATIC void +xfs_trans_uncommit( + xfs_trans_t *tp, + uint flags) +{ + xfs_log_item_desc_t *lidp; + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { + /* + * Unpin all but those that aren't dirty. + */ + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN_REMOVE(lidp->lid_item, tp); + } + + xfs_trans_unreserve_and_mod_sb(tp); + XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp); + + xfs_trans_free_items(tp, flags); + xfs_trans_free_busy(tp); + xfs_trans_free(tp); +} + +/* + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). + * + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. + */ +STATIC void +xfs_trans_fill_vecs( + xfs_trans_t *tp, + xfs_log_iovec_t *log_vector) +{ + xfs_log_item_desc_t *lidp; + xfs_log_iovec_t *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; /* pointer arithmetic */ + + nitems = 0; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp != NULL); + while (lidp != NULL) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + /* + * The item may be marked dirty but not log anything. + * This can be used to get called when a transaction + * is committed. + */ + if (lidp->lid_size) { + nitems++; + } + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; /* pointer arithmetic */ + IOP_PIN(lidp->lid_item); + lidp = xfs_trans_next_item(tp, lidp); + } + + /* + * Now that we've counted the number of items in this + * transaction, fill in the transaction header. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); +} + + +/* + * Unlock all of the transaction's items and free the transaction. + * The transaction must not have modified any of its items, because + * there is no way to restore them to their previous state. + * + * If the transaction has made a log reservation, make sure to release + * it as well. + */ +void +xfs_trans_cancel( + xfs_trans_t *tp, + int flags) +{ + int log_flags; +#ifdef DEBUG + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_log_item_t *lip; + int i; +#endif + + /* + * See if the caller is being too lazy to figure out if + * the transaction really needs an abort. + */ + if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) + flags &= ~XFS_TRANS_ABORT; + /* + * See if the caller is relying on us to shut down the + * filesystem. This happens in paths where we detect + * corruption and decide to give up. + */ + if ((tp->t_flags & XFS_TRANS_DIRTY) && + !XFS_FORCED_SHUTDOWN(tp->t_mountp)) + xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE); +#ifdef DEBUG + if (!(flags & XFS_TRANS_ABORT)) { + licp = &(tp->t_items); + while (licp != NULL) { + lidp = licp->lic_descs; + for (i = 0; i < licp->lic_unused; i++, lidp++) { + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + lip = lidp->lid_item; + if (!XFS_FORCED_SHUTDOWN(tp->t_mountp)) + ASSERT(!(lip->li_type == XFS_LI_EFD)); + } + licp = licp->lic_next; + } + } +#endif + xfs_trans_unreserve_and_mod_sb(tp); + XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp); + + if (tp->t_ticket) { + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } else { + log_flags = 0; + } + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + } + + /* mark this thread as no longer being in a transaction */ + PFLAGS_RESTORE_FSTRANS(&tp->t_pflags); + + xfs_trans_free_items(tp, flags); + xfs_trans_free_busy(tp); + xfs_trans_free(tp); +} + + +/* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + xfs_trans_t *tp) +{ + atomic_dec(&tp->t_mountp->m_active_trans); + XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp); + kmem_zone_free(xfs_trans_zone, tp); +} + + +/* + * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). + * + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + * + * Call xfs_trans_chunk_committed() to process the items in + * each chunk. + */ +STATIC void +xfs_trans_committed( + xfs_trans_t *tp, + int abortflag) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + xfs_log_busy_chunk_t *lbcp; + xfs_log_busy_slot_t *lbsp; + int i; + + /* + * Call the transaction's completion callback if there + * is one. + */ + if (tp->t_callback != NULL) { + tp->t_callback(tp, tp->t_callarg); + } + + /* + * Special case the chunk embedded in the transaction. + */ + licp = &(tp->t_items); + if (!(XFS_LIC_ARE_ALL_FREE(licp))) { + xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); + } + + /* + * Process the items in each chunk in turn. + */ + licp = licp->lic_next; + while (licp != NULL) { + ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); + next_licp = licp->lic_next; + kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + licp = next_licp; + } + + /* + * Clear all the per-AG busy list items listed in this transaction + */ + lbcp = &tp->t_busy; + while (lbcp != NULL) { + for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { + if (!XFS_LBC_ISFREE(lbcp, i)) { + xfs_alloc_clear_busy(tp, lbsp->lbc_ag, + lbsp->lbc_idx); + } + } + lbcp = lbcp->lbc_next; + } + xfs_trans_free_busy(tp); + + /* + * That's it for the transaction structure. Free it. + */ + xfs_trans_free(tp); +} + +/* + * This is called to perform the commit processing for each + * item described by the given chunk. + * + * The commit processing consists of unlocking items which were + * held locked with the SYNC_UNLOCK attribute, calling the committed + * routine of each logged item, updating the item's position in the AIL + * if necessary, and unpinning each item. If the committed routine + * returns -1, then do nothing further with the item because it + * may have been freed. + * + * Since items are unlocked when they are copied to the incore + * log, it is possible for two transactions to be completing + * and manipulating the same item simultaneously. The AIL lock + * will protect the lsn field of each item. The value of this + * field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because + * otherwise they could be immediately flushed and we'd have to race + * with the flusher trying to pull the item from the AIL as we add it. + */ +STATIC void +xfs_trans_chunk_committed( + xfs_log_item_chunk_t *licp, + xfs_lsn_t lsn, + int aborted) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_t *lip; + xfs_lsn_t item_lsn; + struct xfs_mount *mp; + int i; + SPLDECL(s); + + lidp = licp->lic_descs; + for (i = 0; i < licp->lic_unused; i++, lidp++) { + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + lip = lidp->lid_item; + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + + /* + * Send in the ABORTED flag to the COMMITTED routine + * so that it knows whether the transaction was aborted + * or not. + */ + item_lsn = IOP_COMMITTED(lip, lsn); + + /* + * If the committed routine returns -1, make + * no more references to the item. + */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) { + continue; + } + + /* + * If the returned lsn is greater than what it + * contained before, update the location of the + * item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it + * is possible that a later transaction completing + * simultaneously with an earlier one using the + * same item could complete first with a higher lsn. + * This would cause the earlier transaction to fail + * the test below. + */ + mp = lip->li_mountp; + AIL_LOCK(mp,s); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn + * and update the position of the item in + * the AIL. + * + * xfs_trans_update_ail() drops the AIL lock. + */ + xfs_trans_update_ail(mp, lip, item_lsn, s); + } else { + AIL_UNLOCK(mp, s); + } + + /* + * Now that we've repositioned the item in the AIL, + * unpin it so it can be flushed. Pass information + * about buffer stale state down from the log item + * flags, if anyone else stales the buffer we do not + * want to pay any attention to it. + */ + IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE); + } +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h new file mode 100644 index 00000000000..bd37ccb85e7 --- /dev/null +++ b/fs/xfs/xfs_trans.h @@ -0,0 +1,1042 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_TRANS_H__ +#define __XFS_TRANS_H__ + +/* + * This is the structure written in the log at the head of + * every transaction. It identifies the type and id of the + * transaction, and contains the number of items logged by + * the transaction so we know how many to expect during recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_5_3_BUF 0x1234 /* v1 bufs, 1-block inode buffers */ +#define XFS_LI_5_3_INODE 0x1235 /* 1-block inode buffers */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_6_1_INODE 0x1239 /* 4K non-aligned inode bufs */ +#define XFS_LI_6_1_BUF 0x123a /* v1, 4K inode buffers */ +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e + +/* + * Transaction types. Used to distinguish types of buffers. + */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +#define XFS_TRANS_WRITE_SYNC 17 +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +/* new transaction types need to be reflected in xfs_logprint(8) */ + + +#ifdef __KERNEL__ +struct xfs_buf; +struct xfs_buftarg; +struct xfs_efd_log_item; +struct xfs_efi_log_item; +struct xfs_inode; +struct xfs_item_ops; +struct xfs_log_iovec; +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot_acct; + +typedef struct xfs_ail_entry { + struct xfs_log_item *ail_forw; /* AIL forw pointer */ + struct xfs_log_item *ail_back; /* AIL back pointer */ +} xfs_ail_entry_t; + +/* + * This structure is passed as a parameter to xfs_trans_push_ail() + * and is used to track the what LSN the waiting processes are + * waiting to become unused. + */ +typedef struct xfs_ail_ticket { + xfs_lsn_t at_lsn; /* lsn waitin for */ + struct xfs_ail_ticket *at_forw; /* wait list ptr */ + struct xfs_ail_ticket *at_back; /* wait list ptr */ + sv_t at_sema; /* wait sema */ +} xfs_ail_ticket_t; + + +typedef struct xfs_log_item { + xfs_ail_entry_t li_ail; /* AIL pointers */ + xfs_lsn_t li_lsn; /* last on-disk lsn */ + struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ + struct xfs_mount *li_mountp; /* ptr to fs mount */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); + /* buffer item iodone */ + /* callback func */ + struct xfs_item_ops *li_ops; /* function list */ +} xfs_log_item_t; + +#define XFS_LI_IN_AIL 0x1 +#define XFS_LI_ABORTED 0x2 + +typedef struct xfs_item_ops { + uint (*iop_size)(xfs_log_item_t *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_pin)(xfs_log_item_t *); + void (*iop_unpin)(xfs_log_item_t *, int); + void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); + uint (*iop_trylock)(xfs_log_item_t *); + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_push)(xfs_log_item_t *); + void (*iop_abort)(xfs_log_item_t *); + void (*iop_pushbuf)(xfs_log_item_t *); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); +} xfs_item_ops_t; + +#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) +#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) +#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) +#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) +#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) +#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) +#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) +#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) +#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip) +#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) +#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) + +/* + * Return values for the IOP_TRYLOCK() routines. + */ +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 +#define XFS_ITEM_PUSHBUF 4 + +#endif /* __KERNEL__ */ + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +typedef struct xfs_log_item_desc { + xfs_log_item_t *lid_item; + ushort lid_size; + unsigned char lid_flags; + unsigned char lid_index; +} xfs_log_item_desc_t; + +#define XFS_LID_DIRTY 0x1 +#define XFS_LID_PINNED 0x2 +#define XFS_LID_BUF_STALE 0x8 + +/* + * This structure is used to maintain a chunk list of log_item_desc + * structures. The free field is a bitmask indicating which descriptors + * in this chunk's array are free. The unused field is the first value + * not used since this chunk was allocated. + */ +#define XFS_LIC_NUM_SLOTS 15 +typedef struct xfs_log_item_chunk { + struct xfs_log_item_chunk *lic_next; + ushort lic_free; + ushort lic_unused; + xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS]; +} xfs_log_item_chunk_t; + +#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1) +#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1) + + +/* + * Initialize the given chunk. Set the chunk's free descriptor mask + * to indicate that all descriptors are free. The caller gets to set + * lic_unused to the right value (0 matches all free). The + * lic_descs.lid_index values are set up as each desc is allocated. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT) +void xfs_lic_init(xfs_log_item_chunk_t *cp); +#define XFS_LIC_INIT(cp) xfs_lic_init(cp) +#else +#define XFS_LIC_INIT(cp) ((cp)->lic_free = XFS_LIC_FREEMASK) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT_SLOT) +void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot); +#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot) +#else +#define XFS_LIC_INIT_SLOT(cp,slot) \ + ((cp)->lic_descs[slot].lid_index = (unsigned char)(slot)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_VACANCY) +int xfs_lic_vacancy(xfs_log_item_chunk_t *cp); +#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp) +#else +#define XFS_LIC_VACANCY(cp) (((cp)->lic_free) & XFS_LIC_FREEMASK) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ALL_FREE) +void xfs_lic_all_free(xfs_log_item_chunk_t *cp); +#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp) +#else +#define XFS_LIC_ALL_FREE(cp) ((cp)->lic_free = XFS_LIC_FREEMASK) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ARE_ALL_FREE) +int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp); +#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp) +#else +#define XFS_LIC_ARE_ALL_FREE(cp) (((cp)->lic_free & XFS_LIC_FREEMASK) ==\ + XFS_LIC_FREEMASK) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ISFREE) +int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot); +#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot) +#else +#define XFS_LIC_ISFREE(cp,slot) ((cp)->lic_free & (1 << (slot))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_CLAIM) +void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot); +#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot) +#else +#define XFS_LIC_CLAIM(cp,slot) ((cp)->lic_free &= ~(1 << (slot))) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_RELSE) +void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot); +#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot) +#else +#define XFS_LIC_RELSE(cp,slot) ((cp)->lic_free |= 1 << (slot)) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_SLOT) +xfs_log_item_desc_t *xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot); +#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot) +#else +#define XFS_LIC_SLOT(cp,slot) (&((cp)->lic_descs[slot])) +#endif +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_SLOT) +int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp); +#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp) +#else +#define XFS_LIC_DESC_TO_SLOT(dp) ((uint)((dp)->lid_index)) +#endif +/* + * Calculate the address of a chunk given a descriptor pointer: + * dp - dp->lid_index give the address of the start of the lic_descs array. + * From this we subtract the offset of the lic_descs field in a chunk. + * All of this yields the address of the chunk, which is + * cast to a chunk pointer. + */ +#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_CHUNK) +xfs_log_item_chunk_t *xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp); +#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp) +#else +#define XFS_LIC_DESC_TO_CHUNK(dp) ((xfs_log_item_chunk_t*) \ + (((xfs_caddr_t)((dp) - (dp)->lid_index)) -\ + (xfs_caddr_t)(((xfs_log_item_chunk_t*) \ + 0)->lic_descs))) +#endif + +#ifdef __KERNEL__ +/* + * This structure is used to maintain a list of block ranges that have been + * freed in the transaction. The ranges are listed in the perag[] busy list + * between when they're freed and the transaction is committed to disk. + */ + +typedef struct xfs_log_busy_slot { + xfs_agnumber_t lbc_ag; + ushort lbc_idx; /* index in perag.busy[] */ +} xfs_log_busy_slot_t; + +#define XFS_LBC_NUM_SLOTS 31 +typedef struct xfs_log_busy_chunk { + struct xfs_log_busy_chunk *lbc_next; + uint lbc_free; /* bitmask of free slots */ + ushort lbc_unused; /* first unused */ + xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; +} xfs_log_busy_chunk_t; + +#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) +#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) + +#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) +#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) +#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) +#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) +#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) + +/* + * This is the type of function which can be given to xfs_trans_callback() + * to be called upon the transaction's commit to disk. + */ +typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); + +/* + * This is the structure maintained for every active transaction. + */ +typedef struct xfs_trans { + unsigned int t_magic; /* magic number */ + xfs_log_callback_t t_logcb; /* log callback struct */ + struct xfs_trans *t_forw; /* async list pointers */ + struct xfs_trans *t_back; /* async list pointers */ + unsigned int t_type; /* transaction type */ + unsigned int t_log_res; /* amt of log space resvd */ + unsigned int t_log_count; /* count for perm log res */ + unsigned int t_blk_res; /* # of blocks resvd */ + unsigned int t_blk_res_used; /* # of resvd blocks used */ + unsigned int t_rtx_res; /* # of rt extents resvd */ + unsigned int t_rtx_res_used; /* # of resvd rt extents used */ + xfs_log_ticket_t t_ticket; /* log mgr ticket */ + sema_t t_sema; /* sema for commit completion */ + xfs_lsn_t t_lsn; /* log seq num of start of + * transaction. */ + xfs_lsn_t t_commit_lsn; /* log seq num of end of + * transaction. */ + struct xfs_mount *t_mountp; /* ptr to fs mount struct */ + struct xfs_dquot_acct *t_dqinfo; /* accting info for dquots */ + xfs_trans_callback_t t_callback; /* transaction callback */ + void *t_callarg; /* callback arg */ + unsigned int t_flags; /* misc flags */ + long t_icount_delta; /* superblock icount change */ + long t_ifree_delta; /* superblock ifree change */ + long t_fdblocks_delta; /* superblock fdblocks chg */ + long t_res_fdblocks_delta; /* on-disk only chg */ + long t_frextents_delta;/* superblock freextents chg*/ + long t_res_frextents_delta; /* on-disk only chg */ + long t_ag_freeblks_delta; /* debugging counter */ + long t_ag_flist_delta; /* debugging counter */ + long t_ag_btree_delta; /* debugging counter */ + long t_dblocks_delta;/* superblock dblocks change */ + long t_agcount_delta;/* superblock agcount change */ + long t_imaxpct_delta;/* superblock imaxpct change */ + long t_rextsize_delta;/* superblock rextsize chg */ + long t_rbmblocks_delta;/* superblock rbmblocks chg */ + long t_rblocks_delta;/* superblock rblocks change */ + long t_rextents_delta;/* superblocks rextents chg */ + long t_rextslog_delta;/* superblocks rextslog chg */ + unsigned int t_items_free; /* log item descs free */ + xfs_log_item_chunk_t t_items; /* first log item desc chunk */ + xfs_trans_header_t t_header; /* header for in-log trans */ + unsigned int t_busy_free; /* busy descs free */ + xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + xfs_pflags_t t_pflags; /* saved pflags state */ +} xfs_trans_t; + +#endif /* __KERNEL__ */ + + +#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ + +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_NOSLEEP 0x1 +#define XFS_TRANS_WAIT 0x2 +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + + +/* + * Various log reservation values. + * These are based on the size of the file system block + * because that is what most transactions manipulate. + * Each adds in an additional 128 bytes per item logged to + * try to account for the overhead of the transaction mechanism. + * + * Note: + * Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() + * call. This is because the number in the worst case is quite high + * and quite unusual. In order to fix this we need to change + * xfs_bmap_finish() to free extents in only a single AG at a time. + * This will require changes to the EFI code as well, however, so that + * the EFI for the extents not freed is logged again in each transaction. + * See bug 261917. + */ + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. + */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. This gives: + * the inode getting the new extents: inode size + * the inode\'s bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_WRITE_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ + (2 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 2) + \ + (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\ + ((2 * (mp)->m_sb.sb_sectsize) + \ + (2 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 2) + \ + (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) + +#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) + +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode\'s bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \ + (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ + ((4 * (mp)->m_sb.sb_sectsize) + \ + (4 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 4) + \ + (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ + (128 * 5) + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) + +#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) + +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_RENAME_LOG_RES(mp) \ + (MAX( \ + ((4 * (mp)->m_sb.sb_inodesize) + \ + (2 * XFS_DIROP_LOG_RES(mp)) + \ + (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \ + ((3 * (mp)->m_sb.sb_sectsize) + \ + (3 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 3) + \ + (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))))) + +#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_LINK_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_inodesize + \ + XFS_DIROP_LOG_RES(mp) + \ + (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ + ((mp)->m_sb.sb_sectsize + \ + (mp)->m_sb.sb_sectsize + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) + +#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_REMOVE_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_inodesize + \ + XFS_DIROP_LOG_RES(mp) + \ + (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ + ((2 * (mp)->m_sb.sb_sectsize) + \ + (2 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 2) + \ + (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) + +#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) + +/* + * For symlink we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: 1 block + * the directory btree: (max depth + v2) * dir block size + * the directory inode\'s bmap btree: (max depth + v2) * block size + * the blocks for the symlink: 1 KB + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_SYMLINK_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_inodesize + \ + XFS_FSB_TO_B(mp, 1) + \ + XFS_DIROP_LOG_RES(mp) + \ + 1024 + \ + (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ + (2 * (mp)->m_sb.sb_sectsize + \ + XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ + XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) + +#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode\'s bmap btree: (max depth + v2) * block size + * Or in the first xact we allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +#define XFS_CALC_CREATE_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_sectsize + \ + XFS_FSB_TO_B(mp, 1) + \ + XFS_DIROP_LOG_RES(mp) + \ + (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ + (3 * (mp)->m_sb.sb_sectsize + \ + XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ + XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) + +#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) + +/* + * Making a new directory is the same as creating a new file. + */ +#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp) + +#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +#define XFS_CALC_IFREE_LOG_RES(mp) \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_sectsize + \ + (mp)->m_sb.sb_sectsize + \ + XFS_FSB_TO_B((mp), 1) + \ + MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ + (128 * 5) + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) + + +#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) + +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ +#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_sectsize + 512) + +#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) + +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ +#define XFS_CALC_GROWDATA_LOG_RES(mp) \ + ((mp)->m_sb.sb_sectsize * 3 + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) + +#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \ + (2 * (mp)->m_sb.sb_sectsize + \ + XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ + (mp)->m_sb.sb_inodesize + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * \ + (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) + +#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \ + ((mp)->m_sb.sb_blocksize + 128) + +#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \ + ((mp)->m_sb.sb_sectsize + \ + 2 * (mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_blocksize + \ + (mp)->m_rsumsize + \ + (128 * 5)) + +#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) + +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ +#define XFS_CALC_SWRITE_LOG_RES(mp) \ + ((mp)->m_sb.sb_inodesize + 128) + +#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) + +/* + * Logging the inode timestamps on an fsync -- same as SWRITE + * as long as SWRITE logs the entire inode core + */ +#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +#define XFS_CALC_WRITEID_LOG_RES(mp) \ + ((mp)->m_sb.sb_inodesize + 128) + +#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +#define XFS_CALC_ADDAFORK_LOG_RES(mp) \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_sectsize * 2 + \ + (mp)->m_dirblksize + \ + (XFS_DIR_IS_V1(mp) ? 0 : \ + XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \ + XFS_ALLOCFREE_LOG_RES(mp, 1) + \ + (128 * (4 + \ + (XFS_DIR_IS_V1(mp) ? 0 : \ + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) + +#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode\'s bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ + (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \ + ((4 * (mp)->m_sb.sb_sectsize) + \ + (4 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 4) + \ + (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))))) + +#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) + +/* + * Setting an attribute. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime. + */ +#define XFS_CALC_ATTRSET_LOG_RES(mp) \ + ((mp)->m_sb.sb_inodesize + \ + (mp)->m_sb.sb_sectsize + \ + XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ + (128 * (2 + XFS_DA_NODE_MAXDEPTH))) + +#define XFS_ATTRSET_LOG_RES(mp, ext) \ + ((mp)->m_reservations.tr_attrset + \ + (ext * (mp)->m_sb.sb_sectsize) + \ + (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ + (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) + +/* + * Removing an attribute. + * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +#define XFS_CALC_ATTRRM_LOG_RES(mp) \ + (MAX( \ + ((mp)->m_sb.sb_inodesize + \ + XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ + XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ + (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ + ((2 * (mp)->m_sb.sb_sectsize) + \ + (2 * (mp)->m_sb.sb_sectsize) + \ + (mp)->m_sb.sb_sectsize + \ + XFS_ALLOCFREE_LOG_RES(mp, 2) + \ + (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) + +#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \ + ((mp)->m_sb.sb_sectsize + 128) + +#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) + + +/* + * Various log count values. + */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +/* + * Here we centralize the specification of XFS meta-data buffer + * reference count values. This determine how hard the buffer + * cache tries to hold onto the buffer. + */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_INO_REF 1 +#define XFS_DQUOT_REF 1 + +#ifdef __KERNEL__ +/* + * XFS transaction mechanism exported interfaces that are + * actually macros. + */ +#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) +#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) +#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) +#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) + +#ifdef DEBUG +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (long)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (long)d) +#else +#define xfs_trans_agblocks_delta(tp, d) +#define xfs_trans_agflist_delta(tp, d) +#define xfs_trans_agbtree_delta(tp, d) +#endif + +/* + * XFS transaction mechanism exported interfaces. + */ +void xfs_trans_init(struct xfs_mount *); +xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *xfs_trans_dup(xfs_trans_t *); +int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, + uint, uint); +void xfs_trans_callback(xfs_trans_t *, + void (*)(xfs_trans_t *, void *), void *); +void xfs_trans_mod_sb(xfs_trans_t *, uint, long); +struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, + int, uint); +int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, + struct xfs_buftarg *, xfs_daddr_t, int, uint, + struct xfs_buf **); +struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); + +void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); +void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); +int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, + xfs_ino_t , uint, uint, struct xfs_inode **); +void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); +void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); +void xfs_trans_ihold_release(xfs_trans_t *, struct xfs_inode *); +void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); +struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); +void xfs_efi_release(struct xfs_efi_log_item *, uint); +void xfs_trans_log_efi_extent(xfs_trans_t *, + struct xfs_efi_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, + struct xfs_efi_log_item *, + uint); +void xfs_trans_log_efd_extent(xfs_trans_t *, + struct xfs_efd_log_item *, + xfs_fsblock_t, + xfs_extlen_t); +int xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *); +void xfs_trans_cancel(xfs_trans_t *, int); +void xfs_trans_ail_init(struct xfs_mount *); +xfs_lsn_t xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); +xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *); +void xfs_trans_unlocked_item(struct xfs_mount *, + xfs_log_item_t *); +xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, + xfs_agnumber_t ag, + xfs_extlen_t idx); + +#endif /* __KERNEL__ */ + +#endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c new file mode 100644 index 00000000000..7bc5eab4c2c --- /dev/null +++ b/fs/xfs/xfs_trans_ail.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" + +STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *); +STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *); + +#ifdef DEBUG +STATIC void xfs_ail_check(xfs_ail_entry_t *); +#else +#define xfs_ail_check(a) +#endif /* DEBUG */ + + +/* + * This is called by the log manager code to determine the LSN + * of the tail of the log. This is exactly the LSN of the first + * item in the AIL. If the AIL is empty, then this function + * returns 0. + * + * We need the AIL lock in order to get a coherent read of the + * lsn of the last item in the AIL. + */ +xfs_lsn_t +xfs_trans_tail_ail( + xfs_mount_t *mp) +{ + xfs_lsn_t lsn; + xfs_log_item_t *lip; + SPLDECL(s); + + AIL_LOCK(mp,s); + lip = xfs_ail_min(&(mp->m_ail)); + if (lip == NULL) { + lsn = (xfs_lsn_t)0; + } else { + lsn = lip->li_lsn; + } + AIL_UNLOCK(mp, s); + + return lsn; +} + +/* + * xfs_trans_push_ail + * + * This routine is called to move the tail of the AIL + * forward. It does this by trying to flush items in the AIL + * whose lsns are below the given threshold_lsn. + * + * The routine returns the lsn of the tail of the log. + */ +xfs_lsn_t +xfs_trans_push_ail( + xfs_mount_t *mp, + xfs_lsn_t threshold_lsn) +{ + xfs_lsn_t lsn; + xfs_log_item_t *lip; + int gen; + int restarts; + int lock_result; + int flush_log; + SPLDECL(s); + +#define XFS_TRANS_PUSH_AIL_RESTARTS 10 + + AIL_LOCK(mp,s); + lip = xfs_trans_first_ail(mp, &gen); + if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) { + /* + * Just return if the AIL is empty. + */ + AIL_UNLOCK(mp, s); + return (xfs_lsn_t)0; + } + + XFS_STATS_INC(xs_push_ail); + + /* + * While the item we are looking at is below the given threshold + * try to flush it out. Make sure to limit the number of times + * we allow xfs_trans_next_ail() to restart scanning from the + * beginning of the list. We'd like not to stop until we've at least + * tried to push on everything in the AIL with an LSN less than + * the given threshold. However, we may give up before that if + * we realize that we've been holding the AIL_LOCK for 'too long', + * blocking interrupts. Currently, too long is < 500us roughly. + */ + flush_log = 0; + restarts = 0; + while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) && + (XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) { + /* + * If we can lock the item without sleeping, unlock + * the AIL lock and flush the item. Then re-grab the + * AIL lock so we can look for the next item on the + * AIL. Since we unlock the AIL while we flush the + * item, the next routine may start over again at the + * the beginning of the list if anything has changed. + * That is what the generation count is for. + * + * If we can't lock the item, either its holder will flush + * it or it is already being flushed or it is being relogged. + * In any of these case it is being taken care of and we + * can just skip to the next item in the list. + */ + lock_result = IOP_TRYLOCK(lip); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + AIL_UNLOCK(mp, s); + XFS_STATS_INC(xs_push_ail_success); + IOP_PUSH(lip); + AIL_LOCK(mp,s); + break; + + case XFS_ITEM_PUSHBUF: + AIL_UNLOCK(mp, s); + XFS_STATS_INC(xs_push_ail_pushbuf); +#ifdef XFSRACEDEBUG + delay_for_intr(); + delay(300); +#endif + ASSERT(lip->li_ops->iop_pushbuf); + ASSERT(lip); + IOP_PUSHBUF(lip); + AIL_LOCK(mp,s); + break; + + case XFS_ITEM_PINNED: + XFS_STATS_INC(xs_push_ail_pinned); + flush_log = 1; + break; + + case XFS_ITEM_LOCKED: + XFS_STATS_INC(xs_push_ail_locked); + break; + + case XFS_ITEM_FLUSHING: + XFS_STATS_INC(xs_push_ail_flushing); + break; + + default: + ASSERT(0); + break; + } + + lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); + if (lip == NULL) { + break; + } + if (XFS_FORCED_SHUTDOWN(mp)) { + /* + * Just return if we shut down during the last try. + */ + AIL_UNLOCK(mp, s); + return (xfs_lsn_t)0; + } + + } + + if (flush_log) { + /* + * If something we need to push out was pinned, then + * push out the log so it will become unpinned and + * move forward in the AIL. + */ + AIL_UNLOCK(mp, s); + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + AIL_LOCK(mp, s); + } + + lip = xfs_ail_min(&(mp->m_ail)); + if (lip == NULL) { + lsn = (xfs_lsn_t)0; + } else { + lsn = lip->li_lsn; + } + + AIL_UNLOCK(mp, s); + return lsn; +} /* xfs_trans_push_ail */ + + +/* + * This is to be called when an item is unlocked that may have + * been in the AIL. It will wake up the first member of the AIL + * wait list if this item's unlocking might allow it to progress. + * If the item is in the AIL, then we need to get the AIL lock + * while doing our checking so we don't race with someone going + * to sleep waiting for this event in xfs_trans_push_ail(). + */ +void +xfs_trans_unlocked_item( + xfs_mount_t *mp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *min_lip; + + /* + * If we're forcibly shutting down, we may have + * unlocked log items arbitrarily. The last thing + * we want to do is to move the tail of the log + * over some potentially valid data. + */ + if (!(lip->li_flags & XFS_LI_IN_AIL) || + XFS_FORCED_SHUTDOWN(mp)) { + return; + } + + /* + * This is the one case where we can call into xfs_ail_min() + * without holding the AIL lock because we only care about the + * case where we are at the tail of the AIL. If the object isn't + * at the tail, it doesn't matter what result we get back. This + * is slightly racy because since we were just unlocked, we could + * go to sleep between the call to xfs_ail_min and the call to + * xfs_log_move_tail, have someone else lock us, commit to us disk, + * move us out of the tail of the AIL, and then we wake up. However, + * the call to xfs_log_move_tail() doesn't do anything if there's + * not enough free space to wake people up so we're safe calling it. + */ + min_lip = xfs_ail_min(&mp->m_ail); + + if (min_lip == lip) + xfs_log_move_tail(mp, 1); +} /* xfs_trans_unlocked_item */ + + +/* + * Update the position of the item in the AIL with the new + * lsn. If it is not yet in the AIL, add it. Otherwise, move + * it to its new position by removing it and re-adding it. + * + * Wakeup anyone with an lsn less than the item's lsn. If the item + * we move in the AIL is the minimum one, update the tail lsn in the + * log manager. + * + * Increment the AIL's generation count to indicate that the tree + * has changed. + * + * This function must be called with the AIL lock held. The lock + * is dropped before returning, so the caller must pass in the + * cookie returned by AIL_LOCK. + */ +void +xfs_trans_update_ail( + xfs_mount_t *mp, + xfs_log_item_t *lip, + xfs_lsn_t lsn, + unsigned long s) +{ + xfs_ail_entry_t *ailp; + xfs_log_item_t *dlip=NULL; + xfs_log_item_t *mlip; /* ptr to minimum lip */ + + ailp = &(mp->m_ail); + mlip = xfs_ail_min(ailp); + + if (lip->li_flags & XFS_LI_IN_AIL) { + dlip = xfs_ail_delete(ailp, lip); + ASSERT(dlip == lip); + } else { + lip->li_flags |= XFS_LI_IN_AIL; + } + + lip->li_lsn = lsn; + + xfs_ail_insert(ailp, lip); + mp->m_ail_gen++; + + if (mlip == dlip) { + mlip = xfs_ail_min(&(mp->m_ail)); + AIL_UNLOCK(mp, s); + xfs_log_move_tail(mp, mlip->li_lsn); + } else { + AIL_UNLOCK(mp, s); + } + + +} /* xfs_trans_update_ail */ + +/* + * Delete the given item from the AIL. It must already be in + * the AIL. + * + * Wakeup anyone with an lsn less than item's lsn. If the item + * we delete in the AIL is the minimum one, update the tail lsn in the + * log manager. + * + * Clear the IN_AIL flag from the item, reset its lsn to 0, and + * bump the AIL's generation count to indicate that the tree + * has changed. + * + * This function must be called with the AIL lock held. The lock + * is dropped before returning, so the caller must pass in the + * cookie returned by AIL_LOCK. + */ +void +xfs_trans_delete_ail( + xfs_mount_t *mp, + xfs_log_item_t *lip, + unsigned long s) +{ + xfs_ail_entry_t *ailp; + xfs_log_item_t *dlip; + xfs_log_item_t *mlip; + + if (lip->li_flags & XFS_LI_IN_AIL) { + ailp = &(mp->m_ail); + mlip = xfs_ail_min(ailp); + dlip = xfs_ail_delete(ailp, lip); + ASSERT(dlip == lip); + + + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + mp->m_ail_gen++; + + if (mlip == dlip) { + mlip = xfs_ail_min(&(mp->m_ail)); + AIL_UNLOCK(mp, s); + xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); + } else { + AIL_UNLOCK(mp, s); + } + } + else { + /* + * If the file system is not being shutdown, we are in + * serious trouble if we get to this stage. + */ + if (XFS_FORCED_SHUTDOWN(mp)) + AIL_UNLOCK(mp, s); + else { + xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, + "xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL"); + xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); + AIL_UNLOCK(mp, s); + } + } +} + + + +/* + * Return the item in the AIL with the smallest lsn. + * Return the current tree generation number for use + * in calls to xfs_trans_next_ail(). + */ +xfs_log_item_t * +xfs_trans_first_ail( + xfs_mount_t *mp, + int *gen) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(&(mp->m_ail)); + *gen = (int)mp->m_ail_gen; + + return (lip); +} + +/* + * If the generation count of the tree has not changed since the + * caller last took something from the AIL, then return the elmt + * in the tree which follows the one given. If the count has changed, + * then return the minimum elmt of the AIL and bump the restarts counter + * if one is given. + */ +xfs_log_item_t * +xfs_trans_next_ail( + xfs_mount_t *mp, + xfs_log_item_t *lip, + int *gen, + int *restarts) +{ + xfs_log_item_t *nlip; + + ASSERT(mp && lip && gen); + if (mp->m_ail_gen == *gen) { + nlip = xfs_ail_next(&(mp->m_ail), lip); + } else { + nlip = xfs_ail_min(&(mp->m_ail)); + *gen = (int)mp->m_ail_gen; + if (restarts != NULL) { + XFS_STATS_INC(xs_push_ail_restarts); + (*restarts)++; + } + } + + return (nlip); +} + + +/* + * The active item list (AIL) is a doubly linked list of log + * items sorted by ascending lsn. The base of the list is + * a forw/back pointer pair embedded in the xfs mount structure. + * The base is initialized with both pointers pointing to the + * base. This case always needs to be distinguished, because + * the base has no lsn to look at. We almost always insert + * at the end of the list, so on inserts we search from the + * end of the list to find where the new item belongs. + */ + +/* + * Initialize the doubly linked list to point only to itself. + */ +void +xfs_trans_ail_init( + xfs_mount_t *mp) +{ + mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail); + mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail); +} + +/* + * Insert the given log item into the AIL. + * We almost always insert at the end of the list, so on inserts + * we search from the end of the list to find where the + * new item belongs. + */ +STATIC void +xfs_ail_insert( + xfs_ail_entry_t *base, + xfs_log_item_t *lip) +/* ARGSUSED */ +{ + xfs_log_item_t *next_lip; + + /* + * If the list is empty, just insert the item. + */ + if (base->ail_back == (xfs_log_item_t*)base) { + base->ail_forw = lip; + base->ail_back = lip; + lip->li_ail.ail_forw = (xfs_log_item_t*)base; + lip->li_ail.ail_back = (xfs_log_item_t*)base; + return; + } + + next_lip = base->ail_back; + while ((next_lip != (xfs_log_item_t*)base) && + (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) { + next_lip = next_lip->li_ail.ail_back; + } + ASSERT((next_lip == (xfs_log_item_t*)base) || + (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); + lip->li_ail.ail_forw = next_lip->li_ail.ail_forw; + lip->li_ail.ail_back = next_lip; + next_lip->li_ail.ail_forw = lip; + lip->li_ail.ail_forw->li_ail.ail_back = lip; + + xfs_ail_check(base); + return; +} + +/* + * Delete the given item from the AIL. Return a pointer to the item. + */ +/*ARGSUSED*/ +STATIC xfs_log_item_t * +xfs_ail_delete( + xfs_ail_entry_t *base, + xfs_log_item_t *lip) +/* ARGSUSED */ +{ + lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back; + lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw; + lip->li_ail.ail_forw = NULL; + lip->li_ail.ail_back = NULL; + + xfs_ail_check(base); + return lip; +} + +/* + * Return a pointer to the first item in the AIL. + * If the AIL is empty, then return NULL. + */ +STATIC xfs_log_item_t * +xfs_ail_min( + xfs_ail_entry_t *base) +/* ARGSUSED */ +{ + register xfs_log_item_t *forw = base->ail_forw; + if (forw == (xfs_log_item_t*)base) { + return NULL; + } + return forw; +} + +/* + * Return a pointer to the item which follows + * the given item in the AIL. If the given item + * is the last item in the list, then return NULL. + */ +STATIC xfs_log_item_t * +xfs_ail_next( + xfs_ail_entry_t *base, + xfs_log_item_t *lip) +/* ARGSUSED */ +{ + if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) { + return NULL; + } + return lip->li_ail.ail_forw; + +} + +#ifdef DEBUG +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + xfs_ail_entry_t *base) +{ + xfs_log_item_t *lip; + xfs_log_item_t *prev_lip; + + lip = base->ail_forw; + if (lip == (xfs_log_item_t*)base) { + /* + * Make sure the pointers are correct when the list + * is empty. + */ + ASSERT(base->ail_back == (xfs_log_item_t*)base); + return; + } + + /* + * Walk the list checking forward and backward pointers, + * lsn ordering, and that every entry has the XFS_LI_IN_AIL + * flag set. + */ + prev_lip = (xfs_log_item_t*)base; + while (lip != (xfs_log_item_t*)base) { + if (prev_lip != (xfs_log_item_t*)base) { + ASSERT(prev_lip->li_ail.ail_forw == lip); + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + } + ASSERT(lip->li_ail.ail_back == prev_lip); + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = lip; + lip = lip->li_ail.ail_forw; + } + ASSERT(lip == (xfs_log_item_t*)base); + ASSERT(base->ail_back == prev_lip); +} +#endif /* DEBUG */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c new file mode 100644 index 00000000000..a9682b9510c --- /dev/null +++ b/fs/xfs/xfs_trans_buf.c @@ -0,0 +1,1093 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_rw.h" + + +STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, + xfs_daddr_t, int); +STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, + xfs_daddr_t, int); + + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it is already locked + * within the transaction, just increment its lock recursion count + * and return a pointer to it. + * + * Use the fast path function xfs_trans_buf_item_match() or the buffer + * cache routine incore_match() to find the buffer + * if it is already owned by this transaction. + * + * If we don't already own the buffer, use get_buf() to get it. + * If it doesn't yet have an associated xfs_buf_log_item structure, + * then allocate one and add the item to this transaction. + * + * If the transaction pointer is NULL, make this just a normal + * get_buf() call. + */ +xfs_buf_t * +xfs_trans_get_buf(xfs_trans_t *tp, + xfs_buftarg_t *target_dev, + xfs_daddr_t blkno, + int len, + uint flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + if (flags == 0) + flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; + + /* + * Default to a normal get_buf() call if the tp is NULL. + */ + if (tp == NULL) { + bp = xfs_buf_get_flags(target_dev, blkno, len, + flags | BUF_BUSY); + return(bp); + } + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + if (tp->t_items.lic_next == NULL) { + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); + } else { + bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len); + } + if (bp != NULL) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buftrace("TRANS GET RECUR SHUT", bp); + XFS_BUF_SUPER_STALE(bp); + } + /* + * If the buffer is stale then it was binval'ed + * since last read. This doesn't matter since the + * caller isn't allowed to use the data anyway. + */ + else if (XFS_BUF_ISSTALE(bp)) { + xfs_buftrace("TRANS GET RECUR STALE", bp); + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + } + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + xfs_buftrace("TRANS GET RECUR", bp); + xfs_buf_item_trace("GET RECUR", bip); + return (bp); + } + + /* + * We always specify the BUF_BUSY flag within a transaction so + * that get_buf does not try to push out a delayed write buffer + * which might cause another transaction to take place (if the + * buffer was delayed alloc). Such recursive transactions can + * easily deadlock with our current transaction as well as cause + * us to run out of stack space. + */ + bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY); + if (bp == NULL) { + return NULL; + } + + ASSERT(!XFS_BUF_GETERROR(bp)); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + + /* + * Set the recursion count for the buffer within this transaction + * to 0. + */ + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + + xfs_buftrace("TRANS GET", bp); + xfs_buf_item_trace("GET", bip); + return (bp); +} + +/* + * Get and lock the superblock buffer of this file system for the + * given transaction. + * + * We don't need to use incore_match() here, because the superblock + * buffer is a private buffer which we keep a pointer to in the + * mount structure. + */ +xfs_buf_t * +xfs_trans_getsb(xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + /* + * Default to just trying to lock the superblock buffer + * if tp is NULL. + */ + if (tp == NULL) { + return (xfs_getsb(mp, flags)); + } + + /* + * If the superblock buffer already has this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = mp->m_sb_bp; + if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) { + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + xfs_buf_item_trace("GETSB RECUR", bip); + return (bp); + } + + bp = xfs_getsb(mp, flags); + if (bp == NULL) { + return NULL; + } + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, mp); + + /* + * Set the recursion count for the buffer within this transaction + * to 0. + */ + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + + xfs_buf_item_trace("GETSB", bip); + return (bp); +} + +#ifdef DEBUG +xfs_buftarg_t *xfs_error_target; +int xfs_do_error; +int xfs_req_num; +int xfs_error_mod = 33; +#endif + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it has not yet been + * read in, read it from disk. If it is already locked + * within the transaction and already read in, just increment its + * lock recursion count and return a pointer to it. + * + * Use the fast path function xfs_trans_buf_item_match() or the buffer + * cache routine incore_match() to find the buffer + * if it is already owned by this transaction. + * + * If we don't already own the buffer, use read_buf() to get it. + * If it doesn't yet have an associated xfs_buf_log_item structure, + * then allocate one and add the item to this transaction. + * + * If the transaction pointer is NULL, make this just a normal + * read_buf() call. + */ +int +xfs_trans_read_buf( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len, + uint flags, + xfs_buf_t **bpp) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + int error; + + if (flags == 0) + flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; + + /* + * Default to a normal get_buf() call if the tp is NULL. + */ + if (tp == NULL) { + bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + if (!bp) + return XFS_ERROR(ENOMEM); + + if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + return error; + } +#ifdef DEBUG + if (xfs_do_error && (bp != NULL)) { + if (xfs_error_target == target) { + if (((xfs_req_num++) % xfs_error_mod) == 0) { + xfs_buf_relse(bp); + printk("Returning error!\n"); + return XFS_ERROR(EIO); + } + } + } +#endif + if (XFS_FORCED_SHUTDOWN(mp)) + goto shutdown_abort; + *bpp = bp; + return 0; + } + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. If it is already read in we just increment + * the lock recursion count and return the buffer to the caller. + * If the buffer is not yet read in, then we read it in, increment + * the lock recursion count, and return it to the caller. + */ + if (tp->t_items.lic_next == NULL) { + bp = xfs_trans_buf_item_match(tp, target, blkno, len); + } else { + bp = xfs_trans_buf_item_match_all(tp, target, blkno, len); + } + if (bp != NULL) { + ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT((XFS_BUF_ISERROR(bp)) == 0); + if (!(XFS_BUF_ISDONE(bp))) { + xfs_buftrace("READ_BUF_INCORE !DONE", bp); + ASSERT(!XFS_BUF_ISASYNC(bp)); + XFS_BUF_READ(bp); + xfsbdstrat(tp->t_mountp, bp); + xfs_iowait(bp); + if (XFS_BUF_GETERROR(bp) != 0) { + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + error = XFS_BUF_GETERROR(bp); + xfs_buf_relse(bp); + /* + * We can gracefully recover from most + * read errors. Ones we can't are those + * that happen after the transaction's + * already dirty. + */ + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, + XFS_METADATA_IO_ERROR); + return error; + } + } + /* + * We never locked this buf ourselves, so we shouldn't + * brelse it either. Just get out. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp); + *bpp = NULL; + return XFS_ERROR(EIO); + } + + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + bip->bli_recur++; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + xfs_buf_item_trace("READ RECUR", bip); + *bpp = bp; + return 0; + } + + /* + * We always specify the BUF_BUSY flag within a transaction so + * that get_buf does not try to push out a delayed write buffer + * which might cause another transaction to take place (if the + * buffer was delayed alloc). Such recursive transactions can + * easily deadlock with our current transaction as well as cause + * us to run out of stack space. + */ + bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + if (bp == NULL) { + *bpp = NULL; + return 0; + } + if (XFS_BUF_GETERROR(bp) != 0) { + XFS_BUF_SUPER_STALE(bp); + xfs_buftrace("READ ERROR", bp); + error = XFS_BUF_GETERROR(bp); + + xfs_ioerror_alert("xfs_trans_read_buf", mp, + bp, blkno); + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR); + xfs_buf_relse(bp); + return error; + } +#ifdef DEBUG + if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { + if (xfs_error_target == target) { + if (((xfs_req_num++) % xfs_error_mod) == 0) { + xfs_force_shutdown(tp->t_mountp, + XFS_METADATA_IO_ERROR); + xfs_buf_relse(bp); + printk("Returning error in trans!\n"); + return XFS_ERROR(EIO); + } + } + } +#endif + if (XFS_FORCED_SHUTDOWN(mp)) + goto shutdown_abort; + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + + /* + * Set the recursion count for the buffer within this transaction + * to 0. + */ + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + + xfs_buftrace("TRANS READ", bp); + xfs_buf_item_trace("READ", bip); + *bpp = bp; + return 0; + +shutdown_abort: + /* + * the theory here is that buffer is good but we're + * bailing out because the filesystem is being forcibly + * shut down. So we should leave the b_flags alone since + * the buffer's not staled and just get out. + */ +#if defined(DEBUG) + if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) + cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); +#endif + ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != + (XFS_B_STALE|XFS_B_DELWRI)); + + xfs_buftrace("READ_BUF XFSSHUTDN", bp); + xfs_buf_relse(bp); + *bpp = NULL; + return XFS_ERROR(EIO); +} + + +/* + * Release the buffer bp which was previously acquired with one of the + * xfs_trans_... buffer allocation routines if the buffer has not + * been modified within this transaction. If the buffer is modified + * within this transaction, do decrement the recursion count but do + * not release the buffer even if the count goes to 0. If the buffer is not + * modified within the transaction, decrement the recursion count and + * release the buffer if the recursion count goes to 0. + * + * If the buffer is to be released and it was not modified before + * this transaction began, then free the buf_log_item associated with it. + * + * If the transaction pointer is NULL, make this just a normal + * brelse() call. + */ +void +xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + xfs_log_item_t *lip; + xfs_log_item_desc_t *lidp; + + /* + * Default to a normal brelse() call if the tp is NULL. + */ + if (tp == NULL) { + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + /* + * If there's a buf log item attached to the buffer, + * then let the AIL know that the buffer is being + * unlocked. + */ + if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (lip->li_type == XFS_LI_BUF) { + bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); + xfs_trans_unlocked_item( + bip->bli_item.li_mountp, + lip); + } + } + xfs_buf_relse(bp); + return; + } + + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + /* + * Find the item descriptor pointing to this buffer's + * log item. It must be there. + */ + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); + ASSERT(lidp != NULL); + + /* + * If the release is just for a recursive lock, + * then decrement the count and return. + */ + if (bip->bli_recur > 0) { + bip->bli_recur--; + xfs_buf_item_trace("RELSE RECUR", bip); + return; + } + + /* + * If the buffer is dirty within this transaction, we can't + * release it until we commit. + */ + if (lidp->lid_flags & XFS_LID_DIRTY) { + xfs_buf_item_trace("RELSE DIRTY", bip); + return; + } + + /* + * If the buffer has been invalidated, then we can't release + * it until the transaction commits to disk unless it is re-dirtied + * as part of this transaction. This prevents us from pulling + * the item from the AIL before we should. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + xfs_buf_item_trace("RELSE STALE", bip); + return; + } + + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + xfs_buf_item_trace("RELSE", bip); + + /* + * Free up the log item descriptor tracking the released item. + */ + xfs_trans_free_item(tp, lidp); + + /* + * Clear the hold flag in the buf log item if it is set. + * We wouldn't want the next user of the buffer to + * get confused. + */ + if (bip->bli_flags & XFS_BLI_HOLD) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Drop our reference to the buf log item. + */ + atomic_dec(&bip->bli_refcount); + + /* + * If the buf item is not tracking data in the log, then + * we must free it before releasing the buffer back to the + * free pool. Before releasing the buffer to the free pool, + * clear the transaction pointer in b_fsprivate2 to dissolve + * its relation to this transaction. + */ + if (!xfs_buf_item_dirty(bip)) { +/*** + ASSERT(bp->b_pincount == 0); +***/ + ASSERT(atomic_read(&bip->bli_refcount) == 0); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); + xfs_buf_item_relse(bp); + bip = NULL; + } + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + + /* + * If we've still got a buf log item on the buffer, then + * tell the AIL that the buffer is being unlocked. + */ + if (bip != NULL) { + xfs_trans_unlocked_item(bip->bli_item.li_mountp, + (xfs_log_item_t*)bip); + } + + xfs_buf_relse(bp); + return; +} + +/* + * Add the locked buffer to the transaction. + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +void +xfs_trans_bjoin(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + + xfs_buf_item_trace("BJOIN", bip); +} + +/* + * Mark the buffer as not needing to be unlocked when the buf item's + * IOP_UNLOCK() routine is called. The buffer must already be locked + * and associated with the given transaction. + */ +/* ARGSUSED */ +void +xfs_trans_bhold(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_flags |= XFS_BLI_HOLD; + xfs_buf_item_trace("BHOLD", bip); +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf(xfs_trans_t *tp, + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip; + xfs_log_item_desc_t *lidp; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); + ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) || + (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks)); + + /* + * Mark the buffer as needing to be written out eventually, + * and set its iodone function to remove the buffer's buf log + * item from the AIL and free it when the buffer is flushed + * to disk. See xfs_buf_attach_iodone() for more details + * on li_cb and xfs_buf_iodone_callbacks(). + * If we end up aborting this transaction, we trap this buffer + * inside the b_bdstrat callback so that this won't get written to + * disk. + */ + XFS_BUF_DELAYWRITE(bp); + XFS_BUF_DONE(bp); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; + + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer + * again. There are no races with the code in xfs_buf_item_unpin(), + * because we have a reference to the buffer this entire time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + xfs_buf_item_trace("BLOG UNSTALE", bip); + bip->bli_flags &= ~XFS_BLI_STALE; + ASSERT(XFS_BUF_ISSTALE(bp)); + XFS_BUF_UNSTALE(bp); + bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + } + + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); + ASSERT(lidp != NULL); + + tp->t_flags |= XFS_TRANS_DIRTY; + lidp->lid_flags |= XFS_LID_DIRTY; + lidp->lid_flags &= ~XFS_LID_BUF_STALE; + bip->bli_flags |= XFS_BLI_LOGGED; + xfs_buf_item_log(bip, first, last); + xfs_buf_item_trace("BLOG", bip); +} + + +/* + * This called to invalidate a buffer that is being used within + * a transaction. Typically this is because the blocks in the + * buffer are being freed, so we need to prevent it from being + * written out when we're done. Allowing it to be written again + * might overwrite data in the free blocks if they are reallocated + * to a file. + * + * We prevent the buffer from being written out by clearing the + * B_DELWRI flag. We can't always + * get rid of the buf log item at this point, though, because + * the buffer may still be pinned by another transaction. If that + * is the case, then we'll wait until the buffer is committed to + * disk for the last time (we can tell by the ref count) and + * free it in xfs_buf_item_unpin(). Until it is cleaned up we + * will keep the buffer locked so that the buffer and buf log item + * are not reused. + */ +void +xfs_trans_binval( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); + ASSERT(lidp != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * If the buffer is already invalidated, then + * just return. + */ + ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(lidp->lid_flags & XFS_LID_DIRTY); + ASSERT(tp->t_flags & XFS_TRANS_DIRTY); + xfs_buftrace("XFS_BINVAL RECUR", bp); + xfs_buf_item_trace("BINVAL RECUR", bip); + return; + } + + /* + * Clear the dirty bit in the buffer and set the STALE flag + * in the buf log item. The STALE flag will be used in + * xfs_buf_item_unpin() to determine if it should clean up + * when the last reference to the buf item is given up. + * We set the XFS_BLI_CANCEL flag in the buf log format structure + * and log the buf item. This will be used at recovery time + * to determine that copies of the buffer in the log before + * this should not be replayed. + * We mark the item descriptor and the transaction dirty so + * that we'll hold the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state + * about which parts of the buffer have been logged. We also + * clear the flag indicating that this is an inode buffer since + * the data in the buffer will no longer be valid. + * + * We set the stale bit in the buffer as well since we're getting + * rid of it. + */ + XFS_BUF_UNDELAYWRITE(bp); + XFS_BUF_STALE(bp); + bip->bli_flags |= XFS_BLI_STALE; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLI_CANCEL; + memset((char *)(bip->bli_format.blf_data_map), 0, + (bip->bli_format.blf_map_size * sizeof(uint))); + lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_buftrace("XFS_BINVAL", bp); + xfs_buf_item_trace("BINVAL", bip); +} + +/* + * This call is used to indicate that the buffer contains on-disk + * inodes which must be handled specially during recovery. They + * require special handling because only the di_next_unlinked from + * the inodes in the buffer should be recovered. The rest of the + * data in the buffer is logged via the inodes themselves. + * + * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log + * format structure so that we'll know what to do at recovery time. + */ +/* ARGSUSED */ +void +xfs_trans_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; +} + +/* + * This call is used to indicate that the buffer is going to + * be staled and was an inode buffer. This means it gets + * special processing during unpin - where any inodes + * associated with the buffer should be removed from ail. + * There is also special processing during recovery, + * any replay of the inodes in the buffer needs to be + * prevented as the buffer may have been reused. + */ +void +xfs_trans_stale_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_STALE_INODE; + bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) + xfs_buf_iodone; +} + + + +/* + * Mark the buffer as being one which contains newly allocated + * inodes. We need to make sure that even if this buffer is + * relogged as an 'inode buf' we still recover all of the inode + * images in the face of a crash. This works in coordination with + * xfs_buf_item_committed() to ensure that the buffer remains in the + * AIL at its original location even after it has been relogged. + */ +/* ARGSUSED */ +void +xfs_trans_inode_alloc_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; +} + + +/* + * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of + * dquots. However, unlike in inode buffer recovery, dquot buffers get + * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). + * The only thing that makes dquot buffers different from regular + * buffers is that we must not replay dquot bufs when recovering + * if a _corresponding_ quotaoff has happened. We also have to distinguish + * between usr dquot bufs and grp dquot bufs, because usr and grp quotas + * can be turned off independently. + */ +/* ARGSUSED */ +void +xfs_trans_dquot_buf( + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + ASSERT(type == XFS_BLI_UDQUOT_BUF || + type == XFS_BLI_GDQUOT_BUF); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_format.blf_flags |= type; +} + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. Only check the first, embedded + * chunk, since we don't want to spend all day scanning large transactions. + */ +STATIC xfs_buf_t * +xfs_trans_buf_item_match( + xfs_trans_t *tp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *blip; + xfs_buf_t *bp; + int i; + + bp = NULL; + len = BBTOB(len); + licp = &tp->t_items; + if (!XFS_LIC_ARE_ALL_FREE(licp)) { + for (i = 0; i < licp->lic_unused; i++) { + /* + * Skip unoccupied slots. + */ + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + lidp = XFS_LIC_SLOT(licp, i); + blip = (xfs_buf_log_item_t *)lidp->lid_item; + if (blip->bli_item.li_type != XFS_LI_BUF) { + continue; + } + + bp = blip->bli_buf; + if ((XFS_BUF_TARGET(bp) == target) && + (XFS_BUF_ADDR(bp) == blkno) && + (XFS_BUF_COUNT(bp) == len)) { + /* + * We found it. Break out and + * return the pointer to the buffer. + */ + break; + } else { + bp = NULL; + } + } + } + return bp; +} + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. Check all the chunks, we + * want to be thorough. + */ +STATIC xfs_buf_t * +xfs_trans_buf_item_match_all( + xfs_trans_t *tp, + xfs_buftarg_t *target, + xfs_daddr_t blkno, + int len) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *blip; + xfs_buf_t *bp; + int i; + + bp = NULL; + len = BBTOB(len); + for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { + if (XFS_LIC_ARE_ALL_FREE(licp)) { + ASSERT(licp == &tp->t_items); + ASSERT(licp->lic_next == NULL); + return NULL; + } + for (i = 0; i < licp->lic_unused; i++) { + /* + * Skip unoccupied slots. + */ + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + lidp = XFS_LIC_SLOT(licp, i); + blip = (xfs_buf_log_item_t *)lidp->lid_item; + if (blip->bli_item.li_type != XFS_LI_BUF) { + continue; + } + + bp = blip->bli_buf; + if ((XFS_BUF_TARGET(bp) == target) && + (XFS_BUF_ADDR(bp) == blkno) && + (XFS_BUF_COUNT(bp) == len)) { + /* + * We found it. Break out and + * return the pointer to the buffer. + */ + return bp; + } + } + } + return NULL; +} diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c new file mode 100644 index 00000000000..93259a15f98 --- /dev/null +++ b/fs/xfs/xfs_trans_extfree.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_extfree_item.h" + +/* + * This routine is called to allocate an "extent free intention" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efi_log_item_t * +xfs_trans_get_efi(xfs_trans_t *tp, + uint nextents) +{ + xfs_efi_log_item_t *efip; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efip = xfs_efi_init(tp->t_mountp, nextents); + ASSERT(efip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); + + return (efip); +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as needing to be freed. It should + * be called once for each extent to be freed. + */ +void +xfs_trans_log_efi_extent(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + xfs_log_item_desc_t *lidp; + uint next_extent; + xfs_extent_t *extp; + + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip); + ASSERT(lidp != NULL); + + tp->t_flags |= XFS_TRANS_DIRTY; + lidp->lid_flags |= XFS_LID_DIRTY; + + next_extent = efip->efi_next_extent; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &(efip->efi_format.efi_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; + efip->efi_next_extent++; +} + + +/* + * This routine is called to allocate an "extent free done" + * log item that will hold nextents worth of extents. The + * caller must use all nextents extents, because we are not + * flexible about this at all. + */ +xfs_efd_log_item_t * +xfs_trans_get_efd(xfs_trans_t *tp, + xfs_efi_log_item_t *efip, + uint nextents) +{ + xfs_efd_log_item_t *efdp; + + ASSERT(tp != NULL); + ASSERT(nextents > 0); + + efdp = xfs_efd_init(tp->t_mountp, efip, nextents); + ASSERT(efdp != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); + + return (efdp); +} + +/* + * This routine is called to indicate that the described + * extent is to be logged as having been freed. It should + * be called once for each extent freed. + */ +void +xfs_trans_log_efd_extent(xfs_trans_t *tp, + xfs_efd_log_item_t *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) +{ + xfs_log_item_desc_t *lidp; + uint next_extent; + xfs_extent_t *extp; + + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp); + ASSERT(lidp != NULL); + + tp->t_flags |= XFS_TRANS_DIRTY; + lidp->lid_flags |= XFS_LID_DIRTY; + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = start_block; + extp->ext_len = ext_len; + efdp->efd_next_extent++; +} diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c new file mode 100644 index 00000000000..e2c3706f453 --- /dev/null +++ b/fs/xfs/xfs_trans_inode.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" + +#ifdef XFS_TRANS_DEBUG +STATIC void +xfs_trans_inode_broot_debug( + xfs_inode_t *ip); +#else +#define xfs_trans_inode_broot_debug(ip) +#endif + + +/* + * Get and lock the inode for the caller if it is not already + * locked within the given transaction. If it is already locked + * within the transaction, just increment its lock recursion count + * and return a pointer to it. + * + * For an inode to be locked in a transaction, the inode lock, as + * opposed to the io lock, must be taken exclusively. This ensures + * that the inode can be involved in only 1 transaction at a time. + * Lock recursion is handled on the io lock, but only for lock modes + * of equal or lesser strength. That is, you can recur on the io lock + * held EXCL with a SHARED request but not vice versa. Also, if + * the inode is already a part of the transaction then you cannot + * go from not holding the io lock to having it EXCL or SHARED. + * + * Use the inode cache routine xfs_inode_incore() to find the inode + * if it is already owned by this transaction. + * + * If we don't already own the inode, use xfs_iget() to get it. + * Since the inode log item structure is embedded in the incore + * inode structure and is initialized when the inode is brought + * into memory, there is nothing to do with it here. + * + * If the given transaction pointer is NULL, just call xfs_iget(). + * This simplifies code which must handle both cases. + */ +int +xfs_trans_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + int error; + xfs_inode_t *ip; + xfs_inode_log_item_t *iip; + + /* + * If the transaction pointer is NULL, just call the normal + * xfs_iget(). + */ + if (tp == NULL) + return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0); + + /* + * If we find the inode in core with this transaction + * pointer in its i_transp field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the inode to the caller. + * Assert that the inode is already locked in the mode requested + * by the caller. We cannot do lock promotions yet, so + * die if someone gets this wrong. + */ + if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) { + /* + * Make sure that the inode lock is held EXCL and + * that the io lock is never upgraded when the inode + * is already a part of the transaction. + */ + ASSERT(ip->i_itemp != NULL); + ASSERT(lock_flags & XFS_ILOCK_EXCL); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || + ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || + (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); + ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || + ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS))); + ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || + (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); + + if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { + ip->i_itemp->ili_iolock_recur++; + } + if (lock_flags & XFS_ILOCK_EXCL) { + ip->i_itemp->ili_ilock_recur++; + } + *ipp = ip; + return 0; + } + + ASSERT(lock_flags & XFS_ILOCK_EXCL); + error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0); + if (error) { + return error; + } + ASSERT(ip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + if (ip->i_itemp == NULL) + xfs_inode_item_init(ip, mp); + iip = ip->i_itemp; + (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip)); + + xfs_trans_inode_broot_debug(ip); + + /* + * If the IO lock has been acquired, mark that in + * the inode log item so we'll know to unlock it + * when the transaction commits. + */ + ASSERT(iip->ili_flags == 0); + if (lock_flags & XFS_IOLOCK_EXCL) { + iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; + } + + /* + * Initialize i_transp so we can find it with xfs_inode_incore() + * above. + */ + ip->i_transp = tp; + + *ipp = ip; + return 0; +} + +/* + * Add the locked inode to the transaction. + * The inode must be locked, and it cannot be associated with any + * transaction. The caller must specify the locks already held + * on the inode. + */ +void +xfs_trans_ijoin( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint lock_flags) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_transp == NULL); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(lock_flags & XFS_ILOCK_EXCL); + if (ip->i_itemp == NULL) + xfs_inode_item_init(ip, ip->i_mount); + iip = ip->i_itemp; + ASSERT(iip->ili_flags == 0); + ASSERT(iip->ili_ilock_recur == 0); + ASSERT(iip->ili_iolock_recur == 0); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); + + xfs_trans_inode_broot_debug(ip); + + /* + * If the IO lock is already held, mark that in the inode log item. + */ + if (lock_flags & XFS_IOLOCK_EXCL) { + iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; + } + + /* + * Initialize i_transp so we can find it with xfs_inode_incore() + * in xfs_trans_iget() above. + */ + ip->i_transp = tp; +} + + + +/* + * Mark the inode as not needing to be unlocked when the inode item's + * IOP_UNLOCK() routine is called. The inode must already be locked + * and associated with the given transaction. + */ +/*ARGSUSED*/ +void +xfs_trans_ihold( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + + ip->i_itemp->ili_flags |= XFS_ILI_HOLD; +} + +/* + * Cancel the previous inode hold request made on this inode + * for this transaction. + */ +/*ARGSUSED*/ +void +xfs_trans_ihold_release( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); + + ip->i_itemp->ili_flags &= ~XFS_ILI_HOLD; +} + + +/* + * This is called to mark the fields indicated in fieldmask as needing + * to be logged when the transaction is committed. The inode must + * already be associated with the given transaction. + * + * The values for fieldmask are defined in xfs_inode_item.h. We always + * log all of the core inode if any of it has changed, and we always log + * all of the inline data/extents/b-tree root if any of them has changed. + */ +void +xfs_trans_log_inode( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint flags) +{ + xfs_log_item_desc_t *lidp; + + ASSERT(ip->i_transp == tp); + ASSERT(ip->i_itemp != NULL); + ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + + lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); + ASSERT(lidp != NULL); + + tp->t_flags |= XFS_TRANS_DIRTY; + lidp->lid_flags |= XFS_LID_DIRTY; + + /* + * Always OR in the bits from the ili_last_fields field. + * This is to coordinate with the xfs_iflush() and xfs_iflush_done() + * routines in the eventual clearing of the ilf_fields bits. + * See the big comment in xfs_iflush() for an explanation of + * this coorination mechanism. + */ + flags |= ip->i_itemp->ili_last_fields; + ip->i_itemp->ili_format.ilf_fields |= flags; +} + +#ifdef XFS_TRANS_DEBUG +/* + * Keep track of the state of the inode btree root to make sure we + * log it properly. + */ +STATIC void +xfs_trans_inode_broot_debug( + xfs_inode_t *ip) +{ + xfs_inode_log_item_t *iip; + + ASSERT(ip->i_itemp != NULL); + iip = ip->i_itemp; + if (iip->ili_root_size != 0) { + ASSERT(iip->ili_orig_root != NULL); + kmem_free(iip->ili_orig_root, iip->ili_root_size); + iip->ili_root_size = 0; + iip->ili_orig_root = NULL; + } + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ASSERT((ip->i_df.if_broot != NULL) && + (ip->i_df.if_broot_bytes > 0)); + iip->ili_root_size = ip->i_df.if_broot_bytes; + iip->ili_orig_root = + (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); + memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), + iip->ili_root_size); + } +} +#endif diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c new file mode 100644 index 00000000000..1b8a756d80e --- /dev/null +++ b/fs/xfs/xfs_trans_item.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" + +STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, + int, int, xfs_lsn_t); + +/* + * This is called to add the given log item to the transaction's + * list of log items. It must find a free log item descriptor + * or allocate a new one and add the item to that descriptor. + * The function returns a pointer to item descriptor used to point + * to the new item. The log item will now point to its new descriptor + * with its li_desc field. + */ +xfs_log_item_desc_t * +xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_chunk_t *licp; + int i=0; + + /* + * If there are no free descriptors, allocate a new chunk + * of them and put it at the front of the chunk list. + */ + if (tp->t_items_free == 0) { + licp = (xfs_log_item_chunk_t*) + kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP); + ASSERT(licp != NULL); + /* + * Initialize the chunk, and then + * claim the first slot in the newly allocated chunk. + */ + XFS_LIC_INIT(licp); + XFS_LIC_CLAIM(licp, 0); + licp->lic_unused = 1; + XFS_LIC_INIT_SLOT(licp, 0); + lidp = XFS_LIC_SLOT(licp, 0); + + /* + * Link in the new chunk and update the free count. + */ + licp->lic_next = tp->t_items.lic_next; + tp->t_items.lic_next = licp; + tp->t_items_free = XFS_LIC_NUM_SLOTS - 1; + + /* + * Initialize the descriptor and the generic portion + * of the log item. + * + * Point the new slot at this item and return it. + * Also point the log item at its currently active + * descriptor and set the item's mount pointer. + */ + lidp->lid_item = lip; + lidp->lid_flags = 0; + lidp->lid_size = 0; + lip->li_desc = lidp; + lip->li_mountp = tp->t_mountp; + return (lidp); + } + + /* + * Find the free descriptor. It is somewhere in the chunklist + * of descriptors. + */ + licp = &tp->t_items; + while (licp != NULL) { + if (XFS_LIC_VACANCY(licp)) { + if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { + i = licp->lic_unused; + ASSERT(XFS_LIC_ISFREE(licp, i)); + break; + } + for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { + if (XFS_LIC_ISFREE(licp, i)) + break; + } + ASSERT(i <= XFS_LIC_MAX_SLOT); + break; + } + licp = licp->lic_next; + } + ASSERT(licp != NULL); + /* + * If we find a free descriptor, claim it, + * initialize it, and return it. + */ + XFS_LIC_CLAIM(licp, i); + if (licp->lic_unused <= i) { + licp->lic_unused = i + 1; + XFS_LIC_INIT_SLOT(licp, i); + } + lidp = XFS_LIC_SLOT(licp, i); + tp->t_items_free--; + lidp->lid_item = lip; + lidp->lid_flags = 0; + lidp->lid_size = 0; + lip->li_desc = lidp; + lip->li_mountp = tp->t_mountp; + return (lidp); +} + +/* + * Free the given descriptor. + * + * This requires setting the bit in the chunk's free mask corresponding + * to the given slot. + */ +void +xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) +{ + uint slot; + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t **licpp; + + slot = XFS_LIC_DESC_TO_SLOT(lidp); + licp = XFS_LIC_DESC_TO_CHUNK(lidp); + XFS_LIC_RELSE(licp, slot); + lidp->lid_item->li_desc = NULL; + tp->t_items_free++; + + /* + * If there are no more used items in the chunk and this is not + * the chunk embedded in the transaction structure, then free + * the chunk. First pull it from the chunk list and then + * free it back to the heap. We didn't bother with a doubly + * linked list here because the lists should be very short + * and this is not a performance path. It's better to save + * the memory of the extra pointer. + * + * Also decrement the transaction structure's count of free items + * by the number in a chunk since we are freeing an empty chunk. + */ + if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { + licpp = &(tp->t_items.lic_next); + while (*licpp != licp) { + ASSERT(*licpp != NULL); + licpp = &((*licpp)->lic_next); + } + *licpp = licp->lic_next; + kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + tp->t_items_free -= XFS_LIC_NUM_SLOTS; + } +} + +/* + * This is called to find the descriptor corresponding to the given + * log item. It returns a pointer to the descriptor. + * The log item MUST have a corresponding descriptor in the given + * transaction. This routine does not return NULL, it panics. + * + * The descriptor pointer is kept in the log item's li_desc field. + * Just return it. + */ +/*ARGSUSED*/ +xfs_log_item_desc_t * +xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip) +{ + ASSERT(lip->li_desc != NULL); + + return (lip->li_desc); +} + + +/* + * Return a pointer to the first descriptor in the chunk list. + * This does not return NULL if there are none, it panics. + * + * The first descriptor must be in either the first or second chunk. + * This is because the only chunk allowed to be empty is the first. + * All others are freed when they become empty. + * + * At some point this and xfs_trans_next_item() should be optimized + * to quickly look at the mask to determine if there is anything to + * look at. + */ +xfs_log_item_desc_t * +xfs_trans_first_item(xfs_trans_t *tp) +{ + xfs_log_item_chunk_t *licp; + int i; + + licp = &tp->t_items; + /* + * If it's not in the first chunk, skip to the second. + */ + if (XFS_LIC_ARE_ALL_FREE(licp)) { + licp = licp->lic_next; + } + + /* + * Return the first non-free descriptor in the chunk. + */ + ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + for (i = 0; i < licp->lic_unused; i++) { + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + return (XFS_LIC_SLOT(licp, i)); + } + cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); + return(NULL); +} + + +/* + * Given a descriptor, return the next descriptor in the chunk list. + * This returns NULL if there are no more used descriptors in the list. + * + * We do this by first locating the chunk in which the descriptor resides, + * and then scanning forward in the chunk and the list for the next + * used descriptor. + */ +/*ARGSUSED*/ +xfs_log_item_desc_t * +xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) +{ + xfs_log_item_chunk_t *licp; + int i; + + licp = XFS_LIC_DESC_TO_CHUNK(lidp); + + /* + * First search the rest of the chunk. The for loop keeps us + * from referencing things beyond the end of the chunk. + */ + for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + return (XFS_LIC_SLOT(licp, i)); + } + + /* + * Now search the next chunk. It must be there, because the + * next chunk would have been freed if it were empty. + * If there is no next chunk, return NULL. + */ + if (licp->lic_next == NULL) { + return (NULL); + } + + licp = licp->lic_next; + ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + for (i = 0; i < licp->lic_unused; i++) { + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + + return (XFS_LIC_SLOT(licp, i)); + } + ASSERT(0); + /* NOTREACHED */ + return NULL; /* keep gcc quite */ +} + +/* + * This is called to unlock all of the items of a transaction and to free + * all the descriptors of that transaction. + * + * It walks the list of descriptors and unlocks each item. It frees + * each chunk except that embedded in the transaction as it goes along. + */ +void +xfs_trans_free_items( + xfs_trans_t *tp, + int flags) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + int abort; + + abort = flags & XFS_TRANS_ABORT; + licp = &tp->t_items; + /* + * Special case the embedded chunk so we don't free it below. + */ + if (!XFS_LIC_ARE_ALL_FREE(licp)) { + (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + XFS_LIC_ALL_FREE(licp); + licp->lic_unused = 0; + } + licp = licp->lic_next; + + /* + * Unlock each item in each chunk and free the chunks. + */ + while (licp != NULL) { + ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + next_licp = licp->lic_next; + kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + licp = next_licp; + } + + /* + * Reset the transaction structure's free item count. + */ + tp->t_items_free = XFS_LIC_NUM_SLOTS; + tp->t_items.lic_next = NULL; +} + + + +/* + * This is called to unlock the items associated with a transaction. + * Items which were not logged should be freed. + * Those which were logged must still be tracked so they can be unpinned + * when the transaction commits. + */ +void +xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + xfs_log_item_chunk_t **licpp; + int freed; + + freed = 0; + licp = &tp->t_items; + + /* + * Special case the embedded chunk so we don't free. + */ + if (!XFS_LIC_ARE_ALL_FREE(licp)) { + freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); + } + licpp = &(tp->t_items.lic_next); + licp = licp->lic_next; + + /* + * Unlock each item in each chunk, free non-dirty descriptors, + * and free empty chunks. + */ + while (licp != NULL) { + ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); + freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); + next_licp = licp->lic_next; + if (XFS_LIC_ARE_ALL_FREE(licp)) { + *licpp = next_licp; + kmem_free(licp, sizeof(xfs_log_item_chunk_t)); + freed -= XFS_LIC_NUM_SLOTS; + } else { + licpp = &(licp->lic_next); + } + ASSERT(*licpp == next_licp); + licp = next_licp; + } + + /* + * Fix the free descriptor count in the transaction. + */ + tp->t_items_free += freed; +} + +/* + * Unlock each item pointed to by a descriptor in the given chunk. + * Stamp the commit lsn into each item if necessary. + * Free descriptors pointing to items which are not dirty if freeing_chunk + * is zero. If freeing_chunk is non-zero, then we need to unlock all + * items in the chunk. + * + * Return the number of descriptors freed. + */ +STATIC int +xfs_trans_unlock_chunk( + xfs_log_item_chunk_t *licp, + int freeing_chunk, + int abort, + xfs_lsn_t commit_lsn) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_t *lip; + int i; + int freed; + + freed = 0; + lidp = licp->lic_descs; + for (i = 0; i < licp->lic_unused; i++, lidp++) { + if (XFS_LIC_ISFREE(licp, i)) { + continue; + } + lip = lidp->lid_item; + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + if (abort) + lip->li_flags |= XFS_LI_ABORTED; + IOP_UNLOCK(lip); + + /* + * Free the descriptor if the item is not dirty + * within this transaction and the caller is not + * going to just free the entire thing regardless. + */ + if (!(freeing_chunk) && + (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { + XFS_LIC_RELSE(licp, i); + freed++; + } + } + + return (freed); +} + + +/* + * This is called to add the given busy item to the transaction's + * list of busy items. It must find a free busy item descriptor + * or allocate a new one and add the item to that descriptor. + * The function returns a pointer to busy descriptor used to point + * to the new busy entry. The log busy entry will now point to its new + * descriptor with its ???? field. + */ +xfs_log_busy_slot_t * +xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) +{ + xfs_log_busy_chunk_t *lbcp; + xfs_log_busy_slot_t *lbsp; + int i=0; + + /* + * If there are no free descriptors, allocate a new chunk + * of them and put it at the front of the chunk list. + */ + if (tp->t_busy_free == 0) { + lbcp = (xfs_log_busy_chunk_t*) + kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); + ASSERT(lbcp != NULL); + /* + * Initialize the chunk, and then + * claim the first slot in the newly allocated chunk. + */ + XFS_LBC_INIT(lbcp); + XFS_LBC_CLAIM(lbcp, 0); + lbcp->lbc_unused = 1; + lbsp = XFS_LBC_SLOT(lbcp, 0); + + /* + * Link in the new chunk and update the free count. + */ + lbcp->lbc_next = tp->t_busy.lbc_next; + tp->t_busy.lbc_next = lbcp; + tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; + + /* + * Initialize the descriptor and the generic portion + * of the log item. + * + * Point the new slot at this item and return it. + * Also point the log item at its currently active + * descriptor and set the item's mount pointer. + */ + lbsp->lbc_ag = ag; + lbsp->lbc_idx = idx; + return (lbsp); + } + + /* + * Find the free descriptor. It is somewhere in the chunklist + * of descriptors. + */ + lbcp = &tp->t_busy; + while (lbcp != NULL) { + if (XFS_LBC_VACANCY(lbcp)) { + if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { + i = lbcp->lbc_unused; + break; + } else { + /* out-of-order vacancy */ + printk("OOO vacancy lbcp 0x%p\n", lbcp); + ASSERT(0); + } + } + lbcp = lbcp->lbc_next; + } + ASSERT(lbcp != NULL); + /* + * If we find a free descriptor, claim it, + * initialize it, and return it. + */ + XFS_LBC_CLAIM(lbcp, i); + if (lbcp->lbc_unused <= i) { + lbcp->lbc_unused = i + 1; + } + lbsp = XFS_LBC_SLOT(lbcp, i); + tp->t_busy_free--; + lbsp->lbc_ag = ag; + lbsp->lbc_idx = idx; + return (lbsp); +} + + +/* + * xfs_trans_free_busy + * Free all of the busy lists from a transaction + */ +void +xfs_trans_free_busy(xfs_trans_t *tp) +{ + xfs_log_busy_chunk_t *lbcp; + xfs_log_busy_chunk_t *lbcq; + + lbcp = tp->t_busy.lbc_next; + while (lbcp != NULL) { + lbcq = lbcp->lbc_next; + kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t)); + lbcp = lbcq; + } + + XFS_LBC_INIT(&tp->t_busy); + tp->t_busy.lbc_unused = 0; +} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h new file mode 100644 index 00000000000..d4dae7d06af --- /dev/null +++ b/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; + +/* + * From xfs_trans_item.c + */ +struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *, + struct xfs_log_item *); +void xfs_trans_free_item(struct xfs_trans *, + struct xfs_log_item_desc *); +struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, + struct xfs_log_item *); +struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); +struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, + struct xfs_log_item_desc *); +void xfs_trans_free_items(struct xfs_trans *, int); +void xfs_trans_unlock_items(struct xfs_trans *, + xfs_lsn_t); +void xfs_trans_free_busy(xfs_trans_t *tp); +xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, + xfs_agnumber_t ag, + xfs_extlen_t idx); + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_update_ail(struct xfs_mount *, + struct xfs_log_item *, xfs_lsn_t, + unsigned long); +void xfs_trans_delete_ail(struct xfs_mount *, + struct xfs_log_item *, unsigned long); +struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *); +struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *, + struct xfs_log_item *, int *, int *); + + +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h new file mode 100644 index 00000000000..e91d173f4ed --- /dev/null +++ b/fs/xfs/xfs_trans_space.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_TRANS_SPACE_H__ +#define __XFS_TRANS_SPACE_H__ + +/* + * Components of space reservations. + */ +#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ + (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) +#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) +#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_DBS(mp,w) \ + (XFS_DA_NODE_MAXDEPTH + \ + ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0)) +#define XFS_DAENTER_BLOCKS(mp,w) \ + (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w)) +#define XFS_DAENTER_BMAP1B(mp,w) \ + XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w) +#define XFS_DAENTER_BMAPS(mp,w) \ + (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w)) +#define XFS_DAENTER_SPACE_RES(mp,w) \ + (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w)) +#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w) +#define XFS_DIRENTER_MAX_SPLIT(mp,nl) \ + (((mp)->m_sb.sb_blocksize == 512 && \ + XFS_DIR_IS_V1(mp) && \ + (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1) +#define XFS_DIRENTER_SPACE_RES(mp,nl) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \ + XFS_DIRENTER_MAX_SPLIT(mp,nl)) +#define XFS_DIRREMOVE_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) +#define XFS_IALLOC_SPACE_RES(mp) \ + (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1) + +/* + * Space reservation values for various transactions. + */ +#define XFS_ADDAFORK_SPACE_RES(mp) \ + ((mp)->m_dirblkfsbs + \ + (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))) +#define XFS_ATTRRM_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) +/* This macro is not used - see inline code in xfs_attr_set */ +#define XFS_ATTRSET_SPACE_RES(mp, v) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) +#define XFS_CREATE_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_DIOSTRAT_SPACE_RES(mp, v) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) +#define XFS_GROWFS_SPACE_RES(mp) \ + (2 * XFS_AG_MAXLEVELS(mp)) +#define XFS_GROWFSRT_SPACE_RES(mp,b) \ + ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) +#define XFS_LINK_SPACE_RES(mp,nl) \ + XFS_DIRENTER_SPACE_RES(mp,nl) +#define XFS_MKDIR_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_QM_DQALLOC_SPACE_RES(mp) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ + XFS_DQUOT_CLUSTER_SIZE_FSB) +#define XFS_QM_QINOCREATE_SPACE_RES(mp) \ + XFS_IALLOC_SPACE_RES(mp) +#define XFS_REMOVE_SPACE_RES(mp) \ + XFS_DIRREMOVE_SPACE_RES(mp) +#define XFS_RENAME_SPACE_RES(mp,nl) \ + (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) + +#endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h new file mode 100644 index 00000000000..04609d27ea5 --- /dev/null +++ b/fs/xfs/xfs_types.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_TYPES_H__ +#define __XFS_TYPES_H__ + +#ifdef __KERNEL__ + +/* + * POSIX Extensions + */ +typedef unsigned char uchar_t; +typedef unsigned short ushort_t; +typedef unsigned int uint_t; +typedef unsigned long ulong_t; + +/* + * Additional type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef enum { B_FALSE,B_TRUE } boolean_t; +typedef __int64_t prid_t; /* project ID */ +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* type */ +typedef __u64 xfs_ino_t; /* type */ +typedef __s64 xfs_daddr_t; /* type */ +typedef char * xfs_caddr_t; /* type */ +typedef __u32 xfs_dev_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + +#endif /* __KERNEL__ */ + +typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef __uint32_t xfs_agnumber_t; /* allocation group number */ +typedef __int32_t xfs_extnum_t; /* # of extents in a file */ +typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef __int64_t xfs_fsize_t; /* bytes in a file */ +typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ + +typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ + +typedef __int64_t xfs_lsn_t; /* log sequence number */ +typedef __int32_t xfs_tid_t; /* transaction identifier */ + +typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ + +typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ + +/* + * These types are 64 bits on disk but are either 32 or 64 bits in memory. + * Disk based types: + */ +typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */ +typedef __uint64_t xfs_dfiloff_t; /* block number in a file */ +typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */ + +/* + * Memory based types are conditional. + */ +#if XFS_BIG_BLKNOS +typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +#else +typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +#endif +typedef __uint64_t xfs_fileoff_t; /* block number in a file */ +typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ +typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + +typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */ + +/* + * Null values for the types. + */ +#define NULLDFSBNO ((xfs_dfsbno_t)-1) +#define NULLDRFSBNO ((xfs_drfsbno_t)-1) +#define NULLDRTBNO ((xfs_drtbno_t)-1) +#define NULLDFILOFF ((xfs_dfiloff_t)-1) + +#define NULLFSBLOCK ((xfs_fsblock_t)-1) +#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) +#define NULLRTBLOCK ((xfs_rtblock_t)-1) +#define NULLFILEOFF ((xfs_fileoff_t)-1) + +#define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLEXTNUM ((xfs_extnum_t)-1) + +#define NULLCOMMITLSN ((xfs_lsn_t)-1) + +/* + * Max values for extlen, extnum, aextnum. + */ +#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ +#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ +#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ + +/* + * MAXNAMELEN is the length (including the terminating null) of + * the longest permissible file (component) name. + */ +#define MAXNAMELEN 256 + +typedef struct xfs_dirent { /* data from readdir() */ + xfs_ino_t d_ino; /* inode number of entry */ + xfs_off_t d_off; /* offset of disk directory entry */ + unsigned short d_reclen; /* length of this record */ + char d_name[1]; /* name of file */ +} xfs_dirent_t; + +#define DIRENTBASESIZE (((xfs_dirent_t *)0)->d_name - (char *)0) +#define DIRENTSIZE(namelen) \ + ((DIRENTBASESIZE + (namelen) + \ + sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1)) + +typedef enum { + XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi +} xfs_lookup_t; + +typedef enum { + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, + XFS_BTNUM_MAX +} xfs_btnum_t; + +#endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c new file mode 100644 index 00000000000..816b945fa0e --- /dev/null +++ b/fs/xfs/xfs_utils.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_rw.h" +#include "xfs_itable.h" +#include "xfs_utils.h" + +/* + * xfs_get_dir_entry is used to get a reference to an inode given + * its parent directory inode and the name of the file. It does + * not lock the child inode, and it unlocks the directory before + * returning. The directory's generation number is returned for + * use by a later call to xfs_lock_dir_and_entry. + */ +int +xfs_get_dir_entry( + vname_t *dentry, + xfs_inode_t **ipp) +{ + vnode_t *vp; + bhv_desc_t *bdp; + + vp = VNAME_TO_VNODE(dentry); + bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); + if (!bdp) { + *ipp = NULL; + return XFS_ERROR(ENOENT); + } + VN_HOLD(vp); + *ipp = XFS_BHVTOI(bdp); + return 0; +} + +int +xfs_dir_lookup_int( + bhv_desc_t *dir_bdp, + uint lock_mode, + vname_t *dentry, + xfs_ino_t *inum, + xfs_inode_t **ipp) +{ + vnode_t *dir_vp; + xfs_inode_t *dp; + int error; + + dir_vp = BHV_TO_VNODE(dir_bdp); + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + dp = XFS_BHVTOI(dir_bdp); + + error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp, + VNAME(dentry), VNAMELEN(dentry), inum); + if (!error) { + /* + * Unlock the directory. We do this because we can't + * hold the directory lock while doing the vn_get() + * in xfs_iget(). Doing so could cause us to hold + * a lock while waiting for the inode to finish + * being inactive while it's waiting for a log + * reservation in the inactive routine. + */ + xfs_iunlock(dp, lock_mode); + error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0); + xfs_ilock(dp, lock_mode); + + if (error) { + *ipp = NULL; + } else if ((*ipp)->i_d.di_mode == 0) { + /* + * The inode has been freed. Something is + * wrong so just get out of here. + */ + xfs_iunlock(dp, lock_mode); + xfs_iput_new(*ipp, 0); + *ipp = NULL; + xfs_ilock(dp, lock_mode); + error = XFS_ERROR(ENOENT); + } + } + return error; +} + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + mode_t mode, + nlink_t nlink, + xfs_dev_t rdev, + cred_t *credp, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) + +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + boolean_t call_again = B_FALSE; + int code; + uint log_res; + uint log_count; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, + &ialloc_context, &call_again, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!call_again && (ip == NULL)) { + *ipp = NULL; + return XFS_ERROR(ENOSPC); + } + + /* + * If call_again is set, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (call_again) { + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. + */ + log_res = xfs_trans_get_log_res(tp); + log_count = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0, NULL); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + code = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, + okalloc, &ialloc_context, &call_again, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT ((!call_again) && (ip != NULL)); + + } else { + if (committed != NULL) { + *committed = 0; + } + } + + *ipp = ip; + *tpp = tp; + + return 0; +} + +/* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * This gets called when the inode's version needs to be changed from 1 to 2. + * Currently this happens when the nlink field overflows the old 16-bit value + * or when chproj is called to change the project for the first time. + * As a side effect the superblock version will also get rev'd + * to contain the NLINK bit. + */ +void +xfs_bump_ino_vers2( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + unsigned long s; + + ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); + + ip->i_d.di_version = XFS_DINODE_VERSION_2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + mp = tp->t_mountp; + if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { + s = XFS_SB_LOCK(mp); + if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { + XFS_SB_VERSION_ADDNLINK(&mp->m_sb); + XFS_SB_UNLOCK(mp, s); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM); + } else { + XFS_SB_UNLOCK(mp, s); + } + } + /* Caller must log the inode */ +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + if (ip->i_d.di_nlink >= XFS_MAXLINK) + return XFS_ERROR(EMLINK); + xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_nlink > 0); + ip->i_d.di_nlink++; + if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && + (ip->i_d.di_nlink > XFS_MAXLINK_1)) { + /* + * The inode has increased its number of links beyond + * what can fit in an old format inode. It now needs + * to be converted to a version 2 inode with a 32 bit + * link count. If this is the first inode in the file + * system to do this, then we need to bump the superblock + * version number as well. + */ + xfs_bump_ino_vers2(tp, ip); + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +/* + * Try to truncate the given file to 0 length. Currently called + * only out of xfs_remove when it has to truncate a file to free + * up space for the remove to proceed. + */ +int +xfs_truncate_file( + xfs_mount_t *mp, + xfs_inode_t *ip) +{ + xfs_trans_t *tp; + int error; + +#ifdef QUOTADEBUG + /* + * This is called to truncate the quotainodes too. + */ + if (XFS_IS_UQUOTA_ON(mp)) { + if (ip->i_ino != mp->m_sb.sb_uquotino) + ASSERT(ip->i_udquot); + } + if (XFS_IS_GQUOTA_ON(mp)) { + if (ip->i_ino != mp->m_sb.sb_gquotino) + ASSERT(ip->i_gdquot); + } +#endif + /* + * Make the call to xfs_itruncate_start before starting the + * transaction, because we cannot make the call while we're + * in a transaction. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + /* + * Follow the normal truncate locking protocol. Since we + * hold the inode in the transaction, we know that it's number + * of references will stay constant. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(tp, ip); + /* + * Signal a sync xaction. The only case where that isn't + * the case is if we're truncating an already unlinked file + * on a wsync fs. In that case, we know the blocks can't + * reappear in the file because the links to file are + * permanently toast. Currently, we're always going to + * want a sync transaction because this code is being + * called from places where nlink is guaranteed to be 1 + * but I'm leaving the tests in to protect against future + * changes -- rcc. + */ + error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0, + XFS_DATA_FORK, + ((ip->i_d.di_nlink != 0 || + !(mp->m_flags & XFS_MOUNT_WSYNC)) + ? 1 : 0)); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + } else { + xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, + NULL); + } + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + return error; +} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h new file mode 100644 index 00000000000..e1ed6a58800 --- /dev/null +++ b/fs/xfs/xfs_utils.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_UTILS_H__ +#define __XFS_UTILS_H__ + +#define IRELE(ip) VN_RELE(XFS_ITOV(ip)) +#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) +#define ITRACE(ip) vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \ + (inst_t *)__return_address) + +extern int xfs_rename (bhv_desc_t *, vname_t *, vnode_t *, vname_t *, cred_t *); +extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **); +extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *, + xfs_inode_t **); +extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *); +extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, nlink_t, + xfs_dev_t, cred_t *, prid_t, int, + xfs_inode_t **, int *); +extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *); +extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *); +extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *); + +#endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c new file mode 100644 index 00000000000..00aae9c6a90 --- /dev/null +++ b/fs/xfs/xfs_vfsops.c @@ -0,0 +1,1941 @@ +/* + * XFS filesystem operations. + * + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_rw.h" +#include "xfs_refcache.h" +#include "xfs_buf_item.h" +#include "xfs_extfree_item.h" +#include "xfs_quota.h" +#include "xfs_dir2_trace.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_clnt.h" +#include "xfs_log_priv.h" + +STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); + +int +xfs_init(void) +{ + extern kmem_zone_t *xfs_bmap_free_item_zone; + extern kmem_zone_t *xfs_btree_cur_zone; + extern kmem_zone_t *xfs_trans_zone; + extern kmem_zone_t *xfs_buf_item_zone; + extern kmem_zone_t *xfs_dabuf_zone; +#ifdef XFS_DABUF_DEBUG + extern lock_t xfs_dabuf_global_lock; + spinlock_init(&xfs_dabuf_global_lock, "xfsda"); +#endif + + /* + * Initialize all of the zone allocators we use. + */ + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + xfs_inode_zone = kmem_zone_init(sizeof(xfs_inode_t), "xfs_inode"); + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + xfs_da_state_zone = + kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state"); + xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = + kmem_zone_init((sizeof(xfs_buf_log_item_t) + + (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + NBWORD) * sizeof(int))), + "xfs_buf_item"); + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))), + "xfs_efd_item"); + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))), + "xfs_efi_item"); + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + xfs_ili_zone = kmem_zone_init(sizeof(xfs_inode_log_item_t), "xfs_ili"); + xfs_chashlist_zone = kmem_zone_init(sizeof(xfs_chashlist_t), + "xfs_chashlist"); + xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); + + /* + * Allocate global trace buffers. + */ +#ifdef XFS_ALLOC_TRACE + xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_BMAP_TRACE + xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_BMBT_TRACE + xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_DIR_TRACE + xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_ATTR_TRACE + xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP); +#endif +#ifdef XFS_DIR2_TRACE + xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP); +#endif + + xfs_dir_startup(); + +#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) + xfs_error_test_init(); +#endif /* DEBUG || INDUCE_IO_ERROR */ + + xfs_init_procfs(); + xfs_sysctl_register(); + return 0; +} + +void +xfs_cleanup(void) +{ + extern kmem_zone_t *xfs_bmap_free_item_zone; + extern kmem_zone_t *xfs_btree_cur_zone; + extern kmem_zone_t *xfs_inode_zone; + extern kmem_zone_t *xfs_trans_zone; + extern kmem_zone_t *xfs_da_state_zone; + extern kmem_zone_t *xfs_dabuf_zone; + extern kmem_zone_t *xfs_efd_zone; + extern kmem_zone_t *xfs_efi_zone; + extern kmem_zone_t *xfs_buf_item_zone; + extern kmem_zone_t *xfs_chashlist_zone; + + xfs_cleanup_procfs(); + xfs_sysctl_unregister(); + xfs_refcache_destroy(); + xfs_acl_zone_destroy(xfs_acl_zone); + +#ifdef XFS_DIR2_TRACE + ktrace_free(xfs_dir2_trace_buf); +#endif +#ifdef XFS_ATTR_TRACE + ktrace_free(xfs_attr_trace_buf); +#endif +#ifdef XFS_DIR_TRACE + ktrace_free(xfs_dir_trace_buf); +#endif +#ifdef XFS_BMBT_TRACE + ktrace_free(xfs_bmbt_trace_buf); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(xfs_bmap_trace_buf); +#endif +#ifdef XFS_ALLOC_TRACE + ktrace_free(xfs_alloc_trace_buf); +#endif + + kmem_cache_destroy(xfs_bmap_free_item_zone); + kmem_cache_destroy(xfs_btree_cur_zone); + kmem_cache_destroy(xfs_inode_zone); + kmem_cache_destroy(xfs_trans_zone); + kmem_cache_destroy(xfs_da_state_zone); + kmem_cache_destroy(xfs_dabuf_zone); + kmem_cache_destroy(xfs_buf_item_zone); + kmem_cache_destroy(xfs_efd_zone); + kmem_cache_destroy(xfs_efi_zone); + kmem_cache_destroy(xfs_ifork_zone); + kmem_cache_destroy(xfs_ili_zone); + kmem_cache_destroy(xfs_chashlist_zone); +} + +/* + * xfs_start_flags + * + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + */ +STATIC int +xfs_start_flags( + struct vfs *vfs, + struct xfs_mount_args *ap, + struct xfs_mount *mp) +{ + /* Values are in BBs */ + if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + mp->m_dalign = ap->sunit; + mp->m_swidth = ap->swidth; + } + + if (ap->logbufs != -1 && +#if defined(DEBUG) || defined(XLOG_NOLOG) + ap->logbufs != 0 && +#endif + (ap->logbufs < XLOG_MIN_ICLOGS || + ap->logbufs > XLOG_MAX_ICLOGS)) { + cmn_err(CE_WARN, + "XFS: invalid logbufs value: %d [not %d-%d]", + ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + mp->m_logbufs = ap->logbufs; + if (ap->logbufsize != -1 && + ap->logbufsize != 16 * 1024 && + ap->logbufsize != 32 * 1024 && + ap->logbufsize != 64 * 1024 && + ap->logbufsize != 128 * 1024 && + ap->logbufsize != 256 * 1024) { + cmn_err(CE_WARN, + "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + ap->logbufsize); + return XFS_ERROR(EINVAL); + } + mp->m_ihsize = ap->ihashsize; + mp->m_logbsize = ap->logbufsize; + mp->m_fsname_len = strlen(ap->fsname) + 1; + mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); + strcpy(mp->m_fsname, ap->fsname); + + if (ap->flags & XFSMNT_WSYNC) + mp->m_flags |= XFS_MOUNT_WSYNC; +#if XFS_BIG_INUMS + if (ap->flags & XFSMNT_INO64) { + mp->m_flags |= XFS_MOUNT_INO64; + mp->m_inoadd = XFS_INO64_OFFSET; + } +#endif + if (ap->flags & XFSMNT_NOATIME) + mp->m_flags |= XFS_MOUNT_NOATIME; + + if (ap->flags & XFSMNT_RETERR) + mp->m_flags |= XFS_MOUNT_RETERR; + + if (ap->flags & XFSMNT_NOALIGN) + mp->m_flags |= XFS_MOUNT_NOALIGN; + + if (ap->flags & XFSMNT_SWALLOC) + mp->m_flags |= XFS_MOUNT_SWALLOC; + + if (ap->flags & XFSMNT_OSYNCISOSYNC) + mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; + + if (ap->flags & XFSMNT_32BITINODES) + mp->m_flags |= (XFS_MOUNT_32BITINODES | XFS_MOUNT_32BITINOOPT); + + if (ap->flags & XFSMNT_IOSIZE) { + if (ap->iosizelog > XFS_MAX_IO_LOG || + ap->iosizelog < XFS_MIN_IO_LOG) { + cmn_err(CE_WARN, + "XFS: invalid log iosize: %d [not %d-%d]", + ap->iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; + } + + if (ap->flags & XFSMNT_IHASHSIZE) + mp->m_flags |= XFS_MOUNT_IHASHSIZE; + + if (ap->flags & XFSMNT_IDELETE) + mp->m_flags |= XFS_MOUNT_IDELETE; + + if (ap->flags & XFSMNT_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + + /* + * no recovery flag requires a read-only mount + */ + if (ap->flags & XFSMNT_NORECOVERY) { + if (!(vfs->vfs_flag & VFS_RDONLY)) { + cmn_err(CE_WARN, + "XFS: tried to mount a FS read-write without recovery!"); + return XFS_ERROR(EINVAL); + } + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } + + if (ap->flags & XFSMNT_NOUUID) + mp->m_flags |= XFS_MOUNT_NOUUID; + if (ap->flags & XFSMNT_NOLOGFLUSH) + mp->m_flags |= XFS_MOUNT_NOLOGFLUSH; + + return 0; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct vfs *vfs, + struct xfs_mount_args *ap, + struct xfs_mount *mp) +{ + int ronly = (vfs->vfs_flag & VFS_RDONLY); + + /* Fail a mount where the logbuf is smaller then the log stripe */ + if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { + if ((ap->logbufsize == -1) && + (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (ap->logbufsize < mp->m_sb.sb_logsunit) { + cmn_err(CE_WARN, + "XFS: logbuf size must be greater than or equal to log stripe size"); + return XFS_ERROR(EINVAL); + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { + cmn_err(CE_WARN, + "XFS: logbuf size for version 1 logs must be 16K or 32K"); + return XFS_ERROR(EINVAL); + } + } + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + cmn_err(CE_WARN, + "XFS: cannot mount a read-only filesystem as read-write"); + return XFS_ERROR(EROFS); + } + + /* + * disallow mount attempts with (IRIX) project quota enabled + */ + if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT)) { + cmn_err(CE_WARN, + "XFS: cannot mount a filesystem with IRIX project quota enabled"); + return XFS_ERROR(ENOSYS); + } + + /* + * check for shared mount. + */ + if (ap->flags & XFSMNT_SHARED) { + if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb)) + return XFS_ERROR(EINVAL); + + /* + * For IRIX 6.5, shared mounts must have the shared + * version bit set, have the persistent readonly + * field set, must be version 0 and can only be mounted + * read-only. + */ + if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || + (mp->m_sb.sb_shared_vn != 0)) + return XFS_ERROR(EINVAL); + + mp->m_flags |= XFS_MOUNT_SHARED; + + /* + * Shared XFS V0 can't deal with DMI. Return EINVAL. + */ + if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) + return XFS_ERROR(EINVAL); + } + + return 0; +} + +/* + * xfs_mount + * + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev. + */ +STATIC int +xfs_mount( + struct bhv_desc *bhvp, + struct xfs_mount_args *args, + cred_t *credp) +{ + struct vfs *vfsp = bhvtovfs(bhvp); + struct bhv_desc *p; + struct xfs_mount *mp = XFS_BHVTOM(bhvp); + struct block_device *ddev, *logdev, *rtdev; + int flags = 0, error; + + ddev = vfsp->vfs_super->s_bdev; + logdev = rtdev = NULL; + + /* + * Setup xfs_mount function vectors from available behaviors + */ + p = vfs_bhv_lookup(vfsp, VFS_POSITION_DM); + mp->m_dm_ops = p ? *(xfs_dmops_t *) vfs_bhv_custom(p) : xfs_dmcore_stub; + p = vfs_bhv_lookup(vfsp, VFS_POSITION_QM); + mp->m_qm_ops = p ? *(xfs_qmops_t *) vfs_bhv_custom(p) : xfs_qmcore_stub; + p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO); + mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs; + + /* + * Open real time and log devices - order is important. + */ + if (args->logname[0]) { + error = xfs_blkdev_get(mp, args->logname, &logdev); + if (error) + return error; + } + if (args->rtname[0]) { + error = xfs_blkdev_get(mp, args->rtname, &rtdev); + if (error) { + xfs_blkdev_put(logdev); + return error; + } + + if (rtdev == ddev || rtdev == logdev) { + cmn_err(CE_WARN, + "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); + xfs_blkdev_put(logdev); + xfs_blkdev_put(rtdev); + return EINVAL; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + if (!mp->m_ddev_targp) { + xfs_blkdev_put(logdev); + xfs_blkdev_put(rtdev); + return error; + } + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + if (!mp->m_rtdev_targp) + goto error0; + } + mp->m_logdev_targp = (logdev && logdev != ddev) ? + xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp; + if (!mp->m_logdev_targp) + goto error0; + + /* + * Setup flags based on mount(2) options and then the superblock + */ + error = xfs_start_flags(vfsp, args, mp); + if (error) + goto error1; + error = xfs_readsb(mp); + if (error) + goto error1; + error = xfs_finish_flags(vfsp, args, mp); + if (error) + goto error2; + + /* + * Setup xfs_mount buffer target pointers based on superblock + */ + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (!error && logdev && logdev != ddev) { + unsigned int log_sector_size = BBSIZE; + + if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + mp->m_sb.sb_blocksize, + log_sector_size); + } + if (!error && rtdev) + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_blocksize, + mp->m_sb.sb_sectsize); + if (error) + goto error2; + + error = XFS_IOINIT(vfsp, args, flags); + if (!error) + return 0; +error2: + if (mp->m_sb_bp) + xfs_freesb(mp); +error1: + xfs_binval(mp->m_ddev_targp); + if (logdev && logdev != ddev) + xfs_binval(mp->m_logdev_targp); + if (rtdev) + xfs_binval(mp->m_rtdev_targp); +error0: + xfs_unmountfs_close(mp, credp); + return error; +} + +STATIC int +xfs_unmount( + bhv_desc_t *bdp, + int flags, + cred_t *credp) +{ + struct vfs *vfsp = bhvtovfs(bdp); + xfs_mount_t *mp = XFS_BHVTOM(bdp); + xfs_inode_t *rip; + vnode_t *rvp; + int unmount_event_wanted = 0; + int unmount_event_flags = 0; + int xfs_unmountfs_needed = 0; + int error; + + rip = mp->m_rootip; + rvp = XFS_ITOV(rip); + + if (vfsp->vfs_flag & VFS_DMI) { + error = XFS_SEND_PREUNMOUNT(mp, vfsp, + rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL, + NULL, NULL, 0, 0, + (mp->m_dmevmask & (1<m_dmevmask & (1<m_ddev_targp); + error = xfs_unmount_flush(mp, 0); + if (error) + goto out; + + ASSERT(vn_count(rvp) == 1); + + /* + * Drop the reference count + */ + VN_RELE(rvp); + + /* + * If we're forcing a shutdown, typically because of a media error, + * we want to make sure we invalidate dirty pages that belong to + * referenced vnodes as well. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + error = xfs_sync(&mp->m_bhv, + (SYNC_WAIT | SYNC_CLOSE), credp); + ASSERT(error != EFSCORRUPTED); + } + xfs_unmountfs_needed = 1; + +out: + /* Send DMAPI event, if required. + * Then do xfs_unmountfs() if needed. + * Then return error (or zero). + */ + if (unmount_event_wanted) { + /* Note: mp structure must still exist for + * XFS_SEND_UNMOUNT() call. + */ + XFS_SEND_UNMOUNT(mp, vfsp, error == 0 ? rvp : NULL, + DM_RIGHT_NULL, 0, error, unmount_event_flags); + } + if (xfs_unmountfs_needed) { + /* + * Call common unmount function to flush to disk + * and free the super block buffer & mount structures. + */ + xfs_unmountfs(mp, credp); + } + + return XFS_ERROR(error); +} + +#define REMOUNT_READONLY_FLAGS (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT) + +STATIC int +xfs_mntupdate( + bhv_desc_t *bdp, + int *flags, + struct xfs_mount_args *args) +{ + struct vfs *vfsp = bhvtovfs(bdp); + xfs_mount_t *mp = XFS_BHVTOM(bdp); + int pincount, error; + int count = 0; + + if (args->flags & XFSMNT_NOATIME) + mp->m_flags |= XFS_MOUNT_NOATIME; + else + mp->m_flags &= ~XFS_MOUNT_NOATIME; + + if (!(vfsp->vfs_flag & VFS_RDONLY)) { + VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error); + } + + if (*flags & MS_RDONLY) { + xfs_refcache_purge_mp(mp); + xfs_flush_buftarg(mp->m_ddev_targp, 0); + xfs_finish_reclaim_all(mp, 0); + + /* This loop must run at least twice. + * The first instance of the loop will flush + * most meta data but that will generate more + * meta data (typically directory updates). + * Which then must be flushed and logged before + * we can write the unmount record. + */ + do { + VFS_SYNC(vfsp, REMOUNT_READONLY_FLAGS, NULL, error); + pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (!pincount) { + delay(50); + count++; + } + } while (count < 2); + + /* Ok now write out an unmount record */ + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); + vfsp->vfs_flag |= VFS_RDONLY; + } else { + vfsp->vfs_flag &= ~VFS_RDONLY; + } + + return 0; +} + +/* + * xfs_unmount_flush implements a set of flush operation on special + * inodes, which are needed as a separate set of operations so that + * they can be called as part of relocation process. + */ +int +xfs_unmount_flush( + xfs_mount_t *mp, /* Mount structure we are getting + rid of. */ + int relocation) /* Called from vfs relocation. */ +{ + xfs_inode_t *rip = mp->m_rootip; + xfs_inode_t *rbmip; + xfs_inode_t *rsumip = NULL; + vnode_t *rvp = XFS_ITOV(rip); + int error; + + xfs_ilock(rip, XFS_ILOCK_EXCL); + xfs_iflock(rip); + + /* + * Flush out the real time inodes. + */ + if ((rbmip = mp->m_rbmip) != NULL) { + xfs_ilock(rbmip, XFS_ILOCK_EXCL); + xfs_iflock(rbmip); + error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC); + xfs_iunlock(rbmip, XFS_ILOCK_EXCL); + + if (error == EFSCORRUPTED) + goto fscorrupt_out; + + ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); + + rsumip = mp->m_rsumip; + xfs_ilock(rsumip, XFS_ILOCK_EXCL); + xfs_iflock(rsumip); + error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC); + xfs_iunlock(rsumip, XFS_ILOCK_EXCL); + + if (error == EFSCORRUPTED) + goto fscorrupt_out; + + ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); + } + + /* + * Synchronously flush root inode to disk + */ + error = xfs_iflush(rip, XFS_IFLUSH_SYNC); + if (error == EFSCORRUPTED) + goto fscorrupt_out2; + + if (vn_count(rvp) != 1 && !relocation) { + xfs_iunlock(rip, XFS_ILOCK_EXCL); + return XFS_ERROR(EBUSY); + } + + /* + * Release dquot that rootinode, rbmino and rsumino might be holding, + * flush and purge the quota inodes. + */ + error = XFS_QM_UNMOUNT(mp); + if (error == EFSCORRUPTED) + goto fscorrupt_out2; + + if (rbmip) { + VN_RELE(XFS_ITOV(rbmip)); + VN_RELE(XFS_ITOV(rsumip)); + } + + xfs_iunlock(rip, XFS_ILOCK_EXCL); + return 0; + +fscorrupt_out: + xfs_ifunlock(rip); + +fscorrupt_out2: + xfs_iunlock(rip, XFS_ILOCK_EXCL); + + return XFS_ERROR(EFSCORRUPTED); +} + +/* + * xfs_root extracts the root vnode from a vfs. + * + * vfsp -- the vfs struct for the desired file system + * vpp -- address of the caller's vnode pointer which should be + * set to the desired fs root vnode + */ +STATIC int +xfs_root( + bhv_desc_t *bdp, + vnode_t **vpp) +{ + vnode_t *vp; + + vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip); + VN_HOLD(vp); + *vpp = vp; + return 0; +} + +/* + * xfs_statvfs + * + * Fill in the statvfs structure for the given file system. We use + * the superblock lock in the mount structure to ensure a consistent + * snapshot of the counters returned. + */ +STATIC int +xfs_statvfs( + bhv_desc_t *bdp, + xfs_statfs_t *statp, + vnode_t *vp) +{ + __uint64_t fakeinos; + xfs_extlen_t lsize; + xfs_mount_t *mp; + xfs_sb_t *sbp; + unsigned long s; + u64 id; + + mp = XFS_BHVTOM(bdp); + sbp = &(mp->m_sb); + + statp->f_type = XFS_SB_MAGIC; + + s = XFS_SB_LOCK(mp); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks; + fakeinos = statp->f_bfree << sbp->sb_inopblog; +#if XFS_BIG_INUMS + fakeinos += mp->m_inoadd; +#endif + statp->f_files = + MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) +#if XFS_BIG_INUMS + if (!mp->m_inoadd) +#endif + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + XFS_SB_UNLOCK(mp, s); + + id = huge_encode_dev(mp->m_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + statp->f_namelen = MAXNAMELEN - 1; + + return 0; +} + + +/* + * xfs_sync flushes any pending I/O to file system vfsp. + * + * This routine is called by vfs_sync() to make sure that things make it + * out to disk eventually, on sync() system calls to flush out everything, + * and when the file system is unmounted. For the vfs_sync() case, all + * we really need to do is sync out the log to make all of our meta-data + * updates permanent (except for timestamps). For calls from pflushd(), + * dirty pages are kept moving by calling pdflush() on the inodes + * containing them. We also flush the inodes that we can lock without + * sleeping and the superblock if we can lock it without sleeping from + * vfs_sync() so that items at the tail of the log are always moving out. + * + * Flags: + * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want + * to sleep if we can help it. All we really need + * to do is ensure that the log is synced at least + * periodically. We also push the inodes and + * superblock if we can lock them without sleeping + * and they are not pinned. + * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not + * set, then we really want to lock each inode and flush + * it. + * SYNC_WAIT - All the flushes that take place in this call should + * be synchronous. + * SYNC_DELWRI - This tells us to push dirty pages associated with + * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to + * determine if they should be flushed sync, async, or + * delwri. + * SYNC_CLOSE - This flag is passed when the system is being + * unmounted. We should sync and invalidate everthing. + * SYNC_FSDATA - This indicates that the caller would like to make + * sure the superblock is safe on disk. We can ensure + * this by simply makeing sure the log gets flushed + * if SYNC_BDFLUSH is set, and by actually writing it + * out otherwise. + * + */ +/*ARGSUSED*/ +STATIC int +xfs_sync( + bhv_desc_t *bdp, + int flags, + cred_t *credp) +{ + xfs_mount_t *mp; + + mp = XFS_BHVTOM(bdp); + return (xfs_syncsub(mp, flags, 0, NULL)); +} + +/* + * xfs sync routine for internal use + * + * This routine supports all of the flags defined for the generic VFS_SYNC + * interface as explained above under xfs_sync. In the interests of not + * changing interfaces within the 6.5 family, additional internallly- + * required functions are specified within a separate xflags parameter, + * only available by calling this routine. + * + */ +STATIC int +xfs_sync_inodes( + xfs_mount_t *mp, + int flags, + int xflags, + int *bypassed) +{ + xfs_inode_t *ip = NULL; + xfs_inode_t *ip_next; + xfs_buf_t *bp; + vnode_t *vp = NULL; + vmap_t vmap; + int error; + int last_error; + uint64_t fflag; + uint lock_flags; + uint base_lock_flags; + boolean_t mount_locked; + boolean_t vnode_refed; + int preempt; + xfs_dinode_t *dip; + xfs_iptr_t *ipointer; +#ifdef DEBUG + boolean_t ipointer_in = B_FALSE; + +#define IPOINTER_SET ipointer_in = B_TRUE +#define IPOINTER_CLR ipointer_in = B_FALSE +#else +#define IPOINTER_SET +#define IPOINTER_CLR +#endif + + +/* Insert a marker record into the inode list after inode ip. The list + * must be locked when this is called. After the call the list will no + * longer be locked. + */ +#define IPOINTER_INSERT(ip, mp) { \ + ASSERT(ipointer_in == B_FALSE); \ + ipointer->ip_mnext = ip->i_mnext; \ + ipointer->ip_mprev = ip; \ + ip->i_mnext = (xfs_inode_t *)ipointer; \ + ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \ + preempt = 0; \ + XFS_MOUNT_IUNLOCK(mp); \ + mount_locked = B_FALSE; \ + IPOINTER_SET; \ + } + +/* Remove the marker from the inode list. If the marker was the only item + * in the list then there are no remaining inodes and we should zero out + * the whole list. If we are the current head of the list then move the head + * past us. + */ +#define IPOINTER_REMOVE(ip, mp) { \ + ASSERT(ipointer_in == B_TRUE); \ + if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \ + ip = ipointer->ip_mnext; \ + ip->i_mprev = ipointer->ip_mprev; \ + ipointer->ip_mprev->i_mnext = ip; \ + if (mp->m_inodes == (xfs_inode_t *)ipointer) { \ + mp->m_inodes = ip; \ + } \ + } else { \ + ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \ + mp->m_inodes = NULL; \ + ip = NULL; \ + } \ + IPOINTER_CLR; \ + } + +#define XFS_PREEMPT_MASK 0x7f + + if (bypassed) + *bypassed = 0; + if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + return 0; + error = 0; + last_error = 0; + preempt = 0; + + /* Allocate a reference marker */ + ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP); + + fflag = XFS_B_ASYNC; /* default is don't wait */ + if (flags & SYNC_BDFLUSH) + fflag = XFS_B_DELWRI; + if (flags & SYNC_WAIT) + fflag = 0; /* synchronous overrides all */ + + base_lock_flags = XFS_ILOCK_SHARED; + if (flags & (SYNC_DELWRI | SYNC_CLOSE)) { + /* + * We need the I/O lock if we're going to call any of + * the flush/inval routines. + */ + base_lock_flags |= XFS_IOLOCK_SHARED; + } + + XFS_MOUNT_ILOCK(mp); + + ip = mp->m_inodes; + + mount_locked = B_TRUE; + vnode_refed = B_FALSE; + + IPOINTER_CLR; + + do { + ASSERT(ipointer_in == B_FALSE); + ASSERT(vnode_refed == B_FALSE); + + lock_flags = base_lock_flags; + + /* + * There were no inodes in the list, just break out + * of the loop. + */ + if (ip == NULL) { + break; + } + + /* + * We found another sync thread marker - skip it + */ + if (ip->i_mount == NULL) { + ip = ip->i_mnext; + continue; + } + + vp = XFS_ITOV_NULL(ip); + + /* + * If the vnode is gone then this is being torn down, + * call reclaim if it is flushed, else let regular flush + * code deal with it later in the loop. + */ + + if (vp == NULL) { + /* Skip ones already in reclaim */ + if (ip->i_flags & XFS_IRECLAIM) { + ip = ip->i_mnext; + continue; + } + if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { + ip = ip->i_mnext; + } else if ((xfs_ipincount(ip) == 0) && + xfs_iflock_nowait(ip)) { + IPOINTER_INSERT(ip, mp); + + xfs_finish_reclaim(ip, 1, + XFS_IFLUSH_DELWRI_ELSE_ASYNC); + + XFS_MOUNT_ILOCK(mp); + mount_locked = B_TRUE; + IPOINTER_REMOVE(ip, mp); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ip = ip->i_mnext; + } + continue; + } + + if (VN_BAD(vp)) { + ip = ip->i_mnext; + continue; + } + + if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { + XFS_MOUNT_IUNLOCK(mp); + kmem_free(ipointer, sizeof(xfs_iptr_t)); + return 0; + } + + /* + * If this is just vfs_sync() or pflushd() calling + * then we can skip inodes for which it looks like + * there is nothing to do. Since we don't have the + * inode locked this is racey, but these are periodic + * calls so it doesn't matter. For the others we want + * to know for sure, so we at least try to lock them. + */ + if (flags & SYNC_BDFLUSH) { + if (((ip->i_itemp == NULL) || + !(ip->i_itemp->ili_format.ilf_fields & + XFS_ILOG_ALL)) && + (ip->i_update_core == 0)) { + ip = ip->i_mnext; + continue; + } + } + + /* + * Try to lock without sleeping. We're out of order with + * the inode list lock here, so if we fail we need to drop + * the mount lock and try again. If we're called from + * bdflush() here, then don't bother. + * + * The inode lock here actually coordinates with the + * almost spurious inode lock in xfs_ireclaim() to prevent + * the vnode we handle here without a reference from + * being freed while we reference it. If we lock the inode + * while it's on the mount list here, then the spurious inode + * lock in xfs_ireclaim() after the inode is pulled from + * the mount list will sleep until we release it here. + * This keeps the vnode from being freed while we reference + * it. It is also cheaper and simpler than actually doing + * a vn_get() for every inode we touch here. + */ + if (xfs_ilock_nowait(ip, lock_flags) == 0) { + + if ((flags & SYNC_BDFLUSH) || (vp == NULL)) { + ip = ip->i_mnext; + continue; + } + + /* + * We need to unlock the inode list lock in order + * to lock the inode. Insert a marker record into + * the inode list to remember our position, dropping + * the lock is now done inside the IPOINTER_INSERT + * macro. + * + * We also use the inode list lock to protect us + * in taking a snapshot of the vnode version number + * for use in calling vn_get(). + */ + VMAP(vp, vmap); + IPOINTER_INSERT(ip, mp); + + vp = vn_get(vp, &vmap); + if (vp == NULL) { + /* + * The vnode was reclaimed once we let go + * of the inode list lock. Skip to the + * next list entry. Remove the marker. + */ + + XFS_MOUNT_ILOCK(mp); + + mount_locked = B_TRUE; + vnode_refed = B_FALSE; + + IPOINTER_REMOVE(ip, mp); + + continue; + } + + xfs_ilock(ip, lock_flags); + + ASSERT(vp == XFS_ITOV(ip)); + ASSERT(ip->i_mount == mp); + + vnode_refed = B_TRUE; + } + + /* From here on in the loop we may have a marker record + * in the inode list. + */ + + if ((flags & SYNC_CLOSE) && (vp != NULL)) { + /* + * This is the shutdown case. We just need to + * flush and invalidate all the pages associated + * with the inode. Drop the inode lock since + * we can't hold it across calls to the buffer + * cache. + * + * We don't set the VREMAPPING bit in the vnode + * here, because we don't hold the vnode lock + * exclusively. It doesn't really matter, though, + * because we only come here when we're shutting + * down anyway. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (XFS_FORCED_SHUTDOWN(mp)) { + VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF); + } else { + VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF); + } + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + } else if ((flags & SYNC_DELWRI) && (vp != NULL)) { + if (VN_DIRTY(vp)) { + /* We need to have dropped the lock here, + * so insert a marker if we have not already + * done so. + */ + if (mount_locked) { + IPOINTER_INSERT(ip, mp); + } + + /* + * Drop the inode lock since we can't hold it + * across calls to the buffer cache. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, + fflag, FI_NONE, error); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + + } + + if (flags & SYNC_BDFLUSH) { + if ((flags & SYNC_ATTR) && + ((ip->i_update_core) || + ((ip->i_itemp != NULL) && + (ip->i_itemp->ili_format.ilf_fields != 0)))) { + + /* Insert marker and drop lock if not already + * done. + */ + if (mount_locked) { + IPOINTER_INSERT(ip, mp); + } + + /* + * We don't want the periodic flushing of the + * inodes by vfs_sync() to interfere with + * I/O to the file, especially read I/O + * where it is only the access time stamp + * that is being flushed out. To prevent + * long periods where we have both inode + * locks held shared here while reading the + * inode's buffer in from disk, we drop the + * inode lock while reading in the inode + * buffer. We have to release the buffer + * and reacquire the inode lock so that they + * are acquired in the proper order (inode + * locks first). The buffer will go at the + * end of the lru chain, though, so we can + * expect it to still be there when we go + * for it again in xfs_iflush(). + */ + if ((xfs_ipincount(ip) == 0) && + xfs_iflock_nowait(ip)) { + + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + error = xfs_itobp(mp, NULL, ip, + &dip, &bp, 0); + if (!error) { + xfs_buf_relse(bp); + } else { + /* Bailing out, remove the + * marker and free it. + */ + XFS_MOUNT_ILOCK(mp); + + IPOINTER_REMOVE(ip, mp); + + XFS_MOUNT_IUNLOCK(mp); + + ASSERT(!(lock_flags & + XFS_IOLOCK_SHARED)); + + kmem_free(ipointer, + sizeof(xfs_iptr_t)); + return (0); + } + + /* + * Since we dropped the inode lock, + * the inode may have been reclaimed. + * Therefore, we reacquire the mount + * lock and check to see if we were the + * inode reclaimed. If this happened + * then the ipointer marker will no + * longer point back at us. In this + * case, move ip along to the inode + * after the marker, remove the marker + * and continue. + */ + XFS_MOUNT_ILOCK(mp); + mount_locked = B_TRUE; + + if (ip != ipointer->ip_mprev) { + IPOINTER_REMOVE(ip, mp); + + ASSERT(!vnode_refed); + ASSERT(!(lock_flags & + XFS_IOLOCK_SHARED)); + continue; + } + + ASSERT(ip->i_mount == mp); + + if (xfs_ilock_nowait(ip, + XFS_ILOCK_SHARED) == 0) { + ASSERT(ip->i_mount == mp); + /* + * We failed to reacquire + * the inode lock without + * sleeping, so just skip + * the inode for now. We + * clear the ILOCK bit from + * the lock_flags so that we + * won't try to drop a lock + * we don't hold below. + */ + lock_flags &= ~XFS_ILOCK_SHARED; + IPOINTER_REMOVE(ip_next, mp); + } else if ((xfs_ipincount(ip) == 0) && + xfs_iflock_nowait(ip)) { + ASSERT(ip->i_mount == mp); + /* + * Since this is vfs_sync() + * calling we only flush the + * inode out if we can lock + * it without sleeping and + * it is not pinned. Drop + * the mount lock here so + * that we don't hold it for + * too long. We already have + * a marker in the list here. + */ + XFS_MOUNT_IUNLOCK(mp); + mount_locked = B_FALSE; + error = xfs_iflush(ip, + XFS_IFLUSH_DELWRI); + } else { + ASSERT(ip->i_mount == mp); + IPOINTER_REMOVE(ip_next, mp); + } + } + + } + + } else { + if ((flags & SYNC_ATTR) && + ((ip->i_update_core) || + ((ip->i_itemp != NULL) && + (ip->i_itemp->ili_format.ilf_fields != 0)))) { + if (mount_locked) { + IPOINTER_INSERT(ip, mp); + } + + if (flags & SYNC_WAIT) { + xfs_iflock(ip); + error = xfs_iflush(ip, + XFS_IFLUSH_SYNC); + } else { + /* + * If we can't acquire the flush + * lock, then the inode is already + * being flushed so don't bother + * waiting. If we can lock it then + * do a delwri flush so we can + * combine multiple inode flushes + * in each disk write. + */ + if (xfs_iflock_nowait(ip)) { + error = xfs_iflush(ip, + XFS_IFLUSH_DELWRI); + } + else if (bypassed) + (*bypassed)++; + } + } + } + + if (lock_flags != 0) { + xfs_iunlock(ip, lock_flags); + } + + if (vnode_refed) { + /* + * If we had to take a reference on the vnode + * above, then wait until after we've unlocked + * the inode to release the reference. This is + * because we can be already holding the inode + * lock when VN_RELE() calls xfs_inactive(). + * + * Make sure to drop the mount lock before calling + * VN_RELE() so that we don't trip over ourselves if + * we have to go for the mount lock again in the + * inactive code. + */ + if (mount_locked) { + IPOINTER_INSERT(ip, mp); + } + + VN_RELE(vp); + + vnode_refed = B_FALSE; + } + + if (error) { + last_error = error; + } + + /* + * bail out if the filesystem is corrupted. + */ + if (error == EFSCORRUPTED) { + if (!mount_locked) { + XFS_MOUNT_ILOCK(mp); + IPOINTER_REMOVE(ip, mp); + } + XFS_MOUNT_IUNLOCK(mp); + ASSERT(ipointer_in == B_FALSE); + kmem_free(ipointer, sizeof(xfs_iptr_t)); + return XFS_ERROR(error); + } + + /* Let other threads have a chance at the mount lock + * if we have looped many times without dropping the + * lock. + */ + if ((++preempt & XFS_PREEMPT_MASK) == 0) { + if (mount_locked) { + IPOINTER_INSERT(ip, mp); + } + } + + if (mount_locked == B_FALSE) { + XFS_MOUNT_ILOCK(mp); + mount_locked = B_TRUE; + IPOINTER_REMOVE(ip, mp); + continue; + } + + ASSERT(ipointer_in == B_FALSE); + ip = ip->i_mnext; + + } while (ip != mp->m_inodes); + + XFS_MOUNT_IUNLOCK(mp); + + ASSERT(ipointer_in == B_FALSE); + + kmem_free(ipointer, sizeof(xfs_iptr_t)); + return XFS_ERROR(last_error); +} + +/* + * xfs sync routine for internal use + * + * This routine supports all of the flags defined for the generic VFS_SYNC + * interface as explained above under xfs_sync. In the interests of not + * changing interfaces within the 6.5 family, additional internallly- + * required functions are specified within a separate xflags parameter, + * only available by calling this routine. + * + */ +int +xfs_syncsub( + xfs_mount_t *mp, + int flags, + int xflags, + int *bypassed) +{ + int error = 0; + int last_error = 0; + uint log_flags = XFS_LOG_FORCE; + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + /* + * Sync out the log. This ensures that the log is periodically + * flushed even if there is not enough activity to fill it up. + */ + if (flags & SYNC_WAIT) + log_flags |= XFS_LOG_SYNC; + + xfs_log_force(mp, (xfs_lsn_t)0, log_flags); + + if (flags & (SYNC_ATTR|SYNC_DELWRI)) { + if (flags & SYNC_BDFLUSH) + xfs_finish_reclaim_all(mp, 1); + else + error = xfs_sync_inodes(mp, flags, xflags, bypassed); + } + + /* + * Flushing out dirty data above probably generated more + * log activity, so if this isn't vfs_sync() then flush + * the log again. + */ + if (flags & SYNC_DELWRI) { + xfs_log_force(mp, (xfs_lsn_t)0, log_flags); + } + + if (flags & SYNC_FSDATA) { + /* + * If this is vfs_sync() then only sync the superblock + * if we can lock it without sleeping and it is not pinned. + */ + if (flags & SYNC_BDFLUSH) { + bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); + if (bp != NULL) { + bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); + if ((bip != NULL) && + xfs_buf_item_dirty(bip)) { + if (!(XFS_BUF_ISPINNED(bp))) { + XFS_BUF_ASYNC(bp); + error = xfs_bwrite(mp, bp); + } else { + xfs_buf_relse(bp); + } + } else { + xfs_buf_relse(bp); + } + } + } else { + bp = xfs_getsb(mp, 0); + /* + * If the buffer is pinned then push on the log so + * we won't get stuck waiting in the write for + * someone, maybe ourselves, to flush the log. + * Even though we just pushed the log above, we + * did not have the superblock buffer locked at + * that point so it can become pinned in between + * there and here. + */ + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + if (flags & SYNC_WAIT) + XFS_BUF_UNASYNC(bp); + else + XFS_BUF_ASYNC(bp); + error = xfs_bwrite(mp, bp); + } + if (error) { + last_error = error; + } + } + + /* + * If this is the periodic sync, then kick some entries out of + * the reference cache. This ensures that idle entries are + * eventually kicked out of the cache. + */ + if (flags & SYNC_REFCACHE) { + xfs_refcache_purge_some(mp); + } + + /* + * Now check to see if the log needs a "dummy" transaction. + */ + + if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { + xfs_trans_t *tp; + xfs_inode_t *ip; + + /* + * Put a dummy transaction in the log to tell + * recovery that all others are OK. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + if ((error = xfs_trans_reserve(tp, 0, + XFS_ICHANGE_LOG_RES(mp), + 0, 0, 0))) { + xfs_trans_cancel(tp, 0); + return error; + } + + ip = mp->m_rootip; + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_log_force(mp, (xfs_lsn_t)0, log_flags); + } + + /* + * When shutting down, we need to insure that the AIL is pushed + * to disk or the filesystem can appear corrupt from the PROM. + */ + if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) { + XFS_bflush(mp->m_ddev_targp); + if (mp->m_rtdev_targp) { + XFS_bflush(mp->m_rtdev_targp); + } + } + + return XFS_ERROR(last_error); +} + +/* + * xfs_vget - called by DMAPI and NFSD to get vnode from file handle + */ +STATIC int +xfs_vget( + bhv_desc_t *bdp, + vnode_t **vpp, + fid_t *fidp) +{ + xfs_mount_t *mp = XFS_BHVTOM(bdp); + xfs_fid_t *xfid = (struct xfs_fid *)fidp; + xfs_inode_t *ip; + int error; + xfs_ino_t ino; + unsigned int igen; + + /* + * Invalid. Since handles can be created in user space and passed in + * via gethandle(), this is not cause for a panic. + */ + if (xfid->xfs_fid_len != sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) + return XFS_ERROR(EINVAL); + + ino = xfid->xfs_fid_ino; + igen = xfid->xfs_fid_gen; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return XFS_ERROR(ESTALE); + + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); + if (error) { + *vpp = NULL; + return error; + } + + if (ip == NULL) { + *vpp = NULL; + return XFS_ERROR(EIO); + } + + if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + xfs_iput_new(ip, XFS_ILOCK_SHARED); + *vpp = NULL; + return XFS_ERROR(ENOENT); + } + + *vpp = XFS_ITOV(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return 0; +} + + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_NOLOGFLUSH "nologflush" /* don't hard flush on log writes */ +#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ + + +int +xfs_parseargs( + struct bhv_desc *bhv, + char *options, + struct xfs_mount_args *args, + int update) +{ + struct vfs *vfsp = bhvtovfs(bhv); + char *this_char, *value, *eov; + int dsunit, dswidth, vol_dsunit, vol_dswidth; + int iosize; + +#if 0 /* XXX: off by default, until some remaining issues ironed out */ + args->flags |= XFSMNT_IDELETE; /* default to on */ +#endif + + if (!options) + return 0; + + iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_LOGBUFS); + return EINVAL; + } + args->logbufs = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + int last, in_kilobytes = 0; + + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_LOGBSIZE); + return EINVAL; + } + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + in_kilobytes = 1; + value[last] = '\0'; + } + args->logbufsize = simple_strtoul(value, &eov, 10); + if (in_kilobytes) + args->logbufsize <<= 10; + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_LOGDEV); + return EINVAL; + } + strncpy(args->logname, value, MAXNAMELEN); + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_MTPT); + return EINVAL; + } + strncpy(args->mtpt, value, MAXNAMELEN); + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_RTDEV); + return EINVAL; + } + strncpy(args->rtname, value, MAXNAMELEN); + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_BIOSIZE); + return EINVAL; + } + iosize = simple_strtoul(value, &eov, 10); + args->flags |= XFSMNT_IOSIZE; + args->iosizelog = (uint8_t) iosize; + } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + this_char); + return EINVAL; + } + args->flags |= XFSMNT_IHASHSIZE; + args->ihashsize = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + args->flags |= XFSMNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { + args->flags |= XFSMNT_OSYNCISOSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + args->flags |= XFSMNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_INO64)) { + args->flags |= XFSMNT_INO64; +#if !XFS_BIG_INUMS + printk("XFS: %s option not allowed on this system\n", + MNTOPT_INO64); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + args->flags |= XFSMNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + args->flags |= XFSMNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_SUNIT); + return EINVAL; + } + dsunit = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + printk("XFS: %s option requires an argument\n", + MNTOPT_SWIDTH); + return EINVAL; + } + dswidth = simple_strtoul(value, &eov, 10); + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + args->flags &= ~XFSMNT_32BITINODES; +#if !XFS_BIG_INUMS + printk("XFS: %s option not allowed on this system\n", + MNTOPT_64BITINODE); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + args->flags |= XFSMNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) { + args->flags |= XFSMNT_NOLOGFLUSH; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + args->flags &= ~XFSMNT_IDELETE; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + args->flags |= XFSMNT_IDELETE; + } else if (!strcmp(this_char, "osyncisdsync")) { + /* no-op, this is now the default */ +printk("XFS: osyncisdsync is now the default, option is deprecated.\n"); + } else if (!strcmp(this_char, "irixsgid")) { +printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n"); + } else { + printk("XFS: unknown mount option [%s].\n", this_char); + return EINVAL; + } + } + + if (args->flags & XFSMNT_NORECOVERY) { + if ((vfsp->vfs_flag & VFS_RDONLY) == 0) { + printk("XFS: no-recovery mounts must be read-only.\n"); + return EINVAL; + } + } + + if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { + printk( + "XFS: sunit and swidth options incompatible with the noalign option\n"); + return EINVAL; + } + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + printk("XFS: sunit and swidth must be specified together\n"); + return EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + printk( + "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n", + dswidth, dsunit); + return EINVAL; + } + + if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { + if (dsunit) { + args->sunit = dsunit; + args->flags |= XFSMNT_RETERR; + } else { + args->sunit = vol_dsunit; + } + dswidth ? (args->swidth = dswidth) : + (args->swidth = vol_dswidth); + } else { + args->sunit = args->swidth = 0; + } + + return 0; +} + +int +xfs_showargs( + struct bhv_desc *bhv, + struct seq_file *m) +{ + static struct proc_xfs_info { + int flag; + char *str; + } xfs_info[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, + { XFS_MOUNT_INO64, "," MNTOPT_INO64 }, + { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, + { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, + { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, + { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, + { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, + { XFS_MOUNT_NOLOGFLUSH, "," MNTOPT_NOLOGFLUSH }, + { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP }, + { 0, NULL } + }; + struct proc_xfs_info *xfs_infop; + struct xfs_mount *mp = XFS_BHVTOM(bhv); + + for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) { + if (mp->m_flags & xfs_infop->flag) + seq_puts(m, xfs_infop->str); + } + + if (mp->m_flags & XFS_MOUNT_IHASHSIZE) + seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize); + + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, "," MNTOPT_BIOSIZE "=%d", mp->m_writeio_log); + + if (mp->m_logbufs > 0) + seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + + if (mp->m_logbsize > 0) + seq_printf(m, "," MNTOPT_LOGBSIZE "=%d", mp->m_logbsize); + + if (mp->m_ddev_targp != mp->m_logdev_targp) + seq_printf(m, "," MNTOPT_LOGDEV "=%s", + XFS_BUFTARG_NAME(mp->m_logdev_targp)); + + if (mp->m_rtdev_targp && mp->m_ddev_targp != mp->m_rtdev_targp) + seq_printf(m, "," MNTOPT_RTDEV "=%s", + XFS_BUFTARG_NAME(mp->m_rtdev_targp)); + + if (mp->m_dalign > 0) + seq_printf(m, "," MNTOPT_SUNIT "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); + + if (mp->m_swidth > 0) + seq_printf(m, "," MNTOPT_SWIDTH "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); + + if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT)) + seq_printf(m, "," MNTOPT_64BITINODE); + + return 0; +} + +STATIC void +xfs_freeze( + bhv_desc_t *bdp) +{ + xfs_mount_t *mp = XFS_BHVTOM(bdp); + + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* Push the superblock and write an unmount record */ + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); +} + + +vfsops_t xfs_vfsops = { + BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS), + .vfs_parseargs = xfs_parseargs, + .vfs_showargs = xfs_showargs, + .vfs_mount = xfs_mount, + .vfs_unmount = xfs_unmount, + .vfs_mntupdate = xfs_mntupdate, + .vfs_root = xfs_root, + .vfs_statvfs = xfs_statvfs, + .vfs_sync = xfs_sync, + .vfs_vget = xfs_vget, + .vfs_dmapiops = (vfs_dmapiops_t)fs_nosys, + .vfs_quotactl = (vfs_quotactl_t)fs_nosys, + .vfs_init_vnode = xfs_initialize_vnode, + .vfs_force_shutdown = xfs_do_force_shutdown, + .vfs_freeze = xfs_freeze, +}; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c new file mode 100644 index 00000000000..70092963ca9 --- /dev/null +++ b/fs/xfs/xfs_vnodeops.c @@ -0,0 +1,4712 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_macros.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_itable.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode_item.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_rw.h" +#include "xfs_refcache.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_dir_leaf.h" +#include "xfs_mac.h" +#include "xfs_log_priv.h" + + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 2 extents back from + * bmapi. + */ +#define SYMLINK_MAPS 2 + +/* + * For xfs, we check that the file isn't too big to be opened by this kernel. + * No other open action is required for regular files. Devices are handled + * through the specfs file system, pipes through fifofs. Device and + * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively, + * when a new vnode is first looked up or created. + */ +STATIC int +xfs_open( + bhv_desc_t *bdp, + cred_t *credp) +{ + int mode; + vnode_t *vp; + xfs_inode_t *ip; + + vp = BHV_TO_VNODE(bdp); + ip = XFS_BHVTOI(bdp); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return XFS_ERROR(EIO); + + /* + * If it's a directory with any blocks, read-ahead block 0 + * as we're almost certain to have the next operation be a read there. + */ + if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) { + mode = xfs_ilock_map_shared(ip); + if (ip->i_d.di_nextents > 0) + (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_iunlock(ip, mode); + } + return 0; +} + + +/* + * xfs_getattr + */ +STATIC int +xfs_getattr( + bhv_desc_t *bdp, + vattr_t *vap, + int flags, + cred_t *credp) +{ + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + + vp = BHV_TO_VNODE(bdp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + if (!(flags & ATTR_LAZY)) + xfs_ilock(ip, XFS_ILOCK_SHARED); + + vap->va_size = ip->i_d.di_size; + if (vap->va_mask == XFS_AT_SIZE) + goto all_done; + + vap->va_nblocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + vap->va_nodeid = ip->i_ino; +#if XFS_BIG_INUMS + vap->va_nodeid += mp->m_inoadd; +#endif + vap->va_nlink = ip->i_d.di_nlink; + + /* + * Quick exit for non-stat callers + */ + if ((vap->va_mask & + ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID| + XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0) + goto all_done; + + /* + * Copy from in-core inode. + */ + vap->va_type = vp->v_type; + vap->va_mode = ip->i_d.di_mode & MODEMASK; + vap->va_uid = ip->i_d.di_uid; + vap->va_gid = ip->i_d.di_gid; + vap->va_projid = ip->i_d.di_projid; + + /* + * Check vnode type block/char vs. everything else. + * Do it with bitmask because that's faster than looking + * for multiple values individually. + */ + if (((1 << vp->v_type) & ((1<va_rdev = 0; + + if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + +#if 0 + /* Large block sizes confuse various + * user space programs, so letting the + * stripe size through is not a good + * idea for now. + */ + vap->va_blocksize = mp->m_swidth ? + /* + * If the underlying volume is a stripe, then + * return the stripe width in bytes as the + * recommended I/O size. + */ + (mp->m_swidth << mp->m_sb.sb_blocklog) : + /* + * Return the largest of the preferred buffer + * sizes since doing small I/Os into larger + * buffers causes buffers to be decommissioned. + * The value returned is in bytes. + */ + (1 << (int)MAX(mp->m_readio_log, + mp->m_writeio_log)); + +#else + vap->va_blocksize = + /* + * Return the largest of the preferred buffer + * sizes since doing small I/Os into larger + * buffers causes buffers to be decommissioned. + * The value returned is in bytes. + */ + 1 << (int)MAX(mp->m_readio_log, + mp->m_writeio_log); +#endif + } else { + + /* + * If the file blocks are being allocated from a + * realtime partition, then return the inode's + * realtime extent size or the realtime volume's + * extent size. + */ + vap->va_blocksize = ip->i_d.di_extsize ? + (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) : + (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog); + } + } else { + vap->va_rdev = ip->i_df.if_u2.if_rdev; + vap->va_blocksize = BLKDEV_IOSIZE; + } + + vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec; + vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + + /* + * Exit for stat callers. See if any of the rest of the fields + * to be filled in are needed. + */ + if ((vap->va_mask & + (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| + XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) + goto all_done; + + /* + * Convert di_flags to xflags. + */ + vap->va_xflags = xfs_ip2xflags(ip); + + /* + * Exit for inode revalidate. See if any of the rest of + * the fields to be filled in are needed. + */ + if ((vap->va_mask & + (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| + XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) + goto all_done; + + vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog; + vap->va_nextents = + (ip->i_df.if_flags & XFS_IFEXTENTS) ? + ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) : + ip->i_d.di_nextents; + if (ip->i_afp) + vap->va_anextents = + (ip->i_afp->if_flags & XFS_IFEXTENTS) ? + ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) : + ip->i_d.di_anextents; + else + vap->va_anextents = 0; + vap->va_gen = ip->i_d.di_gen; + + all_done: + if (!(flags & ATTR_LAZY)) + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return 0; +} + + +/* + * xfs_setattr + */ +int +xfs_setattr( + bhv_desc_t *bdp, + vattr_t *vap, + int flags, + cred_t *credp) +{ + xfs_inode_t *ip; + xfs_trans_t *tp; + xfs_mount_t *mp; + int mask; + int code; + uint lock_flags; + uint commit_flags=0; + uid_t uid=0, iuid=0; + gid_t gid=0, igid=0; + int timeflags = 0; + vnode_t *vp; + xfs_prid_t projid=0, iprojid=0; + int mandlock_before, mandlock_after; + struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; + int file_owner; + int need_iolock = (flags & ATTR_DMI) == 0; + + vp = BHV_TO_VNODE(bdp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + + if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + + /* + * Cannot set certain attributes. + */ + mask = vap->va_mask; + if (mask & XFS_AT_NOSET) { + return XFS_ERROR(EINVAL); + } + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Timestamps do not need to be logged and hence do not + * need to be done within a transaction. + */ + if (mask & XFS_AT_UPDTIMES) { + ASSERT((mask & ~XFS_AT_UPDTIMES) == 0); + timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) | + ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) | + ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0); + xfs_ichgtime(ip, timeflags); + return 0; + } + + olddquot1 = olddquot2 = NULL; + udqp = gdqp = NULL; + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) { + uint qflags = 0; + + if (mask & XFS_AT_UID) { + uid = vap->va_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = ip->i_d.di_uid; + } + if (mask & XFS_AT_GID) { + gid = vap->va_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = ip->i_d.di_gid; + } + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + code = XFS_QM_DQVOPALLOC(mp, ip, uid,gid, qflags, &udqp, &gdqp); + if (code) + return (code); + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = NULL; + lock_flags = XFS_ILOCK_EXCL; + if (!(mask & XFS_AT_SIZE)) { + if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) || + (mp->m_flags & XFS_MOUNT_WSYNC)) { + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + commit_flags = 0; + if ((code = xfs_trans_reserve(tp, 0, + XFS_ICHANGE_LOG_RES(mp), 0, + 0, 0))) { + lock_flags = 0; + goto error_return; + } + } + } else { + if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) && + !(flags & ATTR_DMI)) { + int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; + code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp, + vap->va_size, 0, dmflags, NULL); + if (code) { + lock_flags = 0; + goto error_return; + } + } + if (need_iolock) + lock_flags |= XFS_IOLOCK_EXCL; + } + + xfs_ilock(ip, lock_flags); + + /* boolean: are we the file owner? */ + file_owner = (current_fsuid(credp) == ip->i_d.di_uid); + + /* + * Change various properties of a file. + * Only the owner or users with CAP_FOWNER + * capability may do these things. + */ + if (mask & + (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID| + XFS_AT_GID|XFS_AT_PROJID)) { + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal + * to the file owner ID, except in cases where the + * CAP_FSETID capability is applicable. + */ + if (!file_owner && !capable(CAP_FOWNER)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * CAP_FSETID overrides the following restrictions: + * + * The effective user ID of the calling process shall match + * the file owner when setting the set-user-ID and + * set-group-ID bits on that file. + * + * The effective group ID or one of the supplementary group + * IDs of the calling process shall match the group owner of + * the file when setting the set-group-ID bit on that file + */ + if (mask & XFS_AT_MODE) { + mode_t m = 0; + + if ((vap->va_mode & S_ISUID) && !file_owner) + m |= S_ISUID; + if ((vap->va_mode & S_ISGID) && + !in_group_p((gid_t)ip->i_d.di_gid)) + m |= S_ISGID; +#if 0 + /* Linux allows this, Irix doesn't. */ + if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR) + m |= S_ISVTX; +#endif + if (m && !capable(CAP_FSETID)) + vap->va_mode &= ~m; + } + } + + /* + * Change file ownership. Must be the owner or privileged. + * If the system was configured with the "restricted_chown" + * option, the owner is not permitted to give away the file, + * and can change the group id only to a group of which he + * or she is a member. + */ + if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { + /* + * These IDs could have changed since we last looked at them. + * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = ip->i_d.di_uid; + iprojid = ip->i_d.di_projid; + igid = ip->i_d.di_gid; + gid = (mask & XFS_AT_GID) ? vap->va_gid : igid; + uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid; + projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid : + iprojid; + + /* + * CAP_CHOWN overrides the following restrictions: + * + * If _POSIX_CHOWN_RESTRICTED is defined, this capability + * shall override the restriction that a process cannot + * change the user ID of a file it owns and the restriction + * that the group ID supplied to the chown() function + * shall be equal to either the group ID or one of the + * supplementary group IDs of the calling process. + * + * XXX: How does restricted_chown affect projid? + */ + if (restricted_chown && + (iuid != uid || (igid != gid && + !in_group_p((gid_t)gid))) && + !capable(CAP_CHOWN)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + /* + * Do a quota reservation only if uid or gid is actually + * going to change. + */ + if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || + (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { + ASSERT(tp); + code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, + capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + /* + * Truncate file. Must have write permission and not be a directory. + */ + if (mask & XFS_AT_SIZE) { + /* Short circuit the truncate case for zero length files */ + if ((vap->va_size == 0) && + (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; + if (mask & XFS_AT_CTIME) + xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + code = 0; + goto error_return; + } + + if (vp->v_type == VDIR) { + code = XFS_ERROR(EISDIR); + goto error_return; + } else if (vp->v_type != VREG) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + /* + * Make sure that the dquots are attached to the inode. + */ + if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) + goto error_return; + } + + /* + * Change file access or modified times. + */ + if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { + if (!file_owner) { + if ((flags & ATTR_UTIME) && + !capable(CAP_FOWNER)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } + } + + /* + * Change extent size or realtime flag. + */ + if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { + /* + * Can't change extent size if any extents are allocated. + */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + (mask & XFS_AT_EXTSIZE) && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + vap->va_extsize) ) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * Can't set extent size unless the file is marked, or + * about to be marked as a realtime file. + * + * This check will be removed when fixed size extents + * with buffered data writes is implemented. + * + */ + if ((mask & XFS_AT_EXTSIZE) && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + vap->va_extsize) && + (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) || + ((mask & XFS_AT_XFLAGS) && + (vap->va_xflags & XFS_XFLAG_REALTIME))))) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + /* + * Can't change realtime flag if any extents are allocated. + */ + if (ip->i_d.di_nextents && (mask & XFS_AT_XFLAGS) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != + (vap->va_xflags & XFS_XFLAG_REALTIME)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + /* + * Extent size must be a multiple of the appropriate block + * size, if set at all. + */ + if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) { + xfs_extlen_t size; + + if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) || + ((mask & XFS_AT_XFLAGS) && + (vap->va_xflags & XFS_XFLAG_REALTIME))) { + size = mp->m_sb.sb_rextsize << + mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + } + if (vap->va_extsize % size) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + /* + * If realtime flag is set then must have realtime data. + */ + if ((mask & XFS_AT_XFLAGS) && + (vap->va_xflags & XFS_XFLAG_REALTIME)) { + if ((mp->m_sb.sb_rblocks == 0) || + (mp->m_sb.sb_rextsize == 0) || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if ((mask & XFS_AT_XFLAGS) && + (ip->i_d.di_flags & + (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || + (vap->va_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } + + /* + * Now we can make the changes. Before we join the inode + * to the transaction, if XFS_AT_SIZE is set then take care of + * the part of the truncation that must be done without the + * inode lock. This needs to be done before joining the inode + * to the transaction, because the inode cannot be unlocked + * once it is a part of the transaction. + */ + if (mask & XFS_AT_SIZE) { + code = 0; + if (vap->va_size > ip->i_d.di_size) + code = xfs_igrow_start(ip, vap->va_size, credp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (!code) + code = xfs_itruncate_data(ip, vap->va_size); + if (code) { + ASSERT(tp == NULL); + lock_flags &= ~XFS_ILOCK_EXCL; + ASSERT(lock_flags == XFS_IOLOCK_EXCL); + goto error_return; + } + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + if ((code = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT))) { + xfs_trans_cancel(tp, 0); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return code; + } + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + if (tp) { + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ihold(tp, ip); + } + + /* determine whether mandatory locking mode changes */ + mandlock_before = MANDLOCK(vp, ip->i_d.di_mode); + + /* + * Truncate file. Must have write permission and not be a directory. + */ + if (mask & XFS_AT_SIZE) { + if (vap->va_size > ip->i_d.di_size) { + xfs_igrow_finish(tp, ip, vap->va_size, + !(flags & ATTR_DMI)); + } else if ((vap->va_size <= ip->i_d.di_size) || + ((vap->va_size == 0) && ip->i_d.di_nextents)) { + /* + * signal a sync transaction unless + * we're truncating an already unlinked + * file on a wsync filesystem + */ + code = xfs_itruncate_finish(&tp, ip, + (xfs_fsize_t)vap->va_size, + XFS_DATA_FORK, + ((ip->i_d.di_nlink != 0 || + !(mp->m_flags & XFS_MOUNT_WSYNC)) + ? 1 : 0)); + if (code) { + goto abort_return; + } + } + /* + * Have to do this even if the file's size doesn't change. + */ + timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + /* + * Change file access modes. + */ + if (mask & XFS_AT_MODE) { + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= vap->va_mode & ~S_IFMT; + + xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); + timeflags |= XFS_ICHGTIME_CHG; + } + + /* + * Change file ownership. Must be the owner or privileged. + * If the system was configured with the "restricted_chown" + * option, the owner is not permitted to give away the file, + * and can change the group id only to a group of which he + * or she is a member. + */ + if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) { + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + } + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (iuid != uid) { + if (XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & XFS_AT_UID); + ASSERT(udqp); + olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = uid; + } + if (igid != gid) { + if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(mask & XFS_AT_GID); + ASSERT(gdqp); + olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = gid; + } + if (iprojid != projid) { + ip->i_d.di_projid = projid; + /* + * We may have to rev the inode as well as + * the superblock version number since projids didn't + * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. + */ + if (ip->i_d.di_version == XFS_DINODE_VERSION_1) + xfs_bump_ino_vers2(tp, ip); + } + + xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); + timeflags |= XFS_ICHGTIME_CHG; + } + + + /* + * Change file access or modified times. + */ + if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { + if (mask & XFS_AT_ATIME) { + ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec; + ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec; + ip->i_update_core = 1; + timeflags &= ~XFS_ICHGTIME_ACC; + } + if (mask & XFS_AT_MTIME) { + ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec; + timeflags &= ~XFS_ICHGTIME_MOD; + timeflags |= XFS_ICHGTIME_CHG; + } + if (tp && (flags & ATTR_UTIME)) + xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); + } + + /* + * Change XFS-added attributes. + */ + if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { + if (mask & XFS_AT_EXTSIZE) { + /* + * Converting bytes to fs blocks. + */ + ip->i_d.di_extsize = vap->va_extsize >> + mp->m_sb.sb_blocklog; + } + if (mask & XFS_AT_XFLAGS) { + uint di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (vap->va_xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (vap->va_xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (vap->va_xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (vap->va_xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (vap->va_xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + } else { + if (vap->va_xflags & XFS_XFLAG_REALTIME) { + di_flags |= XFS_DIFLAG_REALTIME; + ip->i_iocore.io_flags |= XFS_IOCORE_RT; + } else { + ip->i_iocore.io_flags &= ~XFS_IOCORE_RT; + } + } + ip->i_d.di_flags = di_flags; + } + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + timeflags |= XFS_ICHGTIME_CHG; + } + + /* + * Change file inode change time only if XFS_AT_CTIME set + * AND we have been called by a DMI function. + */ + + if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) { + ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec; + ip->i_update_core = 1; + timeflags &= ~XFS_ICHGTIME_CHG; + } + + /* + * Send out timestamp changes that need to be set to the + * current time. Not done when called by a DMI function. + */ + if (timeflags && !(flags & ATTR_DMI)) + xfs_ichgtime(ip, timeflags); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesytems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. + */ + code = 0; + if (tp) { + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + code = xfs_trans_commit(tp, commit_flags, NULL); + } + + /* + * If the (regular) file's mandatory locking mode changed, then + * notify the vnode. We do this under the inode lock to prevent + * racing calls to vop_vnode_change. + */ + mandlock_after = MANDLOCK(vp, ip->i_d.di_mode); + if (mandlock_before != mandlock_after) { + VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING, + mandlock_after); + } + + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + XFS_QM_DQRELE(mp, olddquot1); + XFS_QM_DQRELE(mp, olddquot2); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + if (code) { + return code; + } + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) && + !(flags & ATTR_DMI)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, NULL, NULL, + 0, 0, AT_DELAY_FLAG(flags)); + } + return 0; + + abort_return: + commit_flags |= XFS_TRANS_ABORT; + /* FALLTHROUGH */ + error_return: + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + if (tp) { + xfs_trans_cancel(tp, commit_flags); + } + if (lock_flags != 0) { + xfs_iunlock(ip, lock_flags); + } + return code; +} + + +/* + * xfs_access + * Null conversion from vnode mode bits to inode mode bits, as in efs. + */ +STATIC int +xfs_access( + bhv_desc_t *bdp, + int mode, + cred_t *credp) +{ + xfs_inode_t *ip; + int error; + + vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, + (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_iaccess(ip, mode, credp); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + + +/* + * xfs_readlink + * + */ +STATIC int +xfs_readlink( + bhv_desc_t *bdp, + uio_t *uiop, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip; + int count; + xfs_off_t offset; + int pathlen; + vnode_t *vp; + int error = 0; + xfs_mount_t *mp; + int nmaps; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + xfs_daddr_t d; + int byte_cnt; + int n; + xfs_buf_t *bp; + + vp = BHV_TO_VNODE(bdp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); + + offset = uiop->uio_offset; + count = uiop->uio_resid; + + if (offset < 0) { + error = XFS_ERROR(EINVAL); + goto error_return; + } + if (count <= 0) { + error = 0; + goto error_return; + } + + if (!(ioflags & IO_INVIS)) { + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + } + + /* + * See if the symlink is stored inline. + */ + pathlen = (int)ip->i_d.di_size; + + if (ip->i_df.if_flags & XFS_IFINLINE) { + error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop); + } + else { + /* + * Symlink not inline. Call bmap to get it in. + */ + nmaps = SYMLINK_MAPS; + + error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), + 0, NULL, 0, mval, &nmaps, NULL); + + if (error) { + goto error_return; + } + + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_buf_read(mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + error = XFS_BUF_GETERROR(bp); + if (error) { + xfs_ioerror_alert("xfs_readlink", + ip->i_mount, bp, XFS_BUF_ADDR(bp)); + xfs_buf_relse(bp); + goto error_return; + } + if (pathlen < byte_cnt) + byte_cnt = pathlen; + pathlen -= byte_cnt; + + error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop); + xfs_buf_relse (bp); + } + + } + + +error_return: + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return error; +} + + +/* + * xfs_fsync + * + * This is called to sync the inode and its data out to disk. + * We need to hold the I/O lock while flushing the data, and + * the inode lock while flushing the inode. The inode lock CANNOT + * be held while flushing the data, so acquire after we're done + * with that. + */ +STATIC int +xfs_fsync( + bhv_desc_t *bdp, + int flag, + cred_t *credp, + xfs_off_t start, + xfs_off_t stop) +{ + xfs_inode_t *ip; + xfs_trans_t *tp; + int error; + + vn_trace_entry(BHV_TO_VNODE(bdp), + __FUNCTION__, (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + + ASSERT(start >= 0 && stop >= -1); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return XFS_ERROR(EIO); + + /* + * We always need to make sure that the required inode state + * is safe on disk. The vnode might be clean but because + * of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional + * changes to the inode core that have to go to disk. + * + * The following code depends on one assumption: that + * any transaction that changes an inode logs the core + * because it has to change some field in the inode core + * (typically nextents or nblocks). That assumption + * implies that any transactions against an inode will + * catch any non-transactional updates. If inode-altering + * transactions exist that violate this assumption, the + * code breaks. Right now, it figures that if the involved + * update_* field is clear and the inode is unpinned, the + * inode is clean. Either it's been flushed or it's been + * committed and the commit has hit the disk unpinning the inode. + * (Note that xfs_inode_item_format() called at commit clears + * the update_* fields.) + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + + /* If we are flushing data then we care about update_size + * being set, otherwise we care about update_core + */ + if ((flag & FSYNC_DATA) ? + (ip->i_update_size == 0) : + (ip->i_update_core == 0)) { + /* + * Timestamps/size haven't changed since last inode + * flush or inode transaction commit. That means + * either nothing got written or a transaction + * committed which caught the updates. If the + * latter happened and the transaction hasn't + * hit the disk yet, the inode will be still + * be pinned. If it is, force the log. + */ + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (xfs_ipincount(ip)) { + xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE | + ((flag & FSYNC_WAIT) + ? XFS_LOG_SYNC : 0)); + } + error = 0; + } else { + /* + * Kick off a transaction to log the inode + * core to get the updates. Make it + * sync if FSYNC_WAIT is passed in (which + * is done by everybody but specfs). The + * sync transaction will also force the log. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + if ((error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), + 0, 0, 0))) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed + * ourselves out of the way during trans_reserve + * which would flush the inode. But there's no + * guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer + * could be pinned anyway if it's part of an + * inode in another recent transaction. So we + * play it safe and fire off the transaction anyway. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (flag & FSYNC_WAIT) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + return error; +} + +/* + * This is called by xfs_inactive to free any blocks beyond eof, + * when the link count isn't zero. + */ +STATIC int +xfs_inactive_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size)); + last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + map_len = last_fsb - end_fsb; + if (map_len <= 0) + return (0); + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, + NULL, 0, &imap, &nimaps, NULL); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK)) { + /* + * Attach the dquots to the inode up front. + */ + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + return (error); + + /* + * There are blocks after the end of file. + * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * Do the xfs_itruncate_start() call before + * reserving any log space because + * itruncate_start will call into the buffer + * cache and we can't + * do that within a transaction. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, + ip->i_d.di_size); + + error = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return (error); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, + XFS_IOLOCK_EXCL | + XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, + ip->i_d.di_size, + XFS_DATA_FORK, + 0); + /* + * If we get an error at this point we + * simply don't bother truncating the file. + */ + if (error) { + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES, + NULL); + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + } + return (error); +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + int nmaps; + xfs_trans_t *ntp; + int size; + xfs_trans_t *tp; + + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + *tpp = NULL; + return error; + } + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + XFS_BMAP_INIT(&free_list, &first_block); + nmaps = sizeof(mval) / sizeof(mval[0]); + if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), + XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, + &free_list))) + goto error0; + /* + * Invalidate the block(s). + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done))) + goto error1; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed))) + goto error1; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Get a new, empty transaction to return to our caller. + */ + ntp = xfs_trans_dup(tp); + /* + * Commit the transaction containing extent freeing and EFD's. + * If we get an error on the commit here or on the reserve below, + * we need to unlock the inode since the new transaction doesn't + * have the inode attached. + */ + error = xfs_trans_commit(tp, 0, NULL); + tp = ntp; + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + /* + * Put an itruncate log reservation in the new transaction + * for our caller. + */ + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * Return with the inode locked but not joined to the transaction. + */ + *tpp = tp; + return 0; + + error1: + xfs_bmap_cancel(&free_list); + error0: + /* + * Have to come here with the inode locked and either + * (held and in the transaction) or (not in the transaction). + * If the inode isn't held then cancel would iput it, but + * that's wrong since this is inactive and the vnode ref + * count is 0 already. + * Cancel won't do anything to the inode if held, but it still + * needs to be locked until the cancel is done, if it was + * joined to the transaction. + */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + *tpp = NULL; + return error; + +} + +STATIC int +xfs_inactive_symlink_local( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + int error; + + ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink which fit into + * the inode. Just free the memory used + * to hold the old symlink. + */ + error = xfs_trans_reserve(*tpp, 0, + XFS_ITRUNCATE_LOG_RES(ip->i_mount), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + + if (error) { + xfs_trans_cancel(*tpp, 0); + *tpp = NULL; + return (error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. + */ + if (ip->i_df.if_bytes > 0) { + xfs_idata_realloc(ip, + -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + } + return (0); +} + +/* + * + */ +STATIC int +xfs_inactive_attrs( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_trans_t *tp; + int error; + xfs_mount_t *mp; + + ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_forkoff != 0); + xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + error = xfs_attr_inactive(ip); + if (error) { + *tpp = NULL; + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return (error); /* goto out*/ + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + *tpp = NULL; + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return (error); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + + *tpp = tp; + return (0); +} + +STATIC int +xfs_release( + bhv_desc_t *bdp) +{ + xfs_inode_t *ip; + vnode_t *vp; + xfs_mount_t *mp; + int error; + + vp = BHV_TO_VNODE(bdp); + ip = XFS_BHVTOI(bdp); + + if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) { + return 0; + } + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + return 0; + +#ifdef HAVE_REFCACHE + /* If we are in the NFS reference cache then don't do this now */ + if (ip->i_refcache) + return 0; +#endif + + mp = ip->i_mount; + + if (ip->i_d.di_nlink != 0) { + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) { + if ((error = xfs_inactive_free_eofblocks(mp, ip))) + return (error); + /* Update linux inode block count after free above */ + LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp, + ip->i_d.di_nblocks + ip->i_delayed_blks); + } + } + + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +STATIC int +xfs_inactive( + bhv_desc_t *bdp, + cred_t *credp) +{ + xfs_inode_t *ip; + vnode_t *vp; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error; + int truncate; + + vp = BHV_TO_VNODE(bdp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return VN_INACTIVE_CACHE; + } + + /* + * Only do a truncate if it's a regular file with + * some actual space in it. It's OK to look at the + * inode's fields without the lock because we're the + * only one with a reference to the inode. + */ + truncate = ((ip->i_d.di_nlink == 0) && + ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) && + ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + + mp = ip->i_mount; + + if (ip->i_d.di_nlink == 0 && + DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) { + (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL); + } + + error = 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (vp->v_vfsp->vfs_flag & VFS_RDONLY) + goto out; + + if (ip->i_d.di_nlink != 0) { + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS)) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0))) { + if ((error = xfs_inactive_free_eofblocks(mp, ip))) + return (VN_INACTIVE_CACHE); + /* Update linux inode block count after free above */ + LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp, + ip->i_d.di_nblocks + ip->i_delayed_blks); + } + goto out; + } + + ASSERT(ip->i_d.di_nlink == 0); + + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + return (VN_INACTIVE_CACHE); + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + if (truncate) { + /* + * Do the xfs_itruncate_start() call before + * reserving any log space because itruncate_start + * will call into the buffer cache and we can't + * do that within a transaction. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); + + error = xfs_trans_reserve(tp, 0, + XFS_ITRUNCATE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + /* Don't call itruncate_cleanup */ + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return (VN_INACTIVE_CACHE); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + /* + * normally, we have to run xfs_itruncate_finish sync. + * But if filesystem is wsync and we're in the inactive + * path, then we know that nlink == 0, and that the + * xaction that made nlink == 0 is permanently committed + * since xfs_remove runs as a synchronous transaction. + */ + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, + (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0)); + + if (error) { + xfs_trans_cancel(tp, + XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + return (VN_INACTIVE_CACHE); + } + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + + /* + * If we get an error while cleaning up a + * symlink we bail out. + */ + error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ? + xfs_inactive_symlink_rmt(ip, &tp) : + xfs_inactive_symlink_local(ip, &tp); + + if (error) { + ASSERT(tp == NULL); + return (VN_INACTIVE_CACHE); + } + + xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + } else { + error = xfs_trans_reserve(tp, 0, + XFS_IFREE_LOG_RES(mp), + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return (VN_INACTIVE_CACHE); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + } + + /* + * If there are attributes associated with the file + * then blow them away now. The code calls a routine + * that recursively deconstructs the attribute fork. + * We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + error = xfs_inactive_attrs(ip, &tp); + /* + * If we got an error, the transaction is already + * cancelled, and the inode is unlocked. Just get out. + */ + if (error) + return (VN_INACTIVE_CACHE); + } else if (ip->i_afp) { + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + } + + /* + * Free the inode. + */ + XFS_BMAP_INIT(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + cmn_err(CE_NOTE, + "xfs_inactive: xfs_ifree() returned an error = %d on %s", + error, mp->m_fsname); + xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } else { + /* + * Credit the quota account(s). The inode is gone. + */ + XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is + * nothing we can do except to try to keep going. + */ + (void) xfs_bmap_finish(&tp, &free_list, first_block, + &committed); + (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + } + /* + * Release the dquots held by inode, if any. + */ + XFS_QM_DQDETACH(mp, ip); + + xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + + out: + return VN_INACTIVE_CACHE; +} + + +/* + * xfs_lookup + */ +STATIC int +xfs_lookup( + bhv_desc_t *dir_bdp, + vname_t *dentry, + vnode_t **vpp, + int flags, + vnode_t *rdir, + cred_t *credp) +{ + xfs_inode_t *dp, *ip; + xfs_ino_t e_inum; + int error; + uint lock_mode; + vnode_t *dir_vp; + + dir_vp = BHV_TO_VNODE(dir_bdp); + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + dp = XFS_BHVTOI(dir_bdp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + lock_mode = xfs_ilock_map_shared(dp); + error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip); + if (!error) { + *vpp = XFS_ITOV(ip); + ITRACE(ip); + } + xfs_iunlock_map_shared(dp, lock_mode); + return error; +} + + +/* + * xfs_create (create a new file). + */ +STATIC int +xfs_create( + bhv_desc_t *dir_bdp, + vname_t *dentry, + vattr_t *vap, + vnode_t **vpp, + cred_t *credp) +{ + char *name = VNAME(dentry); + vnode_t *dir_vp; + xfs_inode_t *dp, *ip; + vnode_t *vp=NULL; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_dev_t rdev; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + boolean_t dp_joined_to_trans; + int dm_event_sent = 0; + uint cancel_flags; + int committed; + xfs_prid_t prid; + struct xfs_dquot *udqp, *gdqp; + uint resblks; + int dm_di_mode; + int namelen; + + ASSERT(!*vpp); + dir_vp = BHV_TO_VNODE(dir_bdp); + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + dp = XFS_BHVTOI(dir_bdp); + mp = dp->i_mount; + + dm_di_mode = vap->va_mode|VTTOIF(vap->va_type); + namelen = VNAMELEN(dentry); + + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, + dir_vp, DM_RIGHT_NULL, NULL, + DM_RIGHT_NULL, name, NULL, + dm_di_mode, 0, 0); + + if (error) + return error; + dm_event_sent = 1; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* Return through std_return after this point. */ + + udqp = gdqp = NULL; + if (vap->va_mask & XFS_AT_PROJID) + prid = (xfs_prid_t)vap->va_projid; + else + prid = (xfs_prid_t)dfltprid; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = XFS_QM_DQVOPALLOC(mp, dp, + current_fsuid(credp), current_fsgid(credp), + XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + ip = NULL; + dp_joined_to_trans = B_FALSE; + + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_CREATE_SPACE_RES(mp, namelen); + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + dp = NULL; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + + XFS_BMAP_INIT(&free_list, &first_block); + + ASSERT(ip == NULL); + + /* + * Reserve disk quota and the inode. + */ + error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + if (resblks == 0 && + (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen))) + goto error_return; + rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0; + error = xfs_dir_ialloc(&tp, dp, + MAKEIMODE(vap->va_type,vap->va_mode), 1, + rdev, credp, prid, resblks > 0, + &ip, &committed); + if (error) { + if (error == ENOSPC) + goto error_return; + goto abort_return; + } + ITRACE(ip); + + /* + * At this point, we've gotten a newly allocated inode. + * It is locked (and joined to the transaction). + */ + + ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + + /* + * Now we join the directory inode to the transaction. + * We do not do it earlier because xfs_dir_ialloc + * might commit the previous transaction (and release + * all the locks). + */ + + VN_HOLD(dir_vp); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + dp_joined_to_trans = B_TRUE; + + error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino, + &first_block, &free_list, + resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != ENOSPC); + goto abort_return; + } + xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + dp->i_gen++; + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp); + + /* + * xfs_trans_commit normally decrements the vnode ref count + * when it unlocks the inode. Since we want to return the + * vnode to the caller, we bump the vnode ref count now. + */ + IHOLD(ip); + vp = XFS_ITOV(ip); + + error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_rele; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) { + IRELE(ip); + tp = NULL; + goto error_return; + } + + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + /* + * Propogate the fact that the vnode changed after the + * xfs_inode locks have been released. + */ + VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3); + + *vpp = vp; + + /* Fallthrough to std_return with error = 0 */ + +std_return: + if ( (*vpp || (error != 0 && dm_event_sent != 0)) && + DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), + DM_EVENT_POSTCREATE)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, + dir_vp, DM_RIGHT_NULL, + *vpp ? vp:NULL, + DM_RIGHT_NULL, name, NULL, + dm_di_mode, error, 0); + } + return error; + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + /* FALLTHROUGH */ + error_return: + + if (tp != NULL) + xfs_trans_cancel(tp, cancel_flags); + + if (!dp_joined_to_trans && (dp != NULL)) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + goto std_return; + + abort_rele: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + cancel_flags |= XFS_TRANS_ABORT; + xfs_trans_cancel(tp, cancel_flags); + IRELE(ip); + + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + goto std_return; +} + +#ifdef DEBUG +/* + * Some counters to see if (and how often) we are hitting some deadlock + * prevention code paths. + */ + +int xfs_rm_locks; +int xfs_rm_lock_delays; +int xfs_rm_attempts; +#endif + +/* + * The following routine will lock the inodes associated with the + * directory and the named entry in the directory. The locks are + * acquired in increasing inode number. + * + * If the entry is "..", then only the directory is locked. The + * vnode ref count will still include that from the .. entry in + * this case. + * + * There is a deadlock we need to worry about. If the locked directory is + * in the AIL, it might be blocking up the log. The next inode we lock + * could be already locked by another thread waiting for log space (e.g + * a permanent log reservation with a long running transaction (see + * xfs_itruncate_finish)). To solve this, we must check if the directory + * is in the ail and use lock_nowait. If we can't lock, we need to + * drop the inode lock on the directory and try again. xfs_iunlock will + * potentially push the tail if we were holding up the log. + */ +STATIC int +xfs_lock_dir_and_entry( + xfs_inode_t *dp, + vname_t *dentry, + xfs_inode_t *ip) /* inode of entry 'name' */ +{ + int attempts; + xfs_ino_t e_inum; + xfs_inode_t *ips[2]; + xfs_log_item_t *lp; + +#ifdef DEBUG + xfs_rm_locks++; +#endif + attempts = 0; + +again: + xfs_ilock(dp, XFS_ILOCK_EXCL); + + e_inum = ip->i_ino; + + ITRACE(ip); + + /* + * We want to lock in increasing inum. Since we've already + * acquired the lock on the directory, we may need to release + * if if the inum of the entry turns out to be less. + */ + if (e_inum > dp->i_ino) { + /* + * We are already in the right order, so just + * lock on the inode of the entry. + * We need to use nowait if dp is in the AIL. + */ + + lp = (xfs_log_item_t *)dp->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + attempts++; +#ifdef DEBUG + xfs_rm_attempts++; +#endif + + /* + * Unlock dp and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ + + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_rm_lock_delays++; +#endif + } + goto again; + } + } else { + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + } else if (e_inum < dp->i_ino) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + ips[0] = ip; + ips[1] = dp; + xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + } + /* else e_inum == dp->i_ino */ + /* This can happen if we're asked to lock /x/.. + * the entry is "..", which is also the parent directory. + */ + + return 0; +} + +#ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + +/* + * The following routine will lock n inodes in exclusive mode. + * We assume the caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock + * is in the AIL and we start waiting for another inode that is locked + * by a thread in a long running transaction (such as truncate). This can + * result in deadlock since the long running trans might need to wait + * for the inode we just locked in order to push the tail and free space + * in the log. + */ +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + int first_locked, + uint lock_mode) +{ + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; + + ASSERT(ips && (inodes >= 2)); /* we need at least two */ + + if (first_locked) { + try_lock = 1; + i = 1; + } else { + try_lock = 0; + i = 0; + } + +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + + if (i && (ips[i] == ips[i-1])) /* Already locked */ + continue; + + /* + * If try_lock is not set yet, make sure all locked inodes + * are not in the AIL. + * If any are, set try_lock to be used later. + */ + + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + try_lock++; + } + } + } + + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ + + if (try_lock) { + /* try_lock must be 0 if i is 0. */ + /* + * try_lock means we have an inode locked + * that is in the AIL. + */ + ASSERT(i != 0); + if (!xfs_ilock_nowait(ips[i], lock_mode)) { + attempts++; + + /* + * Unlock all previous guys and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ + + for(j = i - 1; j >= 0; j--) { + + /* + * Check to see if we've already + * unlocked this one. + * Not the first one going back, + * and the inode ptr is the same. + */ + if ((j != (i - 1)) && ips[j] == + ips[j+1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; + } + } else { + xfs_ilock(ips[i], lock_mode); + } + } + +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif +} + +#ifdef DEBUG +#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);} +int remove_which_error_return = 0; +#else /* ! DEBUG */ +#define REMOVE_DEBUG_TRACE(x) +#endif /* ! DEBUG */ + + +/* + * xfs_remove + * + */ +STATIC int +xfs_remove( + bhv_desc_t *dir_bdp, + vname_t *dentry, + cred_t *credp) +{ + vnode_t *dir_vp; + char *name = VNAME(dentry); + xfs_inode_t *dp, *ip; + xfs_trans_t *tp = NULL; + xfs_mount_t *mp; + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int dm_di_mode = 0; + int link_zero; + uint resblks; + int namelen; + + dir_vp = BHV_TO_VNODE(dir_bdp); + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + dp = XFS_BHVTOI(dir_bdp); + mp = dp->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + namelen = VNAMELEN(dentry); + + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, + DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, + name, NULL, 0, 0, 0); + if (error) + return error; + } + + /* From this point on, return through std_return */ + ip = NULL; + + /* + * We need to get a reference to ip before we get our log + * reservation. The reason for this is that we cannot call + * xfs_iget for an inode for which we do not have a reference + * once we've acquired a log reservation. This is because the + * inode we are trying to get might be in xfs_inactive going + * for a log reservation. Since we'll have to wait for the + * inactive code to complete before returning from xfs_iget, + * we need to make sure that we don't have log space reserved + * when we call xfs_iget. Instead we get an unlocked referece + * to the inode before getting our log reservation. + */ + error = xfs_get_dir_entry(dentry, &ip); + if (error) { + REMOVE_DEBUG_TRACE(__LINE__); + goto std_return; + } + + dm_di_mode = ip->i_d.di_mode; + + vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); + + ITRACE(ip); + + error = XFS_QM_DQATTACH(mp, dp, 0); + if (!error && dp != ip) + error = XFS_QM_DQATTACH(mp, ip, 0); + if (error) { + REMOVE_DEBUG_TRACE(__LINE__); + IRELE(ip); + goto std_return; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); + } + if (error) { + ASSERT(error != ENOSPC); + REMOVE_DEBUG_TRACE(__LINE__); + xfs_trans_cancel(tp, 0); + IRELE(ip); + return error; + } + + error = xfs_lock_dir_and_entry(dp, dentry, ip); + if (error) { + REMOVE_DEBUG_TRACE(__LINE__); + xfs_trans_cancel(tp, cancel_flags); + IRELE(ip); + goto std_return; + } + + /* + * At this point, we've gotten both the directory and the entry + * inodes locked. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + if (dp != ip) { + /* + * Increment vnode ref count only in this case since + * there's an extra vnode reference in the case where + * dp == ip. + */ + IHOLD(dp); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + } + + /* + * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. + */ + XFS_BMAP_INIT(&free_list, &first_block); + error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino, + &first_block, &free_list, 0); + if (error) { + ASSERT(error != ENOENT); + REMOVE_DEBUG_TRACE(__LINE__); + goto error1; + } + xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + dp->i_gen++; + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + error = xfs_droplink(tp, ip); + if (error) { + REMOVE_DEBUG_TRACE(__LINE__); + goto error1; + } + + /* Determine if this is the last link while + * we are in the transaction. + */ + link_zero = (ip)->i_d.di_nlink==0; + + /* + * Take an extra ref on the inode so that it doesn't + * go to xfs_inactive() from within the commit. + */ + IHOLD(ip); + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + if (error) { + REMOVE_DEBUG_TRACE(__LINE__); + goto error_rele; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) { + IRELE(ip); + goto std_return; + } + + /* + * Before we drop our extra reference to the inode, purge it + * from the refcache if it is there. By waiting until afterwards + * to do the IRELE, we ensure that we won't go inactive in the + * xfs_refcache_purge_ip routine (although that would be OK). + */ + xfs_refcache_purge_ip(ip); + + vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); + + /* + * Let interposed file systems know about removed links. + */ + VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero); + + IRELE(ip); + +/* Fall through to std_return with error = 0 */ + std_return: + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, + DM_EVENT_POSTREMOVE)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, + dir_vp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, + name, NULL, dm_di_mode, error, 0); + } + return error; + + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + + error_rele: + /* + * In this case make sure to not release the inode until after + * the current transaction is aborted. Releasing it beforehand + * can cause us to go to xfs_inactive and start a recursive + * transaction which can easily deadlock with the current one. + */ + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + xfs_trans_cancel(tp, cancel_flags); + + /* + * Before we drop our extra reference to the inode, purge it + * from the refcache if it is there. By waiting until afterwards + * to do the IRELE, we ensure that we won't go inactive in the + * xfs_refcache_purge_ip routine (although that would be OK). + */ + xfs_refcache_purge_ip(ip); + + IRELE(ip); + + goto std_return; +} + + +/* + * xfs_link + * + */ +STATIC int +xfs_link( + bhv_desc_t *target_dir_bdp, + vnode_t *src_vp, + vname_t *dentry, + cred_t *credp) +{ + xfs_inode_t *tdp, *sip; + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_inode_t *ips[2]; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + vnode_t *target_dir_vp; + bhv_desc_t *src_bdp; + int resblks; + char *target_name = VNAME(dentry); + int target_namelen; + + target_dir_vp = BHV_TO_VNODE(target_dir_bdp); + vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address); + vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address); + + target_namelen = VNAMELEN(dentry); + if (src_vp->v_type == VDIR) + return XFS_ERROR(EPERM); + + /* + * For now, manually find the XFS behavior descriptor for + * the source vnode. If it doesn't exist then something + * is wrong and we should just return an error. + * Eventually we need to figure out how link is going to + * work in the face of stacked vnodes. + */ + src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops); + if (src_bdp == NULL) { + return XFS_ERROR(EXDEV); + } + sip = XFS_BHVTOI(src_bdp); + tdp = XFS_BHVTOI(target_dir_bdp); + mp = tdp->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, + target_dir_vp, DM_RIGHT_NULL, + src_vp, DM_RIGHT_NULL, + target_name, NULL, 0, 0, 0); + if (error) + return error; + } + + /* Return through std_return after this point. */ + + error = XFS_QM_DQATTACH(mp, sip, 0); + if (!error && sip != tdp) + error = XFS_QM_DQATTACH(mp, tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_namelen); + error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + if (sip->i_ino < tdp->i_ino) { + ips[0] = sip; + ips[1] = tdp; + } else { + ips[0] = tdp; + ips[1] = sip; + } + + xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + + /* + * Increment vnode ref counts since xfs_trans_commit & + * xfs_trans_cancel will both unlock the inodes and + * decrement the associated ref counts. + */ + VN_HOLD(src_vp); + VN_HOLD(target_dir_vp); + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If the source has too many links, we can't make any more to it. + */ + if (sip->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto error_return; + } + + if (resblks == 0 && + (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name, + target_namelen))) + goto error_return; + + XFS_BMAP_INIT(&free_list, &first_block); + + error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen, + sip->i_ino, &first_block, &free_list, + resblks); + if (error) + goto abort_return; + xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + tdp->i_gen++; + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) { + goto abort_return; + } + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) { + goto std_return; + } + + /* Fall through to std_return with error = 0. */ +std_return: + if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip, + DM_EVENT_POSTLINK)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, + target_dir_vp, DM_RIGHT_NULL, + src_vp, DM_RIGHT_NULL, + target_name, NULL, 0, error, 0); + } + return error; + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + /* FALLTHROUGH */ + error_return: + xfs_trans_cancel(tp, cancel_flags); + + goto std_return; +} +/* + * xfs_mkdir + * + */ +STATIC int +xfs_mkdir( + bhv_desc_t *dir_bdp, + vname_t *dentry, + vattr_t *vap, + vnode_t **vpp, + cred_t *credp) +{ + char *dir_name = VNAME(dentry); + xfs_inode_t *dp; + xfs_inode_t *cdp; /* inode of created dir */ + vnode_t *cvp; /* vnode of created dir */ + xfs_trans_t *tp; + xfs_mount_t *mp; + int cancel_flags; + int error; + int committed; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + vnode_t *dir_vp; + boolean_t dp_joined_to_trans; + boolean_t created = B_FALSE; + int dm_event_sent = 0; + xfs_prid_t prid; + struct xfs_dquot *udqp, *gdqp; + uint resblks; + int dm_di_mode; + int dir_namelen; + + dir_vp = BHV_TO_VNODE(dir_bdp); + dp = XFS_BHVTOI(dir_bdp); + mp = dp->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + dir_namelen = VNAMELEN(dentry); + + tp = NULL; + dp_joined_to_trans = B_FALSE; + dm_di_mode = vap->va_mode|VTTOIF(vap->va_type); + + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, + dir_vp, DM_RIGHT_NULL, NULL, + DM_RIGHT_NULL, dir_name, NULL, + dm_di_mode, 0, 0); + if (error) + return error; + dm_event_sent = 1; + } + + /* Return through std_return after this point. */ + + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + mp = dp->i_mount; + udqp = gdqp = NULL; + if (vap->va_mask & XFS_AT_PROJID) + prid = (xfs_prid_t)vap->va_projid; + else + prid = (xfs_prid_t)dfltprid; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = XFS_QM_DQVOPALLOC(mp, dp, + current_fsuid(credp), current_fsgid(credp), + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); + error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_MKDIR_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + dp = NULL; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * Check for directory link count overflow. + */ + if (dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto error_return; + } + + /* + * Reserve disk quota and the inode. + */ + error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + if (resblks == 0 && + (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen))) + goto error_return; + /* + * create the directory inode. + */ + error = xfs_dir_ialloc(&tp, dp, + MAKEIMODE(vap->va_type,vap->va_mode), 2, + 0, credp, prid, resblks > 0, + &cdp, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto abort_return; + } + ITRACE(cdp); + + /* + * Now we add the directory inode to the transaction. + * We waited until now since xfs_dir_ialloc might start + * a new transaction. Had we joined the transaction + * earlier, the locks might have gotten released. + */ + VN_HOLD(dir_vp); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + dp_joined_to_trans = B_TRUE; + + XFS_BMAP_INIT(&free_list, &first_block); + + error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen, + cdp->i_ino, &first_block, &free_list, + resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != ENOSPC); + goto error1; + } + xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Bump the in memory version number of the parent directory + * so that other processes accessing it will recognize that + * the directory has changed. + */ + dp->i_gen++; + + error = XFS_DIR_INIT(mp, tp, cdp, dp); + if (error) { + goto error2; + } + + cdp->i_gen = 1; + error = xfs_bumplink(tp, dp); + if (error) { + goto error2; + } + + cvp = XFS_ITOV(cdp); + + created = B_TRUE; + + *vpp = cvp; + IHOLD(cdp); + + /* + * Attach the dquots to the new inode and modify the icount incore. + */ + XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp); + + /* + * If this is a synchronous mount, make sure that the + * mkdir transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + if (error) { + IRELE(cdp); + goto error2; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + if (error) { + IRELE(cdp); + } + + /* Fall through to std_return with error = 0 or errno from + * xfs_trans_commit. */ + +std_return: + if ( (created || (error != 0 && dm_event_sent != 0)) && + DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), + DM_EVENT_POSTCREATE)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, + dir_vp, DM_RIGHT_NULL, + created ? XFS_ITOV(cdp):NULL, + DM_RIGHT_NULL, + dir_name, NULL, + dm_di_mode, error, 0); + } + return error; + + error2: + error1: + xfs_bmap_cancel(&free_list); + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + if (!dp_joined_to_trans && (dp != NULL)) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + } + + goto std_return; +} + + +/* + * xfs_rmdir + * + */ +STATIC int +xfs_rmdir( + bhv_desc_t *dir_bdp, + vname_t *dentry, + cred_t *credp) +{ + char *name = VNAME(dentry); + xfs_inode_t *dp; + xfs_inode_t *cdp; /* child directory */ + xfs_trans_t *tp; + xfs_mount_t *mp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + vnode_t *dir_vp; + int dm_di_mode = 0; + int last_cdp_link; + int namelen; + uint resblks; + + dir_vp = BHV_TO_VNODE(dir_bdp); + dp = XFS_BHVTOI(dir_bdp); + mp = dp->i_mount; + + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount)) + return XFS_ERROR(EIO); + namelen = VNAMELEN(dentry); + + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, + dir_vp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, + name, NULL, 0, 0, 0); + if (error) + return XFS_ERROR(error); + } + + /* Return through std_return after this point. */ + + cdp = NULL; + + /* + * We need to get a reference to cdp before we get our log + * reservation. The reason for this is that we cannot call + * xfs_iget for an inode for which we do not have a reference + * once we've acquired a log reservation. This is because the + * inode we are trying to get might be in xfs_inactive going + * for a log reservation. Since we'll have to wait for the + * inactive code to complete before returning from xfs_iget, + * we need to make sure that we don't have log space reserved + * when we call xfs_iget. Instead we get an unlocked referece + * to the inode before getting our log reservation. + */ + error = xfs_get_dir_entry(dentry, &cdp); + if (error) { + REMOVE_DEBUG_TRACE(__LINE__); + goto std_return; + } + mp = dp->i_mount; + dm_di_mode = cdp->i_d.di_mode; + + /* + * Get the dquots for the inodes. + */ + error = XFS_QM_DQATTACH(mp, dp, 0); + if (!error && dp != cdp) + error = XFS_QM_DQATTACH(mp, cdp, 0); + if (error) { + IRELE(cdp); + REMOVE_DEBUG_TRACE(__LINE__); + goto std_return; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); + } + if (error) { + ASSERT(error != ENOSPC); + cancel_flags = 0; + IRELE(cdp); + goto error_return; + } + XFS_BMAP_INIT(&free_list, &first_block); + + /* + * Now lock the child directory inode and the parent directory + * inode in the proper order. This will take care of validating + * that the directory entry for the child directory inode has + * not changed while we were obtaining a log reservation. + */ + error = xfs_lock_dir_and_entry(dp, dentry, cdp); + if (error) { + xfs_trans_cancel(tp, cancel_flags); + IRELE(cdp); + goto std_return; + } + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + if (dp != cdp) { + /* + * Only increment the parent directory vnode count if + * we didn't bump it in looking up cdp. The only time + * we don't bump it is when we're looking up ".". + */ + VN_HOLD(dir_vp); + } + + ITRACE(cdp); + xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); + + ASSERT(cdp->i_d.di_nlink >= 2); + if (cdp->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto error_return; + } + if (!XFS_DIR_ISEMPTY(mp, cdp)) { + error = XFS_ERROR(ENOTEMPTY); + goto error_return; + } + + error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino, + &first_block, &free_list, resblks); + if (error) { + goto error1; + } + + xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Bump the in memory generation count on the parent + * directory so that other can know that it has changed. + */ + dp->i_gen++; + + /* + * Drop the link from cdp's "..". + */ + error = xfs_droplink(tp, dp); + if (error) { + goto error1; + } + + /* + * Drop the link from dp to cdp. + */ + error = xfs_droplink(tp, cdp); + if (error) { + goto error1; + } + + /* + * Drop the "." link from cdp to self. + */ + error = xfs_droplink(tp, cdp); + if (error) { + goto error1; + } + + /* Determine these before committing transaction */ + last_cdp_link = (cdp)->i_d.di_nlink==0; + + /* + * Take an extra ref on the child vnode so that it + * does not go to xfs_inactive() from within the commit. + */ + IHOLD(cdp); + + /* + * If this is a synchronous mount, make sure that the + * rmdir transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + IRELE(cdp); + goto std_return; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + if (error) { + IRELE(cdp); + goto std_return; + } + + + /* + * Let interposed file systems know about removed links. + */ + VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link); + + IRELE(cdp); + + /* Fall through to std_return with error = 0 or the errno + * from xfs_trans_commit. */ +std_return: + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, + dir_vp, DM_RIGHT_NULL, + NULL, DM_RIGHT_NULL, + name, NULL, dm_di_mode, + error, 0); + } + return error; + + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + goto std_return; +} + + +/* + * xfs_readdir + * + * Read dp's entries starting at uiop->uio_offset and translate them into + * bufsize bytes worth of struct dirents starting at bufbase. + */ +STATIC int +xfs_readdir( + bhv_desc_t *dir_bdp, + uio_t *uiop, + cred_t *credp, + int *eofp) +{ + xfs_inode_t *dp; + xfs_trans_t *tp = NULL; + int error = 0; + uint lock_mode; + xfs_off_t start_offset; + + vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__, + (inst_t *)__return_address); + dp = XFS_BHVTOI(dir_bdp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) { + return XFS_ERROR(EIO); + } + + lock_mode = xfs_ilock_map_shared(dp); + start_offset = uiop->uio_offset; + error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp); + if (start_offset != uiop->uio_offset) { + xfs_ichgtime(dp, XFS_ICHGTIME_ACC); + } + xfs_iunlock_map_shared(dp, lock_mode); + return error; +} + + +/* + * xfs_symlink + * + */ +STATIC int +xfs_symlink( + bhv_desc_t *dir_bdp, + vname_t *dentry, + vattr_t *vap, + char *target_path, + vnode_t **vpp, + cred_t *credp) +{ + xfs_trans_t *tp; + xfs_mount_t *mp; + xfs_inode_t *dp; + xfs_inode_t *ip; + int error; + int pathlen; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + boolean_t dp_joined_to_trans; + vnode_t *dir_vp; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + xfs_bmbt_irec_t mval[SYMLINK_MAPS]; + xfs_daddr_t d; + char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + xfs_prid_t prid; + struct xfs_dquot *udqp, *gdqp; + uint resblks; + char *link_name = VNAME(dentry); + int link_namelen; + + *vpp = NULL; + dir_vp = BHV_TO_VNODE(dir_bdp); + dp = XFS_BHVTOI(dir_bdp); + dp_joined_to_trans = B_FALSE; + error = 0; + ip = NULL; + tp = NULL; + + vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); + + mp = dp->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + link_namelen = VNAMELEN(dentry); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + if (pathlen >= MAXNAMELEN) { /* is any component too long? */ + int len, total; + char *path; + + for(total = 0, path = target_path; total < pathlen;) { + /* + * Skip any slashes. + */ + while(*path == '/') { + total++; + path++; + } + + /* + * Count up to the next slash or end of path. + * Error out if the component is bigger than MAXNAMELEN. + */ + for(len = 0; *path != '/' && total < pathlen;total++, path++) { + if (++len >= MAXNAMELEN) { + error = ENAMETOOLONG; + return error; + } + } + } + } + + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) { + error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, + DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, + link_name, target_path, 0, 0, 0); + if (error) + return error; + } + + /* Return through std_return after this point. */ + + udqp = gdqp = NULL; + if (vap->va_mask & XFS_AT_PROJID) + prid = (xfs_prid_t)vap->va_projid; + else + prid = (xfs_prid_t)dfltprid; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = XFS_QM_DQVOPALLOC(mp, dp, + current_fsuid(credp), current_fsgid(credp), + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp)) + fs_blocks = 0; + else + fs_blocks = XFS_B_TO_FSB(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); + error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + dp = NULL; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + if (resblks == 0 && + (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen))) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + XFS_BMAP_INIT(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT), + 1, 0, credp, prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + ITRACE(ip); + + VN_HOLD(dir_vp); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + dp_joined_to_trans = B_TRUE; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + first_fsb = 0; + nmaps = SYMLINK_MAPS; + + error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, + &first_block, resblks, mval, &nmaps, + &free_list); + if (error) { + goto error1; + } + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + ASSERT(bp && !XFS_BUF_GETERROR(bp)); + if (pathlen < byte_cnt) { + byte_cnt = pathlen; + } + pathlen -= byte_cnt; + + memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); + cur_chunk += byte_cnt; + + xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); + } + } + + /* + * Create the directory entry for the symlink. + */ + error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen, + ip->i_ino, &first_block, &free_list, resblks); + if (error) { + goto error1; + } + xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * Bump the in memory version number of the parent directory + * so that other processes accessing it will recognize that + * the directory has changed. + */ + dp->i_gen++; + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + /* + * xfs_trans_commit normally decrements the vnode ref count + * when it unlocks the inode. Since we want to return the + * vnode to the caller, we bump the vnode ref count now. + */ + IHOLD(ip); + + error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + /* Fall through to std_return with error = 0 or errno from + * xfs_trans_commit */ +std_return: + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), + DM_EVENT_POSTSYMLINK)) { + (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, + dir_vp, DM_RIGHT_NULL, + error ? NULL : XFS_ITOV(ip), + DM_RIGHT_NULL, link_name, target_path, + 0, error, 0); + } + + if (!error) { + vnode_t *vp; + + ASSERT(ip); + vp = XFS_ITOV(ip); + *vpp = vp; + } + return error; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + XFS_QM_DQRELE(mp, udqp); + XFS_QM_DQRELE(mp, gdqp); + + if (!dp_joined_to_trans && (dp != NULL)) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + } + + goto std_return; +} + + +/* + * xfs_fid2 + * + * A fid routine that takes a pointer to a previously allocated + * fid structure (like xfs_fast_fid) but uses a 64 bit inode number. + */ +STATIC int +xfs_fid2( + bhv_desc_t *bdp, + fid_t *fidp) +{ + xfs_inode_t *ip; + xfs_fid2_t *xfid; + + vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, + (inst_t *)__return_address); + ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t)); + + xfid = (xfs_fid2_t *)fidp; + ip = XFS_BHVTOI(bdp); + xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len); + xfid->fid_pad = 0; + /* + * use memcpy because the inode is a long long and there's no + * assurance that xfid->fid_ino is properly aligned. + */ + memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino)); + xfid->fid_gen = ip->i_d.di_gen; + + return 0; +} + + +/* + * xfs_rwlock + */ +int +xfs_rwlock( + bhv_desc_t *bdp, + vrwlock_t locktype) +{ + xfs_inode_t *ip; + vnode_t *vp; + + vp = BHV_TO_VNODE(bdp); + if (vp->v_type == VDIR) + return 1; + ip = XFS_BHVTOI(bdp); + if (locktype == VRWLOCK_WRITE) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + } else if (locktype == VRWLOCK_TRY_READ) { + return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)); + } else if (locktype == VRWLOCK_TRY_WRITE) { + return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)); + } else { + ASSERT((locktype == VRWLOCK_READ) || + (locktype == VRWLOCK_WRITE_DIRECT)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } + + return 1; +} + + +/* + * xfs_rwunlock + */ +void +xfs_rwunlock( + bhv_desc_t *bdp, + vrwlock_t locktype) +{ + xfs_inode_t *ip; + vnode_t *vp; + + vp = BHV_TO_VNODE(bdp); + if (vp->v_type == VDIR) + return; + ip = XFS_BHVTOI(bdp); + if (locktype == VRWLOCK_WRITE) { + /* + * In the write case, we may have added a new entry to + * the reference cache. This might store a pointer to + * an inode to be released in this inode. If it is there, + * clear the pointer and release the inode after unlocking + * this one. + */ + xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); + } else { + ASSERT((locktype == VRWLOCK_READ) || + (locktype == VRWLOCK_WRITE_DIRECT)); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + } + return; +} + +STATIC int +xfs_inode_flush( + bhv_desc_t *bdp, + int flags) +{ + xfs_inode_t *ip; + xfs_mount_t *mp; + xfs_inode_log_item_t *iip; + int error = 0; + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + iip = ip->i_itemp; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Bypass inodes which have already been cleaned by + * the inode flush clustering code inside xfs_iflush + */ + if ((ip->i_update_core == 0) && + ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) + return 0; + + if (flags & FLUSH_LOG) { + if (iip && iip->ili_last_lsn) { + xlog_t *log = mp->m_log; + xfs_lsn_t sync_lsn; + int s, log_flags = XFS_LOG_FORCE; + + s = GRANT_LOCK(log); + sync_lsn = log->l_last_sync_lsn; + GRANT_UNLOCK(log, s); + + if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0)) + return 0; + + if (flags & FLUSH_SYNC) + log_flags |= XFS_LOG_SYNC; + return xfs_log_force(mp, iip->ili_last_lsn, log_flags); + } + } + + /* + * We make this non-blocking if the inode is contended, + * return EAGAIN to indicate to the caller that they + * did not succeed. This prevents the flush path from + * blocking on inodes inside another operation right + * now, they get caught later by xfs_sync. + */ + if (flags & FLUSH_INODE) { + int flush_flags; + + if (xfs_ipincount(ip)) + return EAGAIN; + + if (flags & FLUSH_SYNC) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_iflock(ip); + } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return EAGAIN; + } + } else { + return EAGAIN; + } + + if (flags & FLUSH_SYNC) + flush_flags = XFS_IFLUSH_SYNC; + else + flush_flags = XFS_IFLUSH_ASYNC; + + error = xfs_iflush(ip, flush_flags); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + } + + return error; +} + + +int +xfs_set_dmattrs ( + bhv_desc_t *bdp, + u_int evmask, + u_int16_t state, + cred_t *credp) +{ + xfs_inode_t *ip; + xfs_trans_t *tp; + xfs_mount_t *mp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask; + ip->i_iocore.io_dmstate = ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + IHOLD(ip); + error = xfs_trans_commit(tp, 0, NULL); + + return error; +} + + +/* + * xfs_reclaim + */ +STATIC int +xfs_reclaim( + bhv_desc_t *bdp) +{ + xfs_inode_t *ip; + vnode_t *vp; + + vp = BHV_TO_VNODE(bdp); + ip = XFS_BHVTOI(bdp); + + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + + ASSERT(!VN_MAPPED(vp)); + + /* bad inode, get out here ASAP */ + if (VN_BAD(vp)) { + xfs_ireclaim(ip); + return 0; + } + + if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (ip->i_d.di_size > 0) { + /* + * Flush and invalidate any data left around that is + * a part of this file. + * + * Get the inode's i/o lock so that buffers are pushed + * out while holding the proper lock. We can't hold + * the inode lock here since flushing out buffers may + * cause us to try to get the lock in xfs_strategy(). + * + * We don't have to call remapf() here, because there + * cannot be any mapped file references to this vnode + * since it is being reclaimed. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + /* + * If we hit an IO error, we need to make sure that the + * buffer and page caches of file data for + * the file are tossed away. We don't want to use + * VOP_FLUSHINVAL_PAGES here because we don't want dirty + * pages to stay attached to the vnode, but be + * marked P_BAD. pdflush/vnode_pagebad + * hates that. + */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE); + } else { + VOP_TOSS_PAGES(vp, 0, -1, FI_NONE); + } + + ASSERT(VN_CACHED(vp) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || + ip->i_delayed_blks == 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + /* + * di_size field may not be quite accurate if we're + * shutting down. + */ + VOP_TOSS_PAGES(vp, 0, -1, FI_NONE); + ASSERT(VN_CACHED(vp) == 0); + } + } + + /* If we have nothing to flush with this inode then complete the + * teardown now, otherwise break the link between the xfs inode + * and the linux inode and clean up the xfs inode later. This + * avoids flushing the inode to disk during the delete operation + * itself. + */ + if (!ip->i_update_core && (ip->i_itemp == NULL)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); + } else { + xfs_mount_t *mp = ip->i_mount; + + /* Protect sync from us */ + XFS_MOUNT_ILOCK(mp); + vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); + list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); + ip->i_flags |= XFS_IRECLAIMABLE; + XFS_MOUNT_IUNLOCK(mp); + } + return 0; +} + +int +xfs_finish_reclaim( + xfs_inode_t *ip, + int locked, + int sync_mode) +{ + xfs_ihash_t *ih = ip->i_hash; + vnode_t *vp = XFS_ITOV_NULL(ip); + int error; + + if (vp && VN_BAD(vp)) + goto reclaim; + + /* The hash lock here protects a thread in xfs_iget_core from + * racing with us on linking the inode back with a vnode. + * Once we have the XFS_IRECLAIM flag set it will not touch + * us. + */ + write_lock(&ih->ih_lock); + if ((ip->i_flags & XFS_IRECLAIM) || + (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) { + write_unlock(&ih->ih_lock); + if (locked) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + return(1); + } + ip->i_flags |= XFS_IRECLAIM; + write_unlock(&ih->ih_lock); + + /* + * If the inode is still dirty, then flush it out. If the inode + * is not in the AIL, then it will be OK to flush it delwri as + * long as xfs_iflush() does not keep any references to the inode. + * We leave that decision up to xfs_iflush() since it has the + * knowledge of whether it's OK to simply do a delwri flush of + * the inode or whether we need to wait until the inode is + * pulled from the AIL. + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (!locked) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + } + + if (ip->i_update_core || + ((ip->i_itemp != NULL) && + (ip->i_itemp->ili_format.ilf_fields != 0))) { + error = xfs_iflush(ip, sync_mode); + /* + * If we hit an error, typically because of filesystem + * shutdown, we don't need to let vn_reclaim to know + * because we're gonna reclaim the inode anyway. + */ + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto reclaim; + } + xfs_iflock(ip); /* synchronize with xfs_iflush_done */ + } + + ASSERT(ip->i_update_core == 0); + ASSERT(ip->i_itemp == NULL || + ip->i_itemp->ili_format.ilf_fields == 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else if (locked) { + /* + * We are not interested in doing an iflush if we're + * in the process of shutting down the filesystem forcibly. + * So, just reclaim the inode. + */ + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + reclaim: + xfs_ireclaim(ip); + return 0; +} + +int +xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock) +{ + int purged; + xfs_inode_t *ip, *n; + int done = 0; + + while (!done) { + purged = 0; + XFS_MOUNT_ILOCK(mp); + list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) { + if (noblock) { + if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) + continue; + if (xfs_ipincount(ip) || + !xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + } + XFS_MOUNT_IUNLOCK(mp); + xfs_finish_reclaim(ip, noblock, + XFS_IFLUSH_DELWRI_ELSE_ASYNC); + purged = 1; + break; + } + + done = !purged; + } + + XFS_MOUNT_IUNLOCK(mp); + return 0; +} + +/* + * xfs_alloc_file_space() + * This routine allocates disk space for the given file. + * + * If alloc_type == 0, this request is for an ALLOCSP type + * request which will change the file size. In this case, no + * DMAPI event will be generated by the call. A TRUNCATE event + * will be generated later by xfs_setattr. + * + * If alloc_type != 0, this request is for a RESVSP type + * request, and a DMAPI DM_EVENT_WRITE will be generated if the + * lower block boundary byte address is less than the file's + * length. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_alloc_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type, + int attr_flags) +{ + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + int committed; + xfs_off_t count; + xfs_filblks_t datablocks; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t *imapp; + xfs_bmbt_irec_t imaps[1]; + xfs_mount_t *mp; + int numrtextents; + int reccount; + uint resblks; + int rt; + int rtextsize; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + int xfs_bmapi_flags; + + vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); + mp = ip->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * determine if this is a realtime file + */ + if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) { + if (ip->i_d.di_extsize) + rtextsize = ip->i_d.di_extsize; + else + rtextsize = mp->m_sb.sb_rextsize; + } else + rtextsize = 0; + + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + return error; + + if (len <= 0) + return XFS_ERROR(EINVAL); + + count = len; + error = 0; + imapp = &imaps[0]; + reccount = 1; + xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* Generate a DMAPI event if needed. */ + if (alloc_type != 0 && offset < ip->i_d.di_size && + (attr_flags&ATTR_DMI) == 0 && + DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { + xfs_off_t end_dmi_offset; + + end_dmi_offset = offset+len; + if (end_dmi_offset > ip->i_d.di_size) + end_dmi_offset = ip->i_d.di_size; + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), + offset, end_dmi_offset - offset, + 0, NULL); + if (error) + return(error); + } + + /* + * allocate file space until done or until there is an error + */ +retry: + while (allocatesize_fsb && !error) { + /* + * determine if reserving space on + * the data or realtime partition. + */ + if (rt) { + xfs_fileoff_t s, e; + + s = startoffset_fsb; + do_div(s, rtextsize); + s *= rtextsize; + e = roundup_64(startoffset_fsb + allocatesize_fsb, + rtextsize); + numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize; + datablocks = 0; + } else { + datablocks = allocatesize_fsb; + numrtextents = 0; + } + + /* + * allocate and setup the transaction + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); + error = xfs_trans_reserve(tp, + resblks, + XFS_WRITE_LOG_RES(mp), + numrtextents, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, + ip->i_udquot, ip->i_gdquot, resblks, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + /* + * issue the bmapi() call to allocate the blocks + */ + XFS_BMAP_INIT(&free_list, &firstfsb); + error = xfs_bmapi(tp, ip, startoffset_fsb, + allocatesize_fsb, xfs_bmapi_flags, + &firstfsb, 0, imapp, &reccount, + &free_list); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (reccount == 0) { + error = XFS_ERROR(ENOSPC); + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } +dmapi_enospc_check: + if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 && + DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) { + + error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, + XFS_ITOV(ip), DM_RIGHT_NULL, + XFS_ITOV(ip), DM_RIGHT_NULL, + NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ + if (error == 0) + goto retry; /* Maybe DMAPI app. has made space */ + /* else fall through with error from XFS_SEND_DATA */ + } + + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto dmapi_enospc_check; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, + ip->i_d.di_flags & XFS_DIFLAG_REALTIME ? + mp->m_rtdev_targp : mp->m_ddev_targp); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap, + &nimap, NULL); + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + XFS_BUF_UNDONE(bp); + XFS_BUF_UNWRITE(bp); + XFS_BUF_READ(bp); + XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); + xfsbdstrat(mp, bp); + if ((error = xfs_iowait(bp))) { + xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", + mp, bp, XFS_BUF_ADDR(bp)); + break; + } + memset(XFS_BUF_PTR(bp) + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + XFS_BUF_UNDONE(bp); + XFS_BUF_UNREAD(bp); + XFS_BUF_WRITE(bp); + xfsbdstrat(mp, bp); + if ((error = xfs_iowait(bp))) { + xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", + mp, bp, XFS_BUF_ADDR(bp)); + break; + } + } + xfs_buf_free(bp); + return error; +} + +/* + * xfs_free_file_space() + * This routine frees disk space for the given file. + * + * This routine is only called by xfs_change_file_space + * for an UNRESVSP type call. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_free_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + int committed; + int done; + xfs_off_t end_dmi_offset; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_off_t ilen; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + int rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + int need_iolock = (attr_flags & ATTR_DMI) == 0; + + vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); + mp = ip->i_mount; + + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + end_dmi_offset = offset + len; + endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); + + if (offset < ip->i_d.di_size && + (attr_flags & ATTR_DMI) == 0 && + DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { + if (end_dmi_offset > ip->i_d.di_size) + end_dmi_offset = ip->i_d.di_size; + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), + offset, end_dmi_offset - offset, + AT_DELAY_FLAG(attr_flags), NULL); + if (error) + return(error); + } + + if (need_iolock) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog), + (__uint8_t)NBPP); + ilen = len + (offset & (rounding - 1)); + ioffset = offset & ~(rounding - 1); + if (ilen & (rounding - 1)) + ilen = (ilen + rounding) & ~(rounding - 1); + xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0); + /* + * Need to zero the stuff we're not freeing, on disk. + * If its a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. + */ + if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0, + &imap, &nimap, NULL); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0, + &imap, &nimap, NULL); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, + resblks, + XFS_WRITE_LOG_RES(mp), + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = XFS_TRANS_RESERVE_QUOTA(mp, tp, + ip->i_udquot, ip->i_gdquot, resblks, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + /* + * issue the bunmapi() call to free the blocks + */ + XFS_BMAP_INIT(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out_unlock_iolock: + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : + XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_change_file_space() + * This routine allocates or frees disk space for the given file. + * The user specified parameters are checked for alignment and size + * limitations. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_change_file_space( + bhv_desc_t *bdp, + int cmd, + xfs_flock64_t *bf, + xfs_off_t offset, + cred_t *credp, + int attr_flags) +{ + int clrprealloc; + int error; + xfs_fsize_t fsize; + xfs_inode_t *ip; + xfs_mount_t *mp; + int setprealloc; + xfs_off_t startoffset; + xfs_off_t llen; + xfs_trans_t *tp; + vattr_t va; + vnode_t *vp; + + vp = BHV_TO_VNODE(bdp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + /* + * must be a regular file and have write permission + */ + if (vp->v_type != VREG) + return XFS_ERROR(EINVAL); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + if ((error = xfs_iaccess(ip, S_IWUSR, credp))) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; + } + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += offset; + break; + case 2: /*SEEK_END*/ + bf->l_start += ip->i_d.di_size; + break; + default: + return XFS_ERROR(EINVAL); + } + + llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + + if ( (bf->l_start < 0) + || (bf->l_start > XFS_MAXIOFFSET(mp)) + || (bf->l_start + llen < 0) + || (bf->l_start + llen > XFS_MAXIOFFSET(mp))) + return XFS_ERROR(EINVAL); + + bf->l_whence = 0; + + startoffset = bf->l_start; + fsize = ip->i_d.di_size; + + /* + * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve + * file space. + * These calls do NOT zero the data space allocated to the file, + * nor do they change the file size. + * + * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file + * space. + * These calls cause the new file data to be zeroed and the file + * size to be changed. + */ + setprealloc = clrprealloc = 0; + + switch (cmd) { + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + error = xfs_alloc_file_space(ip, startoffset, bf->l_len, + 1, attr_flags); + if (error) + return error; + setprealloc = 1; + break; + + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if ((error = xfs_free_file_space(ip, startoffset, bf->l_len, + attr_flags))) + return error; + break; + + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + if (startoffset > fsize) { + error = xfs_alloc_file_space(ip, fsize, + startoffset - fsize, 0, attr_flags); + if (error) + break; + } + + va.va_mask = XFS_AT_SIZE; + va.va_size = startoffset; + + error = xfs_setattr(bdp, &va, attr_flags, credp); + + if (error) + return error; + + clrprealloc = 1; + break; + + default: + ASSERT(0); + return XFS_ERROR(EINVAL); + } + + /* + * update the inode timestamp, mode, and prealloc flag bits + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + + if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), + 0, 0, 0))) { + /* ASSERT(0); */ + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + + if ((attr_flags & ATTR_DMI) == 0) { + ip->i_d.di_mode &= ~S_ISUID; + + /* + * Note that we don't have to worry about mandatory + * file locking being disabled here because we only + * clear the S_ISGID bit if the Group execute bit is + * on, but if it was on then mandatory locking wouldn't + * have been enabled. + */ + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + + xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + if (setprealloc) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + else if (clrprealloc) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0, NULL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +vnodeops_t xfs_vnodeops = { + BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS), + .vop_open = xfs_open, + .vop_read = xfs_read, +#ifdef HAVE_SENDFILE + .vop_sendfile = xfs_sendfile, +#endif + .vop_write = xfs_write, + .vop_ioctl = xfs_ioctl, + .vop_getattr = xfs_getattr, + .vop_setattr = xfs_setattr, + .vop_access = xfs_access, + .vop_lookup = xfs_lookup, + .vop_create = xfs_create, + .vop_remove = xfs_remove, + .vop_link = xfs_link, + .vop_rename = xfs_rename, + .vop_mkdir = xfs_mkdir, + .vop_rmdir = xfs_rmdir, + .vop_readdir = xfs_readdir, + .vop_symlink = xfs_symlink, + .vop_readlink = xfs_readlink, + .vop_fsync = xfs_fsync, + .vop_inactive = xfs_inactive, + .vop_fid2 = xfs_fid2, + .vop_rwlock = xfs_rwlock, + .vop_rwunlock = xfs_rwunlock, + .vop_bmap = xfs_bmap, + .vop_reclaim = xfs_reclaim, + .vop_attr_get = xfs_attr_get, + .vop_attr_set = xfs_attr_set, + .vop_attr_remove = xfs_attr_remove, + .vop_attr_list = xfs_attr_list, + .vop_link_removed = (vop_link_removed_t)fs_noval, + .vop_vnode_change = (vop_vnode_change_t)fs_noval, + .vop_tosspages = fs_tosspages, + .vop_flushinval_pages = fs_flushinval_pages, + .vop_flush_pages = fs_flush_pages, + .vop_release = xfs_release, + .vop_iflush = xfs_inode_flush, +}; -- cgit