From 32ce90a4b79155a155de2b284d8b69023e5e8fea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Apr 2012 15:58:32 +1000 Subject: xfs: remove log item from AIL in xfs_iflush after a shutdown If a filesystem has been forced shutdown we are never going to write inodes to disk, which means the inode items will stay in the AIL until we free the inode. Currently that is not a problem, but a pending change requires us to empty the AIL before shutting down the filesystem. In that case leaving the inode in the AIL is lethal. Make sure to remove the log item from the AIL to allow emptying the AIL on shutdown filesystems. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bc46c0a133d3..00f9c2f34e1f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2397,7 +2397,6 @@ xfs_iflush( xfs_inode_t *ip, uint flags) { - xfs_inode_log_item_t *iip; xfs_buf_t *bp; xfs_dinode_t *dip; xfs_mount_t *mp; @@ -2410,7 +2409,6 @@ xfs_iflush( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - iip = ip->i_itemp; mp = ip->i_mount; /* @@ -2447,13 +2445,14 @@ xfs_iflush( /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { - if (iip) - iip->ili_fields = 0; - xfs_ifunlock(ip); - return XFS_ERROR(EIO); + error = XFS_ERROR(EIO); + goto abort_out; } /* @@ -2500,11 +2499,13 @@ corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: + error = XFS_ERROR(EFSCORRUPTED); +abort_out: /* * Unlocks the flush lock */ xfs_iflush_abort(ip); - return XFS_ERROR(EFSCORRUPTED); + return error; } -- cgit From 4c46819a8097a75d3b378c5e56d2bcf47bb7408d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Apr 2012 15:58:36 +1000 Subject: xfs: do not write the buffer from xfs_iflush Instead of writing the buffer directly from inside xfs_iflush return it to the caller and let the caller decide what to do with the buffer. Also remove the pincount check in xfs_iflush that all non-blocking callers already implement and the now unused flags parameter. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 54 ++++++++++++++++-------------------------------------- 1 file changed, 16 insertions(+), 38 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 00f9c2f34e1f..0fa987dea242 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2384,22 +2384,22 @@ cluster_corrupt_out: } /* - * xfs_iflush() will write a modified inode's changes out to the - * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush completion must be - * active as well. The inode lock will still be held upon return from - * the call and the caller is free to unlock it. - * The inode flush will be completed when the inode reaches the disk. - * The flags indicate how the inode's buffer should be written out. + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. */ int xfs_iflush( - xfs_inode_t *ip, - uint flags) + struct xfs_inode *ip, + struct xfs_buf **bpp) { - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; int error; XFS_STATS_INC(xs_iflush_count); @@ -2409,24 +2409,8 @@ xfs_iflush( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - mp = ip->i_mount; + *bpp = NULL; - /* - * We can't flush the inode until it is unpinned, so wait for it if we - * are allowed to block. We know no one new can pin it, because we are - * holding the inode lock shared and you need to hold it exclusively to - * pin the inode. - * - * If we are not allowed to block, force the log out asynchronously so - * that when we come back the inode will be unpinned. If other inodes - * in the same cluster are dirty, they will probably write the inode - * out for us if they occur after the log force completes. - */ - if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin(ip); - xfs_ifunlock(ip); - return EAGAIN; - } xfs_iunpin_wait(ip); /* @@ -2458,8 +2442,7 @@ xfs_iflush( /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, - (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2487,13 +2470,8 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - return error; + *bpp = bp; + return 0; corrupt_out: xfs_buf_relse(bp); -- cgit From 43ff2122e6492bcc88b065c433453dce88223b30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Apr 2012 15:58:39 +1000 Subject: xfs: on-stack delayed write buffer lists Queue delwri buffers on a local on-stack list instead of a per-buftarg one, and write back the buffers per-process instead of by waking up xfsbufd. This is now easily doable given that we have very few places left that write delwri buffers: - log recovery: Only done at mount time, and already forcing out the buffers synchronously using xfs_flush_buftarg - quotacheck: Same story. - dquot reclaim: Writes out dirty dquots on the LRU under memory pressure. We might want to look into doing more of this via xfsaild, but it's already more optimal than the synchronous inode reclaim that writes each buffer synchronously. - xfsaild: This is the main beneficiary of the change. By keeping a local list of buffers to write we reduce latency of writing out buffers, and more importably we can remove all the delwri list promotions which were hitting the buffer cache hard under sustained metadata loads. The implementation is very straight forward - xfs_buf_delwri_queue now gets a new list_head pointer that it adds the delwri buffers to, and all callers need to eventually submit the list using xfs_buf_delwi_submit or xfs_buf_delwi_submit_nowait. Buffers that already are on a delwri list are skipped in xfs_buf_delwri_queue, assuming they already are on another delwri list. The biggest change to pass down the buffer list was done to the AIL pushing. Now that we operate on buffers the trylock, push and pushbuf log item methods are merged into a single push routine, which tries to lock the item, and if possible add the buffer that needs writeback to the buffer list. This leads to much simpler code than the previous split but requires the individual IOP_PUSH instances to unlock and reacquire the AIL around calls to blocking routines. Given that xfsailds now also handle writing out buffers, the conditions for log forcing and the sleep times needed some small changes. The most important one is that we consider an AIL busy as long we still have buffers to push, and the other one is that we do increment the pushed LSN for buffers that are under flushing at this moment, but still count them towards the stuck items for restart purposes. Without this we could hammer on stuck items without ever forcing the log and not make progress under heavy random delete workloads on fast flash storage devices. [ Dave Chinner: - rebase on previous patches. - improved comments for XBF_DELWRI_Q handling - fix XBF_ASYNC handling in queue submission (test 106 failure) - rename delwri submit function buffer list parameters for clarity - xfs_efd_item_push() should return XFS_ITEM_PINNED ] Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0fa987dea242..acd846d808b2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2347,11 +2347,11 @@ cluster_corrupt_out: */ rcu_read_unlock(); /* - * Clean up the buffer. If it was B_DELWRI, just release it -- + * Clean up the buffer. If it was delwri, just release it -- * brelse can handle it with no problems. If not, shut down the * filesystem before releasing the buffer. */ - bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); if (bufwasdelwri) xfs_buf_relse(bp); @@ -2685,27 +2685,6 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } -void -xfs_promote_inode( - struct xfs_inode *ip) -{ - struct xfs_buf *bp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - - bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, - ip->i_imap.im_len, XBF_TRYLOCK); - if (!bp) - return; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_buf_delwri_promote(bp); - wake_up_process(ip->i_mount->m_ddev_targp->bt_task); - } - - xfs_buf_relse(bp); -} - /* * Return a pointer to the extent record at file index idx. */ -- cgit From 04913fdd91f342e537005ef1233f98068b925a7f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Apr 2012 15:58:41 +1000 Subject: xfs: pass shutdown method into xfs_trans_ail_delete_bulk xfs_trans_ail_delete_bulk() can be called from different contexts so if the item is not in the AIL we need different shutdown for each context. Pass in the shutdown method needed so the correct action can be taken. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index acd846d808b2..65d7d994d499 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2377,7 +2377,7 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq); + xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); @@ -2482,7 +2482,7 @@ abort_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(ip); + xfs_iflush_abort(ip, false); return error; } -- cgit From a8acad70731e7d0585f25f33f8a009176f001f70 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Apr 2012 15:58:54 +1000 Subject: xfs: kill XBF_LOCK Buffers are always returned locked from the lookup routines. Hence we don't need to tell the lookup routines to return locked buffers, on to try and lock them. Remove XBF_LOCK from all the callers and from internal buffer cache usage. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 65d7d994d499..f64b482a7953 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -226,7 +226,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); if (error) return error; @@ -782,8 +782,7 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XBF_LOCK, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -1342,7 +1341,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -1423,7 +1422,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp() returned error %d.", __func__, error); @@ -1484,7 +1483,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", __func__, error); @@ -1566,8 +1565,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!bp) return ENOMEM; @@ -1737,7 +1735,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); if (error) return error; -- cgit From 2a0ec1d9ed7f3aa7974fccfbb612fadda2e10bad Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Apr 2012 15:59:02 +1000 Subject: xfs: move xfs_get_extsz_hint() and kill xfs_rw.h The only thing left in xfs_rw.h is a function prototype for an inode function. Move that to xfs_inode.h, and kill xfs_rw.h. Also move the function implementing the prototype from xfs_rw.c to xfs_inode.c so we only have one function left in xfs_rw.c Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f64b482a7953..c6fb5f0577b8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -61,6 +61,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer -- cgit From ad1e95c54eb3980ab2b4683fba29ad0ef954ec51 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Apr 2012 15:59:04 +1000 Subject: xfs: clean up xfs_bit.h includes With the removal of xfs_rw.h and other changes over time, xfs_bit.h is being included in many files that don't actually need it. Clean up the includes as necessary. Also move the only-used-once xfs_ialloc_find_free() static inline function out of a header file that is widely included to reduce the number of needless dependencies on xfs_bit.h. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c6fb5f0577b8..72ec1a4cc477 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,7 +20,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" -- cgit From 611c99468c7aa1a5c2bb6d46e7b5d8e53eecfefd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Apr 2012 15:59:07 +1000 Subject: xfs: make XBF_MAPPED the default behaviour Rather than specifying XBF_MAPPED for almost all buffers, introduce XBF_UNMAPPED for the couple of users that use unmapped buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 72ec1a4cc477..a59eea09930a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -150,6 +150,7 @@ xfs_imap_to_bp( int ni; xfs_buf_t *bp; + buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp); if (error) { -- cgit