 ext4-fix-races-between-page-faults-and-hole-punching.patch | 442 ++++++++++++++
 1 file changed, 442 insertions(+), 0 deletions(-)
diff --git a/ext4-fix-races-between-page-faults-and-hole-punching.patch b/ext4-fix-races-between-page-faults-and-hole-punching.patch
new file mode 100644
index 000000000..9034b958c
--- /dev/null
+++ b/ext4-fix-races-between-page-faults-and-hole-punching.patch
@@ -0,0 +1,442 @@
+From ea3d7209ca01da209cda6f0dea8be9cc4b7a933b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.com>
+Date: Mon, 7 Dec 2015 14:28:03 -0500
+Subject: [PATCH 1/4] ext4: fix races between page faults and hole punching
+
+Currently, page faults and hole punching are completely unsynchronized.
+This can result in a page fault faulting a page into a range that we
+are punching after truncate_pagecache_range() has been called, and thus
+we can end up with a page mapped to disk blocks that will shortly be
+freed. Filesystem corruption will follow. Note that the same race is
+avoided for truncate by checking the page fault offset against i_size,
+but there is no similar mechanism available for punching holes.
+
+Fix the problem by creating a new rw semaphore, i_mmap_sem, in the inode
+and grabbing it for writing over truncate, hole punching, and other
+functions removing blocks from the extent tree, and for reading over page
+faults. We cannot easily use i_data_sem for this since that ranks below
+transaction start and we need something ranking above it so that it can
+be held over the whole truncate / hole punching operation. Also remove
+various workarounds we had in the code to reduce the race window in which
+a page fault could have created pages with stale mapping information.
+
+Signed-off-by: Jan Kara <jack@suse.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+---
+ fs/ext4/ext4.h | 10 +++++++++
+ fs/ext4/extents.c | 54 ++++++++++++++++++++++++--------------------
+ fs/ext4/file.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++--------
+ fs/ext4/inode.c | 36 +++++++++++++++++++++--------
+ fs/ext4/super.c | 1 +
+ fs/ext4/truncate.h | 2 ++
+ 6 files changed, 127 insertions(+), 42 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index cc7ca4e87144..348a5ff4a0e2 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -910,6 +910,15 @@ struct ext4_inode_info {
+ * by other means, so we have i_data_sem.
+ */
+ struct rw_semaphore i_data_sem;
++ /*
++ * i_mmap_sem is for serializing page faults with truncate / punch hole
++ * operations. We have to make sure that a new page cannot be faulted
++ * into a section of the inode that is being punched. We cannot easily
++ * use i_data_sem for this since we need protection for the whole punch
++ * operation and i_data_sem ranks below transaction start so we have
++ * to occasionally drop it.
++ */
++ struct rw_semaphore i_mmap_sem;
+ struct inode vfs_inode;
+ struct jbd2_inode *jinode;
+
+@@ -2484,6 +2493,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+ extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+ loff_t lstart, loff_t lend);
+ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
++extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
+ extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+ extern void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim);
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index 551353b1b17a..5be9ca5a8a7a 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
+ int partial_begin, partial_end;
+ loff_t start, end;
+ ext4_lblk_t lblk;
+- struct address_space *mapping = inode->i_mapping;
+ unsigned int blkbits = inode->i_blkbits;
+
+ trace_ext4_zero_range(inode, offset, len, mode);
+@@ -4786,17 +4785,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
+ }
+
+ /*
+- * Write out all dirty pages to avoid race conditions
+- * Then release them.
+- */
+- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+- ret = filemap_write_and_wait_range(mapping, offset,
+- offset + len - 1);
+- if (ret)
+- return ret;
+- }
+-
+- /*
+ * Round up offset. This is not fallocate, we need to zero out
+ * blocks, so convert interior block aligned part of the range to
+ * unwritten and possibly manually zero out unaligned parts of the
+@@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
+
+- /* Now release the pages and zero block aligned part of pages*/
+- truncate_pagecache_range(inode, start, end - 1);
+- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+-
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
++ /*
++ * Prevent page faults from reinstantiating pages we have
++ * released from page cache.
++ */
++ down_write(&EXT4_I(inode)->i_mmap_sem);
++ /* Now release the pages and zero block aligned part of pages */
++ truncate_pagecache_range(inode, start, end - 1);
++ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
++
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+ flags, mode);
++ up_write(&EXT4_I(inode)->i_mmap_sem);
+ if (ret)
+ goto out_dio;
+ }
+@@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+ goto out_mutex;
+ }
+
+- truncate_pagecache(inode, ioffset);
+-
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
++ /*
++ * Prevent page faults from reinstantiating pages we have released from
++ * page cache.
++ */
++ down_write(&EXT4_I(inode)->i_mmap_sem);
++ truncate_pagecache(inode, ioffset);
++
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+- goto out_dio;
++ goto out_mmap;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+@@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+
+ out_stop:
+ ext4_journal_stop(handle);
+-out_dio:
++out_mmap:
++ up_write(&EXT4_I(inode)->i_mmap_sem);
+ ext4_inode_resume_unlocked_dio(inode);
+ out_mutex:
+ mutex_unlock(&inode->i_mutex);
+@@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+ goto out_mutex;
+ }
+
+- truncate_pagecache(inode, ioffset);
+-
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
++ /*
++ * Prevent page faults from reinstantiating pages we have released from
++ * page cache.
++ */
++ down_write(&EXT4_I(inode)->i_mmap_sem);
++ truncate_pagecache(inode, ioffset);
++
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+- goto out_dio;
++ goto out_mmap;
+ }
+
+ /* Expand file to avoid data loss if there is error while shifting */
+@@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+
+ out_stop:
+ ext4_journal_stop(handle);
+-out_dio:
++out_mmap:
++ up_write(&EXT4_I(inode)->i_mmap_sem);
+ ext4_inode_resume_unlocked_dio(inode);
+ out_mutex:
+ mutex_unlock(&inode->i_mutex);
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index 113837e7ba98..0d24ebcd7c9e 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ {
+ int result;
+ handle_t *handle = NULL;
+- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
++ struct inode *inode = file_inode(vma->vm_file);
++ struct super_block *sb = inode->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
++ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+- }
++ } else
++ down_read(&EXT4_I(inode)->i_mmap_sem);
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+@@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
++ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+- }
++ } else
++ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return result;
+ }
+@@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
++ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+- }
++ } else
++ down_read(&EXT4_I(inode)->i_mmap_sem);
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+@@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
++ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+- }
++ } else
++ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return result;
+ }
+
+ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+ {
+- return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+- ext4_end_io_unwritten);
++ int err;
++ struct inode *inode = file_inode(vma->vm_file);
++
++ sb_start_pagefault(inode->i_sb);
++ file_update_time(vma->vm_file);
++ down_read(&EXT4_I(inode)->i_mmap_sem);
++ err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
++ ext4_end_io_unwritten);
++ up_read(&EXT4_I(inode)->i_mmap_sem);
++ sb_end_pagefault(inode->i_sb);
++
++ return err;
++}
++
++/*
++ * Handle write faults for VM_MIXEDMAP mappings. As in the ext4_dax_mkwrite()
++ * handler, we check for races against truncate. Note that since we cycle
++ * through i_mmap_sem, any hole punching that began before we were called is
++ * guaranteed to have finished by now, so if it included part of the file we
++ * are working on, our pte will have been unmapped and the pte_same() check in
++ * wp_pfn_shared() will fail. The fault then gets retried and things work out
++ * as desired.
++ */
++static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
++ struct vm_fault *vmf)
++{
++ struct inode *inode = file_inode(vma->vm_file);
++ struct super_block *sb = inode->i_sb;
++ int ret = VM_FAULT_NOPAGE;
++ loff_t size;
++
++ sb_start_pagefault(sb);
++ file_update_time(vma->vm_file);
++ down_read(&EXT4_I(inode)->i_mmap_sem);
++ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
++ if (vmf->pgoff >= size)
++ ret = VM_FAULT_SIGBUS;
++ up_read(&EXT4_I(inode)->i_mmap_sem);
++ sb_end_pagefault(sb);
++
++ return ret;
+ }
+
+ static const struct vm_operations_struct ext4_dax_vm_ops = {
+ .fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
+ .page_mkwrite = ext4_dax_mkwrite,
+- .pfn_mkwrite = dax_pfn_mkwrite,
++ .pfn_mkwrite = ext4_dax_pfn_mkwrite,
+ };
+ #else
+ #define ext4_dax_vm_ops ext4_file_vm_ops
+ #endif
+
+ static const struct vm_operations_struct ext4_file_vm_ops = {
+- .fault = filemap_fault,
++ .fault = ext4_filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = ext4_page_mkwrite,
+ };
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index ea433a7f4bca..d1207d03c961 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+
+ }
+
++ /* Wait all existing dio workers, newcomers will block on i_mutex */
++ ext4_inode_block_unlocked_dio(inode);
++ inode_dio_wait(inode);
++
++ /*
++ * Prevent page faults from reinstantiating pages we have released from
++ * page cache.
++ */
++ down_write(&EXT4_I(inode)->i_mmap_sem);
+ first_block_offset = round_up(offset, sb->s_blocksize);
+ last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+@@ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
+
+- /* Wait all existing dio workers, newcomers will block on i_mutex */
+- ext4_inode_block_unlocked_dio(inode);
+- inode_dio_wait(inode);
+-
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ credits = ext4_writepage_trans_blocks(inode);
+ else
+@@ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+- /* Now release the pages again to reduce race window */
+- if (last_block_offset > first_block_offset)
+- truncate_pagecache_range(inode, first_block_offset,
+- last_block_offset);
+-
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ out_stop:
+ ext4_journal_stop(handle);
+ out_dio:
++ up_write(&EXT4_I(inode)->i_mmap_sem);
+ ext4_inode_resume_unlocked_dio(inode);
+ out_mutex:
+ mutex_unlock(&inode->i_mutex);
+@@ -4823,6 +4824,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+ } else
+ ext4_wait_for_tail_page_commit(inode);
+ }
++ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+@@ -4830,6 +4832,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+ truncate_pagecache(inode, inode->i_size);
+ if (shrink)
+ ext4_truncate(inode);
++ up_write(&EXT4_I(inode)->i_mmap_sem);
+ }
+
+ if (!rc) {
+@@ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
++
++ down_read(&EXT4_I(inode)->i_mmap_sem);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+@@ -5347,6 +5352,19 @@ retry_alloc:
+ out_ret:
+ ret = block_page_mkwrite_return(ret);
+ out:
++ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+ }
++
++int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ struct inode *inode = file_inode(vma->vm_file);
++ int err;
++
++ down_read(&EXT4_I(inode)->i_mmap_sem);
++ err = filemap_fault(vma, vmf);
++ up_read(&EXT4_I(inode)->i_mmap_sem);
++
++ return err;
++}
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index c9ab67da6e5a..493370e6590e 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -958,6 +958,7 @@ static void init_once(void *foo)
+ INIT_LIST_HEAD(&ei->i_orphan);
+ init_rwsem(&ei->xattr_sem);
+ init_rwsem(&ei->i_data_sem);
++ init_rwsem(&ei->i_mmap_sem);
+ inode_init_once(&ei->vfs_inode);
+ }
+
+diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
+index 011ba6670d99..c70d06a383e2 100644
+--- a/fs/ext4/truncate.h
++++ b/fs/ext4/truncate.h
+@@ -10,8 +10,10 @@
+ */
+ static inline void ext4_truncate_failed_write(struct inode *inode)
+ {
++ down_write(&EXT4_I(inode)->i_mmap_sem);
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
++ up_write(&EXT4_I(inode)->i_mmap_sem);
+ }
+
+ /*
+--
+2.5.5
+
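
For reference, here is a minimal userspace sketch of the locking discipline the patch establishes. A POSIX rwlock stands in for the new i_mmap_sem rw_semaphore: the punch-hole path holds it for writing across page-cache truncation and block freeing, while the fault path holds it for reading, so a fault can never instantiate a page over blocks that are about to be freed. Everything below is an illustrative assumption, not ext4 or kernel API; the stub names are invented for the example.

/*
 * Userspace sketch of the i_mmap_sem scheme (build with -lpthread).
 * Not kernel code; all names are illustrative.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-ins for truncate_pagecache_range() and the extent-tree free. */
static void drop_page_cache(long start, long end)
{
	printf("drop page cache %ld..%ld\n", start, end);
}

static void free_disk_blocks(long start, long end)
{
	printf("free disk blocks %ld..%ld\n", start, end);
}

/* Write side, mirroring ext4_punch_hole() after the patch. */
static void punch_hole(long start, long end)
{
	pthread_rwlock_wrlock(&i_mmap_sem);	/* blocks new page faults */
	drop_page_cache(start, end);
	free_disk_blocks(start, end);		/* no fault can race in here */
	pthread_rwlock_unlock(&i_mmap_sem);
}

/* Read side, mirroring ext4_filemap_fault() after the patch. */
static void page_fault(long pgoff)
{
	pthread_rwlock_rdlock(&i_mmap_sem);	/* waits for any punch in flight */
	printf("fault in page %ld against a stable block mapping\n", pgoff);
	pthread_rwlock_unlock(&i_mmap_sem);
}

int main(void)
{
	punch_hole(0, 4095);
	page_fault(0);	/* a retried fault now sees the hole, not stale blocks */
	return 0;
}

The key design point the sketch mirrors is the ranking: i_mmap_sem is taken before transaction start, unlike i_data_sem, so it can be held across the entire truncate or punch operation rather than being dropped and retaken around journal calls.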