Skip to content

Commit 248766f

Browse files
Jan Kara authored and sashalevin committed
ext4: fix races between page faults and hole punching
Currently, page faults and hole punching are completely unsynchronized. This can result in a page fault faulting in a page into a range that we are punching after truncate_pagecache_range() has been called, and thus we can end up with a page mapped to disk blocks that will shortly be freed. Filesystem corruption will shortly follow. Note that the same race is avoided for truncate by checking the page fault offset against i_size, but there isn't a similar mechanism available for punching holes. Fix the problem by creating a new rw semaphore i_mmap_sem in the inode and grabbing it for writing over truncate, hole punching, and other functions removing blocks from the extent tree, and for reading over page faults. We cannot easily use i_data_sem for this since that ranks below transaction start and we need something ranking above it so that it can be held over the whole truncate / hole punching operation. Also remove various workarounds we had in the code to reduce the race window in which a page fault could have created pages with stale mapping information. Signed-off-by: Jan Kara <[email protected]> Signed-off-by: Theodore Ts'o <[email protected]> Reviewed-by: Mingming Cao <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent 14b4d14 commit 248766f

File tree

6 files changed

+61
-30
lines changed

6 files changed

+61
-30
lines changed

fs/ext4/ext4.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,15 @@ struct ext4_inode_info {
873873
* by other means, so we have i_data_sem.
874874
*/
875875
struct rw_semaphore i_data_sem;
876+
/*
877+
* i_mmap_sem is for serializing page faults with truncate / punch hole
878+
* operations. We have to make sure that new page cannot be faulted in
879+
* a section of the inode that is being punched. We cannot easily use
880+
* i_data_sem for this since we need protection for the whole punch
881+
* operation and i_data_sem ranks below transaction start so we have
882+
* to occasionally drop it.
883+
*/
884+
struct rw_semaphore i_mmap_sem;
876885
struct inode vfs_inode;
877886
struct jbd2_inode *jinode;
878887

@@ -2287,6 +2296,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
22872296
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
22882297
loff_t lstart, loff_t lend);
22892298
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2299+
extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
22902300
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
22912301
extern void ext4_da_update_reserve_space(struct inode *inode,
22922302
int used, int quota_claim);

fs/ext4/extents.c

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4741,7 +4741,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
47414741
int partial_begin, partial_end;
47424742
loff_t start, end;
47434743
ext4_lblk_t lblk;
4744-
struct address_space *mapping = inode->i_mapping;
47454744
unsigned int blkbits = inode->i_blkbits;
47464745

47474746
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4756,17 +4755,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
47564755
return ret;
47574756
}
47584757

4759-
/*
4760-
* Write out all dirty pages to avoid race conditions
4761-
* Then release them.
4762-
*/
4763-
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4764-
ret = filemap_write_and_wait_range(mapping, offset,
4765-
offset + len - 1);
4766-
if (ret)
4767-
return ret;
4768-
}
4769-
47704758
/*
47714759
* Round up offset. This is not fallocate, we need to zero out
47724760
* blocks, so convert interior block aligned part of the range to
@@ -4827,16 +4815,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
48274815
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
48284816
EXT4_EX_NOCACHE);
48294817

4830-
/* Now release the pages and zero block aligned part of pages*/
4831-
truncate_pagecache_range(inode, start, end - 1);
4832-
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4833-
48344818
/* Wait all existing dio workers, newcomers will block on i_mutex */
48354819
ext4_inode_block_unlocked_dio(inode);
48364820
inode_dio_wait(inode);
48374821

4822+
/*
4823+
* Prevent page faults from reinstantiating pages we have
4824+
* released from page cache.
4825+
*/
4826+
down_write(&EXT4_I(inode)->i_mmap_sem);
4827+
/* Now release the pages and zero block aligned part of pages */
4828+
truncate_pagecache_range(inode, start, end - 1);
4829+
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4830+
48384831
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
48394832
flags, mode);
4833+
up_write(&EXT4_I(inode)->i_mmap_sem);
48404834
if (ret)
48414835
goto out_dio;
48424836
}
@@ -5454,17 +5448,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
54545448
goto out_mutex;
54555449
}
54565450

5457-
truncate_pagecache(inode, ioffset);
5458-
54595451
/* Wait for existing dio to complete */
54605452
ext4_inode_block_unlocked_dio(inode);
54615453
inode_dio_wait(inode);
54625454

5455+
/*
5456+
* Prevent page faults from reinstantiating pages we have released from
5457+
* page cache.
5458+
*/
5459+
down_write(&EXT4_I(inode)->i_mmap_sem);
5460+
truncate_pagecache(inode, ioffset);
5461+
54635462
credits = ext4_writepage_trans_blocks(inode);
54645463
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
54655464
if (IS_ERR(handle)) {
54665465
ret = PTR_ERR(handle);
5467-
goto out_dio;
5466+
goto out_mmap;
54685467
}
54695468

54705469
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5503,7 +5502,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
55035502

55045503
out_stop:
55055504
ext4_journal_stop(handle);
5506-
out_dio:
5505+
out_mmap:
5506+
up_write(&EXT4_I(inode)->i_mmap_sem);
55075507
ext4_inode_resume_unlocked_dio(inode);
55085508
out_mutex:
55095509
mutex_unlock(&inode->i_mutex);

fs/ext4/file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
213213
#endif
214214

215215
static const struct vm_operations_struct ext4_file_vm_ops = {
216-
.fault = filemap_fault,
216+
.fault = ext4_filemap_fault,
217217
.map_pages = filemap_map_pages,
218218
.page_mkwrite = ext4_page_mkwrite,
219219
};

fs/ext4/inode.c

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3588,6 +3588,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
35883588

35893589
}
35903590

3591+
/* Wait all existing dio workers, newcomers will block on i_mutex */
3592+
ext4_inode_block_unlocked_dio(inode);
3593+
inode_dio_wait(inode);
3594+
3595+
/*
3596+
* Prevent page faults from reinstantiating pages we have released from
3597+
* page cache.
3598+
*/
3599+
down_write(&EXT4_I(inode)->i_mmap_sem);
35913600
first_block_offset = round_up(offset, sb->s_blocksize);
35923601
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
35933602

@@ -3596,10 +3605,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
35963605
truncate_pagecache_range(inode, first_block_offset,
35973606
last_block_offset);
35983607

3599-
/* Wait all existing dio workers, newcomers will block on i_mutex */
3600-
ext4_inode_block_unlocked_dio(inode);
3601-
inode_dio_wait(inode);
3602-
36033608
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
36043609
credits = ext4_writepage_trans_blocks(inode);
36053610
else
@@ -3645,16 +3650,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
36453650
if (IS_SYNC(inode))
36463651
ext4_handle_sync(handle);
36473652

3648-
/* Now release the pages again to reduce race window */
3649-
if (last_block_offset > first_block_offset)
3650-
truncate_pagecache_range(inode, first_block_offset,
3651-
last_block_offset);
3652-
36533653
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
36543654
ext4_mark_inode_dirty(handle, inode);
36553655
out_stop:
36563656
ext4_journal_stop(handle);
36573657
out_dio:
3658+
up_write(&EXT4_I(inode)->i_mmap_sem);
36583659
ext4_inode_resume_unlocked_dio(inode);
36593660
out_mutex:
36603661
mutex_unlock(&inode->i_mutex);
@@ -4775,11 +4776,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
47754776
} else
47764777
ext4_wait_for_tail_page_commit(inode);
47774778
}
4779+
down_write(&EXT4_I(inode)->i_mmap_sem);
47784780
/*
47794781
* Truncate pagecache after we've waited for commit
47804782
* in data=journal mode to make pages freeable.
47814783
*/
47824784
truncate_pagecache(inode, inode->i_size);
4785+
up_write(&EXT4_I(inode)->i_mmap_sem);
47834786
}
47844787
/*
47854788
* We want to call ext4_truncate() even if attr->ia_size ==
@@ -5234,6 +5237,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
52345237

52355238
sb_start_pagefault(inode->i_sb);
52365239
file_update_time(vma->vm_file);
5240+
5241+
down_read(&EXT4_I(inode)->i_mmap_sem);
52375242
/* Delalloc case is easy... */
52385243
if (test_opt(inode->i_sb, DELALLOC) &&
52395244
!ext4_should_journal_data(inode) &&
@@ -5303,6 +5308,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
53035308
out_ret:
53045309
ret = block_page_mkwrite_return(ret);
53055310
out:
5311+
up_read(&EXT4_I(inode)->i_mmap_sem);
53065312
sb_end_pagefault(inode->i_sb);
53075313
return ret;
53085314
}
5315+
5316+
int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
5317+
{
5318+
struct inode *inode = file_inode(vma->vm_file);
5319+
int err;
5320+
5321+
down_read(&EXT4_I(inode)->i_mmap_sem);
5322+
err = filemap_fault(vma, vmf);
5323+
up_read(&EXT4_I(inode)->i_mmap_sem);
5324+
5325+
return err;
5326+
}

fs/ext4/super.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -945,6 +945,7 @@ static void init_once(void *foo)
945945
INIT_LIST_HEAD(&ei->i_orphan);
946946
init_rwsem(&ei->xattr_sem);
947947
init_rwsem(&ei->i_data_sem);
948+
init_rwsem(&ei->i_mmap_sem);
948949
inode_init_once(&ei->vfs_inode);
949950
}
950951

fs/ext4/truncate.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
*/
1111
static inline void ext4_truncate_failed_write(struct inode *inode)
1212
{
13+
down_write(&EXT4_I(inode)->i_mmap_sem);
1314
truncate_inode_pages(inode->i_mapping, inode->i_size);
1415
ext4_truncate(inode);
16+
up_write(&EXT4_I(inode)->i_mmap_sem);
1517
}
1618

1719
/*

0 commit comments

Comments
 (0)