Skip to content

Commit 8184620

Browse files
fdmananakdave
authored andcommitted
btrfs: fix lost file sync on direct IO write with nowait and dsync iocb
When doing a direct IO write using a iocb with nowait and dsync set, we end up not syncing the file once the write completes. This is because we tell iomap to not call generic_write_sync(), which would result in calling btrfs_sync_file(), in order to avoid a deadlock since iomap can call it while we are holding the inode's lock and btrfs_sync_file() needs to acquire the inode's lock. The deadlock happens only if the write happens synchronously, when iomap_dio_rw() calls iomap_dio_complete() before it returns. Instead we do the sync ourselves at btrfs_do_write_iter(). For a nowait write however we can end up not doing the sync ourselves at at btrfs_do_write_iter() because the write could have been queued, and therefore we get -EIOCBQUEUED returned from iomap in such case. That makes us skip the sync call at btrfs_do_write_iter(), as we don't do it for any error returned from btrfs_direct_write(). We can't simply do the call even if -EIOCBQUEUED is returned, since that would block the task waiting for IO, both for the data since there are bios still in progress as well as potentially blocking when joining a log transaction and when syncing the log (writing log trees, super blocks, etc). So let iomap do the sync call itself and in order to avoid deadlocks for the case of synchronous writes (without nowait), use __iomap_dio_rw() and have ourselves call iomap_dio_complete() after unlocking the inode. A test case will later be sent for fstests, after this is fixed in Linus' tree. Fixes: 51bd956 ("btrfs: fix deadlock due to page faults during direct IO reads and writes") Reported-by: Марк Коренберг <[email protected]> Link: https://lore.kernel.org/linux-btrfs/CAEmTpZGRKbzc16fWPvxbr6AfFsQoLmz-Lcg-7OgJOZDboJ+SGQ@mail.gmail.com/ CC: [email protected] # 6.0+ Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent 063b1f2 commit 8184620

File tree

3 files changed

+31
-10
lines changed

3 files changed

+31
-10
lines changed

fs/btrfs/ctree.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3462,7 +3462,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
34623462
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
34633463
const struct btrfs_ioctl_encoded_io_args *encoded);
34643464

3465-
ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
3465+
ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
3466+
size_t done_before);
3467+
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
3468+
size_t done_before);
34663469

34673470
extern const struct dentry_operations btrfs_dentry_operations;
34683471

fs/btrfs/file.c

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,6 +1765,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
17651765
loff_t endbyte;
17661766
ssize_t err;
17671767
unsigned int ilock_flags = 0;
1768+
struct iomap_dio *dio;
17681769

17691770
if (iocb->ki_flags & IOCB_NOWAIT)
17701771
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1825,11 +1826,22 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
18251826
* So here we disable page faults in the iov_iter and then retry if we
18261827
* got -EFAULT, faulting in the pages before the retry.
18271828
*/
1828-
again:
18291829
from->nofault = true;
1830-
err = btrfs_dio_rw(iocb, from, written);
1830+
dio = btrfs_dio_write(iocb, from, written);
18311831
from->nofault = false;
18321832

1833+
/*
1834+
* iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1835+
* iocb, and that needs to lock the inode. So unlock it before calling
1836+
* iomap_dio_complete() to avoid a deadlock.
1837+
*/
1838+
btrfs_inode_unlock(inode, ilock_flags);
1839+
1840+
if (IS_ERR_OR_NULL(dio))
1841+
err = PTR_ERR_OR_ZERO(dio);
1842+
else
1843+
err = iomap_dio_complete(dio);
1844+
18331845
/* No increment (+=) because iomap returns a cumulative value. */
18341846
if (err > 0)
18351847
written = err;
@@ -1855,12 +1867,10 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
18551867
} else {
18561868
fault_in_iov_iter_readable(from, left);
18571869
prev_left = left;
1858-
goto again;
1870+
goto relock;
18591871
}
18601872
}
18611873

1862-
btrfs_inode_unlock(inode, ilock_flags);
1863-
18641874
/*
18651875
* If 'err' is -ENOTBLK or we have not written all data, then it means
18661876
* we must fallback to buffered IO.
@@ -4035,7 +4045,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
40354045
*/
40364046
pagefault_disable();
40374047
to->nofault = true;
4038-
ret = btrfs_dio_rw(iocb, to, read);
4048+
ret = btrfs_dio_read(iocb, to, read);
40394049
to->nofault = false;
40404050
pagefault_enable();
40414051

fs/btrfs/inode.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8078,13 +8078,21 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
80788078
.bio_set = &btrfs_dio_bioset,
80798079
};
80808080

8081-
ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
8081+
ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
80828082
{
80838083
struct btrfs_dio_data data;
80848084

80858085
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
8086-
IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
8087-
&data, done_before);
8086+
IOMAP_DIO_PARTIAL, &data, done_before);
8087+
}
8088+
8089+
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
8090+
size_t done_before)
8091+
{
8092+
struct btrfs_dio_data data;
8093+
8094+
return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
8095+
IOMAP_DIO_PARTIAL, &data, done_before);
80888096
}
80898097

80908098
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,

0 commit comments

Comments
 (0)