Skip to content

Commit 109811c

Browse files
jankaratytso
authored andcommitted
ext4: simplify io_end handling for AIO DIO
When mapping blocks for direct IO, we allocate io_end structure before mapping blocks and store pointer to it in the inode. This creates a requirement that any AIO DIO using io_end must be protected by i_mutex. This created problems in the past with dioread_nolock mode which was corrupting io_end pointers. Also io_end is allocated unnecessarily in case where we don't need to convert any extents (which is a common case for example when overwriting file). We fix the problem by allocating io_end only once we return unwritten extent from block mapping function for AIO DIO (so we can save some pointless io_end allocations) and we pass pointer to it in bh->b_private which generic DIO code later passes to our end IO callback. That way we remove any need for global pointer to io_end structure and thus fix the races. The downside of this change is that the checking for unwritten IO in flight in ext4_extents_can_be_merged() is more racy since we now increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after dropping i_data_sem. However the check has been racy already before because ext4_writepages() already increment i_unwritten after dropping i_data_sem and reserved blocks save us from hitting ENOSPC in the worst case. Signed-off-by: Jan Kara <[email protected]>
1 parent efe70c2 commit 109811c

File tree

3 files changed

+74
-104
lines changed

3 files changed

+74
-104
lines changed

fs/ext4/ext4.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,16 +1513,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
15131513
}
15141514
}
15151515

1516-
static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
1517-
{
1518-
return inode->i_private;
1519-
}
1520-
1521-
static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
1522-
{
1523-
inode->i_private = io;
1524-
}
1525-
15261516
/*
15271517
* Inode dynamic state flags
15281518
*/

fs/ext4/extents.c

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
17361736
*/
17371737
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
17381738
return 0;
1739+
/*
1740+
* The check for IO to unwritten extent is somewhat racy as we
1741+
* increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1742+
* dropping i_data_sem. But reserved blocks should save us in that
1743+
* case.
1744+
*/
17391745
if (ext4_ext_is_unwritten(ex1) &&
17401746
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
17411747
atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -4007,7 +4013,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
40074013
struct ext4_ext_path *path = *ppath;
40084014
int ret = 0;
40094015
int err = 0;
4010-
ext4_io_end_t *io = ext4_inode_aio(inode);
40114016

40124017
ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
40134018
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4035,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
40304035
flags | EXT4_GET_BLOCKS_CONVERT);
40314036
if (ret <= 0)
40324037
goto out;
4033-
/*
4034-
* Flag the inode(non aio case) or end_io struct (aio case)
4035-
* that this IO needs to conversion to written when IO is
4036-
* completed
4037-
*/
4038-
if (io)
4039-
ext4_set_io_unwritten_flag(inode, io);
4040-
else
4041-
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
40424038
map->m_flags |= EXT4_MAP_UNWRITTEN;
40434039
goto out;
40444040
}
@@ -4283,9 +4279,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
42834279
unsigned int allocated = 0, offset = 0;
42844280
unsigned int allocated_clusters = 0;
42854281
struct ext4_allocation_request ar;
4286-
ext4_io_end_t *io = ext4_inode_aio(inode);
42874282
ext4_lblk_t cluster_offset;
4288-
int set_unwritten = 0;
42894283
bool map_from_cluster = false;
42904284

42914285
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4482,15 +4476,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
44824476
if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
44834477
ext4_ext_mark_unwritten(&newex);
44844478
map->m_flags |= EXT4_MAP_UNWRITTEN;
4485-
/*
4486-
* io_end structure was created for every IO write to an
4487-
* unwritten extent. To avoid unnecessary conversion,
4488-
* here we flag the IO that really needs the conversion.
4489-
* For non asycn direct IO case, flag the inode state
4490-
* that we need to perform conversion when IO is done.
4491-
*/
4492-
if (flags & EXT4_GET_BLOCKS_PRE_IO)
4493-
set_unwritten = 1;
44944479
}
44954480

44964481
err = 0;
@@ -4501,14 +4486,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
45014486
err = ext4_ext_insert_extent(handle, inode, &path,
45024487
&newex, flags);
45034488

4504-
if (!err && set_unwritten) {
4505-
if (io)
4506-
ext4_set_io_unwritten_flag(inode, io);
4507-
else
4508-
ext4_set_inode_state(inode,
4509-
EXT4_STATE_DIO_UNWRITTEN);
4510-
}
4511-
45124489
if (err && free_on_err) {
45134490
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
45144491
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;

fs/ext4/inode.c

Lines changed: 68 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -797,18 +797,16 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock,
797797
}
798798

799799
/*
800-
* Get block function for DIO writes when we create unwritten extent if
800+
* Get block function for AIO DIO writes when we create unwritten extent if
801801
* blocks are not allocated yet. The extent will be converted to written
802802
* after IO is complete.
803803
*/
804-
static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock,
805-
struct buffer_head *bh_result, int create)
804+
static int ext4_dio_get_block_unwritten_async(struct inode *inode,
805+
sector_t iblock, struct buffer_head *bh_result, int create)
806806
{
807807
handle_t *handle;
808808
int ret;
809809

810-
ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n",
811-
inode->i_ino, create);
812810
/* We don't expect handle for direct IO */
813811
WARN_ON_ONCE(ext4_journal_current_handle());
814812

@@ -818,16 +816,62 @@ static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock,
818816
ret = _ext4_get_block(inode, iblock, bh_result,
819817
EXT4_GET_BLOCKS_IO_CREATE_EXT);
820818
ext4_journal_stop(handle);
821-
if (!ret && buffer_unwritten(bh_result)) {
822-
ext4_io_end_t *io_end = ext4_inode_aio(inode);
823819

820+
/*
821+
* When doing DIO using unwritten extents, we need io_end to convert
822+
* unwritten extents to written on IO completion. We allocate io_end
823+
* once we spot unwritten extent and store it in b_private. Generic
824+
* DIO code keeps b_private set and furthermore passes the value to
825+
* our completion callback in 'private' argument.
826+
*/
827+
if (!ret && buffer_unwritten(bh_result)) {
828+
if (!bh_result->b_private) {
829+
ext4_io_end_t *io_end;
830+
831+
io_end = ext4_init_io_end(inode, GFP_KERNEL);
832+
if (!io_end)
833+
return -ENOMEM;
834+
bh_result->b_private = io_end;
835+
ext4_set_io_unwritten_flag(inode, io_end);
836+
}
824837
set_buffer_defer_completion(bh_result);
825-
WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN));
826838
}
827839

828840
return ret;
829841
}
830842

843+
/*
844+
* Get block function for non-AIO DIO writes when we create unwritten extent if
845+
* blocks are not allocated yet. The extent will be converted to written
846+
* after IO is complete from ext4_ext_direct_IO() function.
847+
*/
848+
static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
849+
sector_t iblock, struct buffer_head *bh_result, int create)
850+
{
851+
handle_t *handle;
852+
int ret;
853+
854+
/* We don't expect handle for direct IO */
855+
WARN_ON_ONCE(ext4_journal_current_handle());
856+
857+
handle = start_dio_trans(inode, bh_result);
858+
if (IS_ERR(handle))
859+
return PTR_ERR(handle);
860+
ret = _ext4_get_block(inode, iblock, bh_result,
861+
EXT4_GET_BLOCKS_IO_CREATE_EXT);
862+
ext4_journal_stop(handle);
863+
864+
/*
865+
* Mark inode as having pending DIO writes to unwritten extents.
866+
* ext4_ext_direct_IO() checks this flag and converts extents to
867+
* written.
868+
*/
869+
if (!ret && buffer_unwritten(bh_result))
870+
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
871+
872+
return ret;
873+
}
874+
831875
static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
832876
struct buffer_head *bh_result, int create)
833877
{
@@ -3243,18 +3287,16 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
32433287
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
32443288
ssize_t size, void *private)
32453289
{
3246-
ext4_io_end_t *io_end = iocb->private;
3290+
ext4_io_end_t *io_end = private;
32473291

32483292
/* if not async direct IO just return */
32493293
if (!io_end)
32503294
return;
32513295

32523296
ext_debug("ext4_end_io_dio(): io_end 0x%p "
32533297
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
3254-
iocb->private, io_end->inode->i_ino, iocb, offset,
3255-
size);
3298+
io_end, io_end->inode->i_ino, iocb, offset, size);
32563299

3257-
iocb->private = NULL;
32583300
io_end->offset = offset;
32593301
io_end->size = size;
32603302
ext4_put_io_end(io_end);
@@ -3290,7 +3332,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
32903332
get_block_t *get_block_func = NULL;
32913333
int dio_flags = 0;
32923334
loff_t final_size = offset + count;
3293-
ext4_io_end_t *io_end = NULL;
32943335

32953336
/* Use the old path for reads and writes beyond i_size. */
32963337
if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3315,47 +3356,31 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
33153356
/*
33163357
* We could direct write to holes and fallocate.
33173358
*
3318-
* Allocated blocks to fill the hole are marked as
3319-
* unwritten to prevent parallel buffered read to expose
3320-
* the stale data before DIO complete the data IO.
3359+
* Allocated blocks to fill the hole are marked as unwritten to prevent
3360+
* parallel buffered read to expose the stale data before DIO complete
3361+
* the data IO.
33213362
*
3322-
* As to previously fallocated extents, ext4 get_block will
3323-
* just simply mark the buffer mapped but still keep the
3324-
* extents unwritten.
3363+
* As to previously fallocated extents, ext4 get_block will just simply
3364+
* mark the buffer mapped but still keep the extents unwritten.
33253365
*
3326-
* For non AIO case, we will convert those unwritten extents
3327-
* to written after return back from blockdev_direct_IO.
3366+
* For non AIO case, we will convert those unwritten extents to written
3367+
* after return back from blockdev_direct_IO. That way we save us from
3368+
* allocating io_end structure and also the overhead of offloading
3369+
* the extent convertion to a workqueue.
33283370
*
33293371
* For async DIO, the conversion needs to be deferred when the
33303372
* IO is completed. The ext4 end_io callback function will be
33313373
* called to take care of the conversion work. Here for async
33323374
* case, we allocate an io_end structure to hook to the iocb.
33333375
*/
33343376
iocb->private = NULL;
3335-
if (overwrite) {
3377+
if (overwrite)
33363378
get_block_func = ext4_dio_get_block_overwrite;
3379+
else if (is_sync_kiocb(iocb)) {
3380+
get_block_func = ext4_dio_get_block_unwritten_sync;
3381+
dio_flags = DIO_LOCKING;
33373382
} else {
3338-
ext4_inode_aio_set(inode, NULL);
3339-
if (!is_sync_kiocb(iocb)) {
3340-
io_end = ext4_init_io_end(inode, GFP_NOFS);
3341-
if (!io_end) {
3342-
ret = -ENOMEM;
3343-
goto retake_lock;
3344-
}
3345-
/*
3346-
* Grab reference for DIO. Will be dropped in
3347-
* ext4_end_io_dio()
3348-
*/
3349-
iocb->private = ext4_get_io_end(io_end);
3350-
/*
3351-
* we save the io structure for current async direct
3352-
* IO, so that later ext4_map_blocks() could flag the
3353-
* io structure whether there is a unwritten extents
3354-
* needs to be converted when IO is completed.
3355-
*/
3356-
ext4_inode_aio_set(inode, io_end);
3357-
}
3358-
get_block_func = ext4_dio_get_block_unwritten;
3383+
get_block_func = ext4_dio_get_block_unwritten_async;
33593384
dio_flags = DIO_LOCKING;
33603385
}
33613386
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3370,27 +3395,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
33703395
get_block_func,
33713396
ext4_end_io_dio, NULL, dio_flags);
33723397

3373-
/*
3374-
* Put our reference to io_end. This can free the io_end structure e.g.
3375-
* in sync IO case or in case of error. It can even perform extent
3376-
* conversion if all bios we submitted finished before we got here.
3377-
* Note that in that case iocb->private can be already set to NULL
3378-
* here.
3379-
*/
3380-
if (io_end) {
3381-
ext4_inode_aio_set(inode, NULL);
3382-
ext4_put_io_end(io_end);
3383-
/*
3384-
* When no IO was submitted ext4_end_io_dio() was not
3385-
* called so we have to put iocb's reference.
3386-
*/
3387-
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3388-
WARN_ON(iocb->private != io_end);
3389-
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3390-
ext4_put_io_end(io_end);
3391-
iocb->private = NULL;
3392-
}
3393-
}
33943398
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
33953399
EXT4_STATE_DIO_UNWRITTEN)) {
33963400
int err;
@@ -3405,7 +3409,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
34053409
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
34063410
}
34073411

3408-
retake_lock:
34093412
if (iov_iter_rw(iter) == WRITE)
34103413
inode_dio_end(inode);
34113414
/* take i_mutex locking again if we do a ovewrite dio */

0 commit comments

Comments
 (0)