Skip to content

Commit e6fff81

Browse files
committed
xfs: proper replay of deferred ops queued during log recovery
When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 5099558 ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation. For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops. This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations cause subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it. Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain. Signed-off-by: Darrick J. Wong <[email protected]> Reviewed-by: Brian Foster <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]>
1 parent 901219b commit e6fff81

File tree

8 files changed

+187
-91
lines changed

8 files changed

+187
-91
lines changed

fs/xfs/libxfs/xfs_defer.c

Lines changed: 82 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -549,14 +549,89 @@ xfs_defer_move(
549549
*
550550
* Create and log intent items for all the work that we're capturing so that we
551551
* can be assured that the items will get replayed if the system goes down
552-
* before log recovery gets a chance to finish the work it put off. Then we
553-
* move the chain from stp to dtp.
552+
* before log recovery gets a chance to finish the work it put off. The entire
553+
* deferred ops state is transferred to the capture structure and the
554+
* transaction is then ready for the caller to commit it. If there are no
555+
* intent items to capture, this function returns NULL.
554556
*/
557+
static struct xfs_defer_capture *
558+
xfs_defer_ops_capture(
559+
struct xfs_trans *tp)
560+
{
561+
struct xfs_defer_capture *dfc;
562+
563+
if (list_empty(&tp->t_dfops))
564+
return NULL;
565+
566+
/* Create an object to capture the defer ops. */
567+
dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
568+
INIT_LIST_HEAD(&dfc->dfc_list);
569+
INIT_LIST_HEAD(&dfc->dfc_dfops);
570+
571+
xfs_defer_create_intents(tp);
572+
573+
/* Move the dfops chain and transaction state to the capture struct. */
574+
list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
575+
dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
576+
tp->t_flags &= ~XFS_TRANS_LOWMODE;
577+
578+
return dfc;
579+
}
580+
581+
/* Release all resources that we used to capture deferred ops. */
555582
void
556-
xfs_defer_capture(
557-
struct xfs_trans *dtp,
558-
struct xfs_trans *stp)
583+
xfs_defer_ops_release(
584+
struct xfs_mount *mp,
585+
struct xfs_defer_capture *dfc)
586+
{
587+
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
588+
kmem_free(dfc);
589+
}
590+
591+
/*
592+
* Capture any deferred ops and commit the transaction. This is the last step
593+
* needed to finish a log intent item that we recovered from the log.
594+
*/
595+
int
596+
xfs_defer_ops_capture_and_commit(
597+
struct xfs_trans *tp,
598+
struct list_head *capture_list)
599+
{
600+
struct xfs_mount *mp = tp->t_mountp;
601+
struct xfs_defer_capture *dfc;
602+
int error;
603+
604+
/* If we don't capture anything, commit transaction and exit. */
605+
dfc = xfs_defer_ops_capture(tp);
606+
if (!dfc)
607+
return xfs_trans_commit(tp);
608+
609+
/* Commit the transaction and add the capture structure to the list. */
610+
error = xfs_trans_commit(tp);
611+
if (error) {
612+
xfs_defer_ops_release(mp, dfc);
613+
return error;
614+
}
615+
616+
list_add_tail(&dfc->dfc_list, capture_list);
617+
return 0;
618+
}
619+
620+
/*
621+
* Attach a chain of captured deferred ops to a new transaction and free the
622+
* capture structure.
623+
*/
624+
void
625+
xfs_defer_ops_continue(
626+
struct xfs_defer_capture *dfc,
627+
struct xfs_trans *tp)
559628
{
560-
xfs_defer_create_intents(stp);
561-
xfs_defer_move(dtp, stp);
629+
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
630+
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
631+
632+
/* Move captured dfops chain and state to the transaction. */
633+
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
634+
tp->t_flags |= dfc->dfc_tpflags;
635+
636+
kmem_free(dfc);
562637
}

fs/xfs/libxfs/xfs_defer.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
struct xfs_btree_cur;
1010
struct xfs_defer_op_type;
11+
struct xfs_defer_capture;
1112

1213
/*
1314
* Header for deferred operation list.
@@ -63,10 +64,26 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
6364
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
6465
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
6566

67+
/*
68+
* This structure enables a dfops user to detach the chain of deferred
69+
* operations from a transaction so that they can be continued later.
70+
*/
71+
struct xfs_defer_capture {
72+
/* List of other capture structures. */
73+
struct list_head dfc_list;
74+
75+
/* Deferred ops state saved from the transaction. */
76+
struct list_head dfc_dfops;
77+
unsigned int dfc_tpflags;
78+
};
79+
6680
/*
6781
* Functions to capture a chain of deferred operations and continue them later.
6882
* This doesn't normally happen except log recovery.
6983
*/
70-
void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp);
84+
int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
85+
struct list_head *capture_list);
86+
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp);
87+
void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
7188

7289
#endif /* __XFS_DEFER_H__ */

fs/xfs/xfs_bmap_item.c

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -424,13 +424,13 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
424424
STATIC int
425425
xfs_bui_item_recover(
426426
struct xfs_log_item *lip,
427-
struct xfs_trans *parent_tp)
427+
struct list_head *capture_list)
428428
{
429429
struct xfs_bmbt_irec irec;
430430
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
431431
struct xfs_trans *tp;
432432
struct xfs_inode *ip = NULL;
433-
struct xfs_mount *mp = parent_tp->t_mountp;
433+
struct xfs_mount *mp = lip->li_mountp;
434434
struct xfs_map_extent *bmap;
435435
struct xfs_bud_log_item *budp;
436436
xfs_fsblock_t startblock_fsb;
@@ -478,12 +478,7 @@ xfs_bui_item_recover(
478478
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
479479
if (error)
480480
return error;
481-
/*
482-
* Recovery stashes all deferred ops during intent processing and
483-
* finishes them on completion. Transfer current dfops state to this
484-
* transaction and transfer the result back before we return.
485-
*/
486-
xfs_defer_move(tp, parent_tp);
481+
487482
budp = xfs_trans_get_bud(tp, buip);
488483

489484
/* Grab the inode. */
@@ -531,15 +526,12 @@ xfs_bui_item_recover(
531526
xfs_bmap_unmap_extent(tp, ip, &irec);
532527
}
533528

534-
xfs_defer_capture(parent_tp, tp);
535-
error = xfs_trans_commit(tp);
529+
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
536530
xfs_iunlock(ip, XFS_ILOCK_EXCL);
537531
xfs_irele(ip);
538-
539532
return error;
540533

541534
err_inode:
542-
xfs_defer_move(parent_tp, tp);
543535
xfs_trans_cancel(tp);
544536
if (ip) {
545537
xfs_iunlock(ip, XFS_ILOCK_EXCL);

fs/xfs/xfs_extfree_item.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -585,10 +585,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
585585
STATIC int
586586
xfs_efi_item_recover(
587587
struct xfs_log_item *lip,
588-
struct xfs_trans *parent_tp)
588+
struct list_head *capture_list)
589589
{
590590
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
591-
struct xfs_mount *mp = parent_tp->t_mountp;
591+
struct xfs_mount *mp = lip->li_mountp;
592592
struct xfs_efd_log_item *efdp;
593593
struct xfs_trans *tp;
594594
struct xfs_extent *extp;
@@ -627,8 +627,7 @@ xfs_efi_item_recover(
627627

628628
}
629629

630-
error = xfs_trans_commit(tp);
631-
return error;
630+
return xfs_defer_ops_capture_and_commit(tp, capture_list);
632631

633632
abort_error:
634633
xfs_trans_cancel(tp);

fs/xfs/xfs_log_recover.c

Lines changed: 71 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2434,37 +2434,65 @@ xlog_recover_process_data(
24342434
/* Take all the collected deferred ops and finish them in order. */
24352435
static int
24362436
xlog_finish_defer_ops(
2437-
struct xfs_trans *parent_tp)
2437+
struct xfs_mount *mp,
2438+
struct list_head *capture_list)
24382439
{
2439-
struct xfs_mount *mp = parent_tp->t_mountp;
2440+
struct xfs_defer_capture *dfc, *next;
24402441
struct xfs_trans *tp;
24412442
int64_t freeblks;
2442-
uint resblks;
2443-
int error;
2443+
uint64_t resblks;
2444+
int error = 0;
24442445

2445-
/*
2446-
* We're finishing the defer_ops that accumulated as a result of
2447-
* recovering unfinished intent items during log recovery. We
2448-
* reserve an itruncate transaction because it is the largest
2449-
* permanent transaction type. Since we're the only user of the fs
2450-
* right now, take 93% (15/16) of the available free blocks. Use
2451-
* weird math to avoid a 64-bit division.
2452-
*/
2453-
freeblks = percpu_counter_sum(&mp->m_fdblocks);
2454-
if (freeblks <= 0)
2455-
return -ENOSPC;
2456-
resblks = min_t(int64_t, UINT_MAX, freeblks);
2457-
resblks = (resblks * 15) >> 4;
2458-
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
2459-
0, XFS_TRANS_RESERVE, &tp);
2460-
if (error)
2461-
return error;
2462-
/* transfer all collected dfops to this transaction */
2463-
xfs_defer_move(tp, parent_tp);
2446+
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2447+
/*
2448+
* We're finishing the defer_ops that accumulated as a result
2449+
* of recovering unfinished intent items during log recovery.
2450+
* We reserve an itruncate transaction because it is the
2451+
* largest permanent transaction type. Since we're the only
2452+
* user of the fs right now, take 93% (15/16) of the available
2453+
* free blocks. Use weird math to avoid a 64-bit division.
2454+
*/
2455+
freeblks = percpu_counter_sum(&mp->m_fdblocks);
2456+
if (freeblks <= 0)
2457+
return -ENOSPC;
2458+
2459+
resblks = min_t(uint64_t, UINT_MAX, freeblks);
2460+
resblks = (resblks * 15) >> 4;
2461+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
2462+
0, XFS_TRANS_RESERVE, &tp);
2463+
if (error)
2464+
return error;
2465+
2466+
/*
2467+
* Transfer to this new transaction all the dfops we captured
2468+
* from recovering a single intent item.
2469+
*/
2470+
list_del_init(&dfc->dfc_list);
2471+
xfs_defer_ops_continue(dfc, tp);
2472+
2473+
error = xfs_trans_commit(tp);
2474+
if (error)
2475+
return error;
2476+
}
24642477

2465-
return xfs_trans_commit(tp);
2478+
ASSERT(list_empty(capture_list));
2479+
return 0;
24662480
}
24672481

2482+
/* Release all the captured defer ops and capture structures in this list. */
2483+
static void
2484+
xlog_abort_defer_ops(
2485+
struct xfs_mount *mp,
2486+
struct list_head *capture_list)
2487+
{
2488+
struct xfs_defer_capture *dfc;
2489+
struct xfs_defer_capture *next;
2490+
2491+
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2492+
list_del_init(&dfc->dfc_list);
2493+
xfs_defer_ops_release(mp, dfc);
2494+
}
2495+
}
24682496
/*
24692497
* When this is called, all of the log intent items which did not have
24702498
* corresponding log done items should be in the AIL. What we do now
@@ -2485,35 +2513,23 @@ STATIC int
24852513
xlog_recover_process_intents(
24862514
struct xlog *log)
24872515
{
2488-
struct xfs_trans *parent_tp;
2516+
LIST_HEAD(capture_list);
24892517
struct xfs_ail_cursor cur;
24902518
struct xfs_log_item *lip;
24912519
struct xfs_ail *ailp;
2492-
int error;
2520+
int error = 0;
24932521
#if defined(DEBUG) || defined(XFS_WARN)
24942522
xfs_lsn_t last_lsn;
24952523
#endif
24962524

2497-
/*
2498-
* The intent recovery handlers commit transactions to complete recovery
2499-
* for individual intents, but any new deferred operations that are
2500-
* queued during that process are held off until the very end. The
2501-
* purpose of this transaction is to serve as a container for deferred
2502-
* operations. Each intent recovery handler must transfer dfops here
2503-
* before its local transaction commits, and we'll finish the entire
2504-
* list below.
2505-
*/
2506-
error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
2507-
if (error)
2508-
return error;
2509-
25102525
ailp = log->l_ailp;
25112526
spin_lock(&ailp->ail_lock);
2512-
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
25132527
#if defined(DEBUG) || defined(XFS_WARN)
25142528
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
25152529
#endif
2516-
while (lip != NULL) {
2530+
for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2531+
lip != NULL;
2532+
lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
25172533
/*
25182534
* We're done when we see something other than an intent.
25192535
* There should be no intents left in the AIL now.
@@ -2535,24 +2551,29 @@ xlog_recover_process_intents(
25352551

25362552
/*
25372553
* NOTE: If your intent processing routine can create more
2538-
* deferred ops, you /must/ attach them to the transaction in
2539-
* this routine or else those subsequent intents will get
2554+
* deferred ops, you /must/ attach them to the capture list in
2555+
* the recover routine or else those subsequent intents will be
25402556
* replayed in the wrong order!
25412557
*/
25422558
spin_unlock(&ailp->ail_lock);
2543-
error = lip->li_ops->iop_recover(lip, parent_tp);
2559+
error = lip->li_ops->iop_recover(lip, &capture_list);
25442560
spin_lock(&ailp->ail_lock);
25452561
if (error)
2546-
goto out;
2547-
lip = xfs_trans_ail_cursor_next(ailp, &cur);
2562+
break;
25482563
}
2549-
out:
2564+
25502565
xfs_trans_ail_cursor_done(&cur);
25512566
spin_unlock(&ailp->ail_lock);
2552-
if (!error)
2553-
error = xlog_finish_defer_ops(parent_tp);
2554-
xfs_trans_cancel(parent_tp);
2567+
if (error)
2568+
goto err;
25552569

2570+
error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2571+
if (error)
2572+
goto err;
2573+
2574+
return 0;
2575+
err:
2576+
xlog_abort_defer_ops(log->l_mp, &capture_list);
25562577
return error;
25572578
}
25582579

0 commit comments

Comments
 (0)