Skip to content

Commit 50d5c8d

Browse files
Dave Chinner authored and Ben Myers committed
xfs: check LSN ordering for v5 superblocks during recovery
Log recovery has some strict ordering requirements which unordered or reordered metadata writeback can defeat. This can occur when an item is logged in a transaction, written back to disk, and then logged in a new transaction before the tail of the log is moved past the original modification. The result of this is that when we read an object off disk for recovery purposes, the buffer that we read may not contain the object type that recovery is expecting and hence at the end of the checkpoint being recovered we have an invalid object in memory. This isn't usually a problem, as recovery will then replay all the other checkpoints and that brings the object back to a valid and correct state, but the issue is that while the object is in the invalid state it can be flushed to disk. This results in the object verifier failing and triggering a corruption shutdown of log recovery. This is correct behaviour for the verifiers - the problem is that we are not detecting that the object we've read off disk is newer than the transaction we are replaying. All metadata in v5 filesystems has the LSN of its last modification stamped in it. This enables log recovery to read that field and determine the age of the object on disk correctly. If the LSN of the object on disk is older than the transaction being replayed, then we replay the modification. If the LSN of the object matches or is more recent than the transaction's LSN, then we should avoid overwriting the object as that is what leads to the transient corrupt state. Signed-off-by: Dave Chinner <[email protected]> Reviewed-by: Mark Tinguely <[email protected]> Signed-off-by: Ben Myers <[email protected]>
1 parent b58fa55 commit 50d5c8d

File tree

1 file changed

+156
-13
lines changed

1 file changed

+156
-13
lines changed

fs/xfs/xfs_log_recover.c

Lines changed: 156 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1958,6 +1958,104 @@ xlog_recover_do_inode_buffer(
19581958
return 0;
19591959
}
19601960

1961+
/*
1962+
* V5 filesystems know the age of the buffer on disk being recovered. We can
1963+
* have newer objects on disk than we are replaying, and so for these cases we
1964+
* don't want to replay the current change as that will make the buffer contents
1965+
* temporarily invalid on disk.
1966+
*
1967+
* The magic number might not match the buffer type we are going to recover
1968+
* (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
1969+
* extract the LSN of the existing object in the buffer based on it's current
1970+
* magic number. If we don't recognise the magic number in the buffer, then
1971+
* return a LSN of -1 so that the caller knows it was an unrecognised block and
1972+
* so can recover the buffer.
1973+
*/
1974+
static xfs_lsn_t
1975+
xlog_recover_get_buf_lsn(
1976+
struct xfs_mount *mp,
1977+
struct xfs_buf *bp)
1978+
{
1979+
__uint32_t magic32;
1980+
__uint16_t magic16;
1981+
__uint16_t magicda;
1982+
void *blk = bp->b_addr;
1983+
1984+
/* v4 filesystems always recover immediately */
1985+
if (!xfs_sb_version_hascrc(&mp->m_sb))
1986+
goto recover_immediately;
1987+
1988+
magic32 = be32_to_cpu(*(__be32 *)blk);
1989+
switch (magic32) {
1990+
case XFS_ABTB_CRC_MAGIC:
1991+
case XFS_ABTC_CRC_MAGIC:
1992+
case XFS_ABTB_MAGIC:
1993+
case XFS_ABTC_MAGIC:
1994+
case XFS_IBT_CRC_MAGIC:
1995+
case XFS_IBT_MAGIC:
1996+
return be64_to_cpu(
1997+
((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
1998+
case XFS_BMAP_CRC_MAGIC:
1999+
case XFS_BMAP_MAGIC:
2000+
return be64_to_cpu(
2001+
((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
2002+
case XFS_AGF_MAGIC:
2003+
return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2004+
case XFS_AGFL_MAGIC:
2005+
return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2006+
case XFS_AGI_MAGIC:
2007+
return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2008+
case XFS_SYMLINK_MAGIC:
2009+
return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2010+
case XFS_DIR3_BLOCK_MAGIC:
2011+
case XFS_DIR3_DATA_MAGIC:
2012+
case XFS_DIR3_FREE_MAGIC:
2013+
return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2014+
case XFS_ATTR3_RMT_MAGIC:
2015+
return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
2016+
case XFS_SB_MAGIC:
2017+
return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
2018+
default:
2019+
break;
2020+
}
2021+
2022+
magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2023+
switch (magicda) {
2024+
case XFS_DIR3_LEAF1_MAGIC:
2025+
case XFS_DIR3_LEAFN_MAGIC:
2026+
case XFS_DA3_NODE_MAGIC:
2027+
return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2028+
default:
2029+
break;
2030+
}
2031+
2032+
/*
2033+
* We do individual object checks on dquot and inode buffers as they
2034+
* have their own individual LSN records. Also, we could have a stale
2035+
* buffer here, so we have to at least recognise these buffer types.
2036+
*
2037+
* A notd complexity here is inode unlinked list processing - it logs
2038+
* the inode directly in the buffer, but we don't know which inodes have
2039+
* been modified, and there is no global buffer LSN. Hence we need to
2040+
* recover all inode buffer types immediately. This problem will be
2041+
* fixed by logical logging of the unlinked list modifications.
2042+
*/
2043+
magic16 = be16_to_cpu(*(__be16 *)blk);
2044+
switch (magic16) {
2045+
case XFS_DQUOT_MAGIC:
2046+
case XFS_DINODE_MAGIC:
2047+
goto recover_immediately;
2048+
default:
2049+
break;
2050+
}
2051+
2052+
/* unknown buffer contents, recover immediately */
2053+
2054+
recover_immediately:
2055+
return (xfs_lsn_t)-1;
2056+
2057+
}
2058+
19612059
/*
19622060
* Validate the recovered buffer is of the correct type and attach the
19632061
* appropriate buffer operations to them for writeback. Magic numbers are in a
@@ -1967,7 +2065,7 @@ xlog_recover_do_inode_buffer(
19672065
* inside a struct xfs_da_blkinfo at the start of the buffer.
19682066
*/
19692067
static void
1970-
xlog_recovery_validate_buf_type(
2068+
xlog_recover_validate_buf_type(
19712069
struct xfs_mount *mp,
19722070
struct xfs_buf *bp,
19732071
xfs_buf_log_format_t *buf_f)
@@ -2246,7 +2344,7 @@ xlog_recover_do_reg_buffer(
22462344
* just avoid the verification stage for non-crc filesystems
22472345
*/
22482346
if (xfs_sb_version_hascrc(&mp->m_sb))
2249-
xlog_recovery_validate_buf_type(mp, bp, buf_f);
2347+
xlog_recover_validate_buf_type(mp, bp, buf_f);
22502348
}
22512349

22522350
/*
@@ -2444,13 +2542,15 @@ STATIC int
24442542
xlog_recover_buffer_pass2(
24452543
struct xlog *log,
24462544
struct list_head *buffer_list,
2447-
struct xlog_recover_item *item)
2545+
struct xlog_recover_item *item,
2546+
xfs_lsn_t current_lsn)
24482547
{
24492548
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
24502549
xfs_mount_t *mp = log->l_mp;
24512550
xfs_buf_t *bp;
24522551
int error;
24532552
uint buf_flags;
2553+
xfs_lsn_t lsn;
24542554

24552555
/*
24562556
* In this pass we only want to recover all the buffers which have
@@ -2475,10 +2575,17 @@ xlog_recover_buffer_pass2(
24752575
error = bp->b_error;
24762576
if (error) {
24772577
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2478-
xfs_buf_relse(bp);
2479-
return error;
2578+
goto out_release;
24802579
}
24812580

2581+
/*
2582+
* recover the buffer only if we get an LSN from it and it's less than
2583+
* the lsn of the transaction we are replaying.
2584+
*/
2585+
lsn = xlog_recover_get_buf_lsn(mp, bp);
2586+
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
2587+
goto out_release;
2588+
24822589
if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
24832590
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
24842591
} else if (buf_f->blf_flags &
@@ -2488,7 +2595,7 @@ xlog_recover_buffer_pass2(
24882595
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
24892596
}
24902597
if (error)
2491-
return XFS_ERROR(error);
2598+
goto out_release;
24922599

24932600
/*
24942601
* Perform delayed write on the buffer. Asynchronous writes will be
@@ -2517,6 +2624,7 @@ xlog_recover_buffer_pass2(
25172624
xfs_buf_delwri_queue(bp, buffer_list);
25182625
}
25192626

2627+
out_release:
25202628
xfs_buf_relse(bp);
25212629
return error;
25222630
}
@@ -2525,7 +2633,8 @@ STATIC int
25252633
xlog_recover_inode_pass2(
25262634
struct xlog *log,
25272635
struct list_head *buffer_list,
2528-
struct xlog_recover_item *item)
2636+
struct xlog_recover_item *item,
2637+
xfs_lsn_t current_lsn)
25292638
{
25302639
xfs_inode_log_format_t *in_f;
25312640
xfs_mount_t *mp = log->l_mp;
@@ -2604,6 +2713,20 @@ xlog_recover_inode_pass2(
26042713
goto error;
26052714
}
26062715

2716+
/*
2717+
* If the inode has an LSN in it, recover the inode only if it's less
2718+
* than the lsn of the transaction we are replaying.
2719+
*/
2720+
if (dip->di_version >= 3) {
2721+
xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
2722+
2723+
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2724+
trace_xfs_log_recover_inode_skip(log, in_f);
2725+
error = 0;
2726+
goto out_release;
2727+
}
2728+
}
2729+
26072730
/*
26082731
* di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
26092732
* are transactional and if ordering is necessary we can determine that
@@ -2793,6 +2916,8 @@ xlog_recover_inode_pass2(
27932916
ASSERT(bp->b_target->bt_mount == mp);
27942917
bp->b_iodone = xlog_recover_iodone;
27952918
xfs_buf_delwri_queue(bp, buffer_list);
2919+
2920+
out_release:
27962921
xfs_buf_relse(bp);
27972922
error:
27982923
if (need_free)
@@ -2834,7 +2959,8 @@ STATIC int
28342959
xlog_recover_dquot_pass2(
28352960
struct xlog *log,
28362961
struct list_head *buffer_list,
2837-
struct xlog_recover_item *item)
2962+
struct xlog_recover_item *item,
2963+
xfs_lsn_t current_lsn)
28382964
{
28392965
xfs_mount_t *mp = log->l_mp;
28402966
xfs_buf_t *bp;
@@ -2908,6 +3034,19 @@ xlog_recover_dquot_pass2(
29083034
return XFS_ERROR(EIO);
29093035
}
29103036

3037+
/*
3038+
* If the dquot has an LSN in it, recover the dquot only if it's less
3039+
* than the lsn of the transaction we are replaying.
3040+
*/
3041+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
3042+
struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3043+
xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
3044+
3045+
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3046+
goto out_release;
3047+
}
3048+
}
3049+
29113050
memcpy(ddq, recddq, item->ri_buf[1].i_len);
29123051
if (xfs_sb_version_hascrc(&mp->m_sb)) {
29133052
xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2918,9 +3057,10 @@ xlog_recover_dquot_pass2(
29183057
ASSERT(bp->b_target->bt_mount == mp);
29193058
bp->b_iodone = xlog_recover_iodone;
29203059
xfs_buf_delwri_queue(bp, buffer_list);
2921-
xfs_buf_relse(bp);
29223060

2923-
return (0);
3061+
out_release:
3062+
xfs_buf_relse(bp);
3063+
return 0;
29243064
}
29253065

29263066
/*
@@ -3267,15 +3407,18 @@ xlog_recover_commit_pass2(
32673407

32683408
switch (ITEM_TYPE(item)) {
32693409
case XFS_LI_BUF:
3270-
return xlog_recover_buffer_pass2(log, buffer_list, item);
3410+
return xlog_recover_buffer_pass2(log, buffer_list, item,
3411+
trans->r_lsn);
32713412
case XFS_LI_INODE:
3272-
return xlog_recover_inode_pass2(log, buffer_list, item);
3413+
return xlog_recover_inode_pass2(log, buffer_list, item,
3414+
trans->r_lsn);
32733415
case XFS_LI_EFI:
32743416
return xlog_recover_efi_pass2(log, item, trans->r_lsn);
32753417
case XFS_LI_EFD:
32763418
return xlog_recover_efd_pass2(log, item);
32773419
case XFS_LI_DQUOT:
3278-
return xlog_recover_dquot_pass2(log, buffer_list, item);
3420+
return xlog_recover_dquot_pass2(log, buffer_list, item,
3421+
trans->r_lsn);
32793422
case XFS_LI_ICREATE:
32803423
return xlog_recover_do_icreate_pass2(log, buffer_list, item);
32813424
case XFS_LI_QUOTAOFF:

0 commit comments

Comments
 (0)