
Commit 3649833

boryas authored and kdave committed
btrfs: fix broken drop_caches on extent buffer folios
The (correct) commit e41c81d ("mm/truncate: Replace page_mapped() call in
invalidate_inode_page()") replaced the page_mapped(page) check with a
refcount check. However, this refcount check does not work as expected with
drop_caches for btrfs's metadata pages.

Btrfs has a per-sb metadata inode with cached pages, and when not in active
use by btrfs, they have a refcount of 3: one from the initial call to
alloc_pages(), one (nr_pages == 1) from filemap_add_folio(), and one from
folio_attach_private(). We would expect such pages to get dropped by
drop_caches. However, drop_caches calls into mapping_evict_folio() via
mapping_try_invalidate(), which takes an extra reference on the folio with
find_lock_entries(). As a result, these pages have a refcount of 4 and fail
the check (see the sketch below).

For what it's worth, such pages do get reclaimed under memory pressure, so
while this behavior is surprising, it is not dangerously broken.

When I asked the mm folks about the expected refcount in this case, I was
told that the correct thing to do is to donate the refcount from the
original allocation to the page cache after inserting the page.

Therefore, attempt to fix this by adding a folio_put() at the critical spot
in alloc_extent_buffer() where we are sure that we have really allocated
and attached new pages. We must also adjust detach_extent_buffer_folio() to
properly handle being the last reference holder, and to not touch the folio
after folio_detach_private() has dropped that reference (a use-after-free).

Extent buffers allocated by btrfs_clone_extent_buffer() and
alloc_dummy_extent_buffer() are unmapped, so this transfer of ownership
from allocation to insertion in the mapping does not apply to them.
However, we can still safely folio_put() their folios once they are
finished being allocated and folio_attach_private() has been called.

Finally, removing the per-folio folio_put() for the allocation from
btrfs_release_extent_buffer_folios() means we need to be careful to do the
appropriate folio_put() in the allocation failure paths of
alloc_extent_buffer(), btrfs_clone_extent_buffer() and
alloc_dummy_extent_buffer().

Link: https://lore.kernel.org/linux-mm/[email protected]/
Tested-by: Klara Modin <[email protected]>
Reviewed-by: Daniel Vacek <[email protected]>
Reviewed-by: Filipe Manana <[email protected]>
Signed-off-by: Boris Burkov <[email protected]>
Signed-off-by: David Sterba <[email protected]>
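For reference, a minimal sketch of the check in question, paraphrasing the
refcount test that mm/truncate.c:mapping_evict_folio() applies (via the
change in e41c81d) before dropping a clean, unused folio. The folio may be
held only by the page cache (folio_nr_pages() references), an optional
private holder, and the caller:

	if (folio_ref_count(folio) >
	    folio_nr_pages(folio) + folio_has_private(folio) + 1)
		return 0;	/* still in use: do not evict */

Before this fix, an idle btrfs metadata folio reached this check with a
refcount of 4 (allocation + page cache + private + the reference held by
find_lock_entries()), while the allowed maximum evaluates to
1 + 1 + 1 = 3, so drop_caches always skipped it. With the allocation
reference donated to the page cache, the folio sits at 3 and passes.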
1 parent 1886b77 commit 3649833

File tree

1 file changed (+71 / -45 lines)


fs/btrfs/extent_io.c

Lines changed: 71 additions & 45 deletions
@@ -2747,18 +2747,19 @@ static bool folio_range_has_eb(struct folio *folio)
 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
+	struct address_space *mapping = folio->mapping;
 	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
 
 	/*
 	 * For mapped eb, we're going to change the folio private, which should
 	 * be done under the i_private_lock.
 	 */
 	if (mapped)
-		spin_lock(&folio->mapping->i_private_lock);
+		spin_lock(&mapping->i_private_lock);
 
 	if (!folio_test_private(folio)) {
 		if (mapped)
-			spin_unlock(&folio->mapping->i_private_lock);
+			spin_unlock(&mapping->i_private_lock);
 		return;
 	}
 
@@ -2777,7 +2778,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
 		folio_detach_private(folio);
 	}
 	if (mapped)
-		spin_unlock(&folio->mapping->i_private_lock);
+		spin_unlock(&mapping->i_private_lock);
 	return;
 }
 
@@ -2800,7 +2801,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
 	if (!folio_range_has_eb(folio))
 		btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
 
-	spin_unlock(&folio->mapping->i_private_lock);
+	spin_unlock(&mapping->i_private_lock);
 }
 
 /* Release all folios attached to the extent buffer */
@@ -2815,9 +2816,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
 			continue;
 
 		detach_extent_buffer_folio(eb, folio);
-
-		/* One for when we allocated the folio. */
-		folio_put(folio);
 	}
 }
 
@@ -2852,9 +2850,28 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
 	return eb;
 }
 
+/*
+ * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer()
+ * does not call folio_put(), and we need to set the folios to NULL so that
+ * btrfs_release_extent_buffer() will not detach them a second time.
+ */
+static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
+{
+	const int num_folios = num_extent_folios(eb);
+
+	/* We cannot use num_extent_folios() as loop bound as eb->folios changes. */
+	for (int i = 0; i < num_folios; i++) {
+		ASSERT(eb->folios[i]);
+		detach_extent_buffer_folio(eb, eb->folios[i]);
+		folio_put(eb->folios[i]);
+		eb->folios[i] = NULL;
+	}
+}
+
 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 {
 	struct extent_buffer *new;
+	int num_folios;
 	int ret;
 
 	new = __alloc_extent_buffer(src->fs_info, src->start);
@@ -2869,25 +2886,34 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
 
 	ret = alloc_eb_folio_array(new, false);
-	if (ret) {
-		btrfs_release_extent_buffer(new);
-		return NULL;
-	}
+	if (ret)
+		goto release_eb;
 
-	for (int i = 0; i < num_extent_folios(src); i++) {
+	ASSERT(num_extent_folios(src) == num_extent_folios(new),
+	       "%d != %d", num_extent_folios(src), num_extent_folios(new));
+	/* Explicitly use the cached num_extent value from now on. */
+	num_folios = num_extent_folios(src);
+	for (int i = 0; i < num_folios; i++) {
 		struct folio *folio = new->folios[i];
 
 		ret = attach_extent_buffer_folio(new, folio, NULL);
-		if (ret < 0) {
-			btrfs_release_extent_buffer(new);
-			return NULL;
-		}
+		if (ret < 0)
+			goto cleanup_folios;
 		WARN_ON(folio_test_dirty(folio));
 	}
+	for (int i = 0; i < num_folios; i++)
+		folio_put(new->folios[i]);
+
 	copy_extent_buffer_full(new, src);
 	set_extent_buffer_uptodate(new);
 
 	return new;
+
+cleanup_folios:
+	cleanup_extent_buffer_folios(new);
+release_eb:
+	btrfs_release_extent_buffer(new);
+	return NULL;
 }
 
 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -2902,29 +2928,26 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	ret = alloc_eb_folio_array(eb, false);
 	if (ret)
-		goto out;
+		goto release_eb;
 
 	for (int i = 0; i < num_extent_folios(eb); i++) {
 		ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
 		if (ret < 0)
-			goto out_detach;
+			goto cleanup_folios;
 	}
+	for (int i = 0; i < num_extent_folios(eb); i++)
+		folio_put(eb->folios[i]);
 
 	set_extent_buffer_uptodate(eb);
 	btrfs_set_header_nritems(eb, 0);
 	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
 
 	return eb;
 
-out_detach:
-	for (int i = 0; i < num_extent_folios(eb); i++) {
-		if (eb->folios[i]) {
-			detach_extent_buffer_folio(eb, eb->folios[i]);
-			folio_put(eb->folios[i]);
-		}
-	}
-out:
-	kmem_cache_free(extent_buffer_cache, eb);
+cleanup_folios:
+	cleanup_extent_buffer_folios(eb);
+release_eb:
+	btrfs_release_extent_buffer(eb);
 	return NULL;
 }
 
@@ -3357,8 +3380,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	 * btree_release_folio will correctly detect that a page belongs to a
 	 * live buffer and won't free them prematurely.
 	 */
-	for (int i = 0; i < num_extent_folios(eb); i++)
+	for (int i = 0; i < num_extent_folios(eb); i++) {
 		folio_unlock(eb->folios[i]);
+		/*
+		 * A folio that has been added to an address_space mapping
+		 * should not continue holding the refcount from its original
+		 * allocation indefinitely.
+		 */
+		folio_put(eb->folios[i]);
+	}
 	return eb;
 
 out:
@@ -3372,26 +3402,22 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	 * want that to grab this eb, as we're getting ready to free it. So we
 	 * have to detach it first and then unlock it.
 	 *
-	 * We have to drop our reference and NULL it out here because in the
-	 * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
-	 * Below when we call btrfs_release_extent_buffer() we will call
-	 * detach_extent_buffer_folio() on our remaining pages in the !subpage
-	 * case. If we left eb->folios[i] populated in the subpage case we'd
-	 * double put our reference and be super sad.
+	 * Note: the bound is num_extent_pages() as we need to go through all slots.
 	 */
-	for (int i = 0; i < attached; i++) {
-		ASSERT(eb->folios[i]);
-		detach_extent_buffer_folio(eb, eb->folios[i]);
-		folio_unlock(eb->folios[i]);
-		folio_put(eb->folios[i]);
+	for (int i = 0; i < num_extent_pages(eb); i++) {
+		struct folio *folio = eb->folios[i];
+
+		if (i < attached) {
+			ASSERT(folio);
+			detach_extent_buffer_folio(eb, folio);
+			folio_unlock(folio);
+		} else if (!folio) {
+			continue;
+		}
+
+		folio_put(folio);
 		eb->folios[i] = NULL;
 	}
-	/*
-	 * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
-	 * so it can be cleaned up without utilizing folio->mapping.
-	 */
-	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
-
 	btrfs_release_extent_buffer(eb);
 	if (ret < 0)
 		return ERR_PTR(ret);
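To illustrate the ownership rule the fix adopts, here is a minimal,
self-contained sketch (not btrfs code; add_metadata_folio() is a
hypothetical helper and error handling is abbreviated) of allocating a
folio, inserting it into a mapping, attaching private data, and then
donating the allocation reference to the page cache:

	/* Hypothetical helper: insert an order-0 folio, then donate the alloc ref. */
	static int add_metadata_folio(struct address_space *mapping, pgoff_t index,
				      void *private)
	{
		struct folio *folio = folio_alloc(GFP_NOFS, 0);	/* refcount == 1 */
		int ret;

		if (!folio)
			return -ENOMEM;

		ret = filemap_add_folio(mapping, folio, index, GFP_NOFS);
		if (ret) {
			/* Insertion failed; we still own the allocation reference. */
			folio_put(folio);
			return ret;
		}
		/* Page cache now holds folio_nr_pages() == 1 reference: refcount == 2. */

		folio_attach_private(folio, private);	/* refcount == 3 */

		/* filemap_add_folio() returns with the folio locked. */
		folio_unlock(folio);

		/*
		 * Donate the allocation reference to the page cache. An otherwise
		 * idle folio now sits at refcount 2 (mapping + private), low enough
		 * to pass the eviction check even while drop_caches holds its own
		 * temporary reference.
		 */
		folio_put(folio);
		return 0;
	}

This mirrors the order of operations alloc_extent_buffer() now follows once
all folios are attached, and it is why btrfs_release_extent_buffer_folios()
above no longer drops a per-folio reference.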
