Skip to content

Commit 3129946

Browse files
committed
Merge patch series "fs: allow detached mounts in clone_private_mount()"
Christian Brauner <[email protected]> says: In container workloads idmapped mounts are often used as layers for overlayfs. Recently I added the ability to specify layers in overlayfs as file descriptors instead of path names. It should be possible to simply use the detached mounts directly when specifying layers instead of having to attach them beforehand. They are discarded after overlayfs is mounted anyway so it's pointless system calls for userspace and pointless locking for the kernel. This just recently came up again in [1]. So enable clone_private_mount() to use detached mounts directly. The following conditions must be met: - Provided path must be the root of a detached mount tree. - Provided path may not create mount namespace loops. - Provided path must be mounted. It would be possible to be stricter and require that the caller must have CAP_SYS_ADMIN in the owning user namespace of the anonymous mount namespace but since this restriction isn't enforced for move_mount() there's no point in enforcing it for clone_private_mount(). * patches from https://lore.kernel.org/r/20250123-avancieren-erfreuen-3d61f6588fdd@brauner: selftests: add tests for using detached mount with overlayfs fs: allow detached mounts in clone_private_mount() Link: https://lore.kernel.org/r/20250123-avancieren-erfreuen-3d61f6588fdd@brauner Signed-off-by: Christian Brauner <[email protected]>
2 parents 29349a3 + ccc829b commit 3129946

File tree

3 files changed

+190
-35
lines changed

3 files changed

+190
-35
lines changed

fs/namespace.c

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2369,6 +2369,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
23692369
return false;
23702370
}
23712371

2372+
/*
2373+
* Check that there aren't references to earlier/same mount namespaces in the
2374+
* specified subtree. Such references can act as pins for mount namespaces
2375+
* that aren't checked by the mount-cycle checking code, thereby allowing
2376+
* cycles to be made.
2377+
*/
2378+
static bool check_for_nsfs_mounts(struct mount *subtree)
2379+
{
2380+
struct mount *p;
2381+
bool ret = false;
2382+
2383+
lock_mount_hash();
2384+
for (p = subtree; p; p = next_mnt(p, subtree))
2385+
if (mnt_ns_loop(p->mnt.mnt_root))
2386+
goto out;
2387+
2388+
ret = true;
2389+
out:
2390+
unlock_mount_hash();
2391+
return ret;
2392+
}
2393+
23722394
/**
23732395
* clone_private_mount - create a private clone of a path
23742396
* @path: path to clone
@@ -2377,37 +2399,45 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
23772399
* will not be attached anywhere in the namespace and will be private (i.e.
23782400
* changes to the originating mount won't be propagated into this).
23792401
*
2402+
* This assumes caller has called or done the equivalent of may_mount().
2403+
*
23802404
* Release with mntput().
23812405
*/
23822406
struct vfsmount *clone_private_mount(const struct path *path)
23832407
{
23842408
struct mount *old_mnt = real_mount(path->mnt);
23852409
struct mount *new_mnt;
23862410

2387-
down_read(&namespace_sem);
2411+
scoped_guard(rwsem_read, &namespace_sem)
23882412
if (IS_MNT_UNBINDABLE(old_mnt))
2389-
goto invalid;
2413+
return ERR_PTR(-EINVAL);
2414+
2415+
if (mnt_has_parent(old_mnt)) {
2416+
if (!check_mnt(old_mnt))
2417+
return ERR_PTR(-EINVAL);
2418+
} else {
2419+
if (!is_mounted(&old_mnt->mnt))
2420+
return ERR_PTR(-EINVAL);
23902421

2391-
if (!check_mnt(old_mnt))
2392-
goto invalid;
2422+
/* Make sure this isn't something purely kernel internal. */
2423+
if (!is_anon_ns(old_mnt->mnt_ns))
2424+
return ERR_PTR(-EINVAL);
2425+
2426+
/* Make sure we don't create mount namespace loops. */
2427+
if (!check_for_nsfs_mounts(old_mnt))
2428+
return ERR_PTR(-EINVAL);
2429+
}
23932430

23942431
if (has_locked_children(old_mnt, path->dentry))
2395-
goto invalid;
2432+
return ERR_PTR(-EINVAL);
23962433

23972434
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
2398-
up_read(&namespace_sem);
2399-
24002435
if (IS_ERR(new_mnt))
2401-
return ERR_CAST(new_mnt);
2436+
return ERR_PTR(-EINVAL);
24022437

24032438
/* Longterm mount to be removed by kern_unmount*() */
24042439
new_mnt->mnt_ns = MNT_NS_INTERNAL;
2405-
24062440
return &new_mnt->mnt;
2407-
2408-
invalid:
2409-
up_read(&namespace_sem);
2410-
return ERR_PTR(-EINVAL);
24112441
}
24122442
EXPORT_SYMBOL_GPL(clone_private_mount);
24132443

@@ -3206,28 +3236,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
32063236
return 0;
32073237
}
32083238

3209-
/*
3210-
* Check that there aren't references to earlier/same mount namespaces in the
3211-
* specified subtree. Such references can act as pins for mount namespaces
3212-
* that aren't checked by the mount-cycle checking code, thereby allowing
3213-
* cycles to be made.
3214-
*/
3215-
static bool check_for_nsfs_mounts(struct mount *subtree)
3216-
{
3217-
struct mount *p;
3218-
bool ret = false;
3219-
3220-
lock_mount_hash();
3221-
for (p = subtree; p; p = next_mnt(p, subtree))
3222-
if (mnt_ns_loop(p->mnt.mnt_root))
3223-
goto out;
3224-
3225-
ret = true;
3226-
out:
3227-
unlock_mount_hash();
3228-
return ret;
3229-
}
3230-
32313239
static int do_set_group(struct path *from_path, struct path *to_path)
32323240
{
32333241
struct mount *from, *to;

tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,16 @@ FIXTURE(set_layers_via_fds) {
2020
FIXTURE_SETUP(set_layers_via_fds)
{
	/* Scratch mountpoints: one for the overlay, one for its backing tmpfs. */
	ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
	ASSERT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0);
}
2425

2526
FIXTURE_TEARDOWN(set_layers_via_fds)
{
	/* umount2() is best-effort on purpose: the mount may already be gone. */
	umount2("/set_layers_via_fds", 0);
	ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);

	umount2("/set_layers_via_fds_tmpfs", 0);
	ASSERT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0);
}
3034

3135
TEST_F(set_layers_via_fds, set_layers_via_fds)
@@ -279,4 +283,130 @@ TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds)
279283
ASSERT_EQ(close(fd_overlay), 0);
280284
}
281285

286+
TEST_F(set_layers_via_fds, set_layers_via_detached_mount_fds)
287+
{
288+
int fd_context, fd_tmpfs, fd_overlay, fd_tmp;
289+
int layer_fds[] = { [0 ... 8] = -EBADF };
290+
bool layers_found[] = { [0 ... 8] = false };
291+
size_t len = 0;
292+
char *line = NULL;
293+
FILE *f_mountinfo;
294+
295+
ASSERT_EQ(unshare(CLONE_NEWNS), 0);
296+
ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
297+
298+
fd_context = sys_fsopen("tmpfs", 0);
299+
ASSERT_GE(fd_context, 0);
300+
301+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
302+
fd_tmpfs = sys_fsmount(fd_context, 0, 0);
303+
ASSERT_GE(fd_tmpfs, 0);
304+
ASSERT_EQ(close(fd_context), 0);
305+
306+
ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
307+
ASSERT_EQ(mkdirat(fd_tmpfs, "u/upper", 0755), 0);
308+
ASSERT_EQ(mkdirat(fd_tmpfs, "u/work", 0755), 0);
309+
ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
310+
ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
311+
ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0);
312+
ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0);
313+
ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0);
314+
ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0);
315+
ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0);
316+
317+
ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/set_layers_via_fds_tmpfs", MOVE_MOUNT_F_EMPTY_PATH), 0);
318+
319+
fd_tmp = open_tree(fd_tmpfs, "u", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
320+
ASSERT_GE(fd_tmp, 0);
321+
322+
layer_fds[0] = openat(fd_tmp, "upper", O_CLOEXEC | O_DIRECTORY | O_PATH);
323+
ASSERT_GE(layer_fds[0], 0);
324+
325+
layer_fds[1] = openat(fd_tmp, "work", O_CLOEXEC | O_DIRECTORY | O_PATH);
326+
ASSERT_GE(layer_fds[1], 0);
327+
328+
layer_fds[2] = open_tree(fd_tmpfs, "l1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
329+
ASSERT_GE(layer_fds[2], 0);
330+
331+
layer_fds[3] = open_tree(fd_tmpfs, "l2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
332+
ASSERT_GE(layer_fds[3], 0);
333+
334+
layer_fds[4] = open_tree(fd_tmpfs, "l3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
335+
ASSERT_GE(layer_fds[4], 0);
336+
337+
layer_fds[5] = open_tree(fd_tmpfs, "l4", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
338+
ASSERT_GE(layer_fds[5], 0);
339+
340+
layer_fds[6] = open_tree(fd_tmpfs, "d1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
341+
ASSERT_GE(layer_fds[6], 0);
342+
343+
layer_fds[7] = open_tree(fd_tmpfs, "d2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
344+
ASSERT_GE(layer_fds[7], 0);
345+
346+
layer_fds[8] = open_tree(fd_tmpfs, "d3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
347+
ASSERT_GE(layer_fds[8], 0);
348+
349+
ASSERT_EQ(close(fd_tmpfs), 0);
350+
351+
fd_context = sys_fsopen("overlay", 0);
352+
ASSERT_GE(fd_context, 0);
353+
354+
ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
355+
356+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[0]), 0);
357+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[1]), 0);
358+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
359+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
360+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0);
361+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0);
362+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0);
363+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0);
364+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0);
365+
366+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
367+
368+
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
369+
370+
fd_overlay = sys_fsmount(fd_context, 0, 0);
371+
ASSERT_GE(fd_overlay, 0);
372+
373+
ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
374+
375+
f_mountinfo = fopen("/proc/self/mountinfo", "r");
376+
ASSERT_NE(f_mountinfo, NULL);
377+
378+
while (getline(&line, &len, f_mountinfo) != -1) {
379+
char *haystack = line;
380+
381+
if (strstr(haystack, "workdir=/tmp/w"))
382+
layers_found[0] = true;
383+
if (strstr(haystack, "upperdir=/tmp/u"))
384+
layers_found[1] = true;
385+
if (strstr(haystack, "lowerdir+=/tmp/l1"))
386+
layers_found[2] = true;
387+
if (strstr(haystack, "lowerdir+=/tmp/l2"))
388+
layers_found[3] = true;
389+
if (strstr(haystack, "lowerdir+=/tmp/l3"))
390+
layers_found[4] = true;
391+
if (strstr(haystack, "lowerdir+=/tmp/l4"))
392+
layers_found[5] = true;
393+
if (strstr(haystack, "datadir+=/tmp/d1"))
394+
layers_found[6] = true;
395+
if (strstr(haystack, "datadir+=/tmp/d2"))
396+
layers_found[7] = true;
397+
if (strstr(haystack, "datadir+=/tmp/d3"))
398+
layers_found[8] = true;
399+
}
400+
free(line);
401+
402+
for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
403+
ASSERT_EQ(layers_found[i], true);
404+
ASSERT_EQ(close(layer_fds[i]), 0);
405+
}
406+
407+
ASSERT_EQ(close(fd_context), 0);
408+
ASSERT_EQ(close(fd_overlay), 0);
409+
ASSERT_EQ(fclose(f_mountinfo), 0);
410+
}
411+
282412
TEST_HARNESS_MAIN

tools/testing/selftests/filesystems/overlayfs/wrappers.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,21 @@ static inline int sys_move_mount(int from_dfd, const char *from_pathname,
4444
to_pathname, flags);
4545
}
4646

47+
/*
 * open_tree(2) constants may be missing from older system headers; the
 * fallback values match include/uapi/linux/mount.h and fcntl.h.
 */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif

#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000
#endif

/* Thin raw-syscall wrapper for open_tree(2). */
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
	return syscall(__NR_open_tree, dfd, filename, flags);
}
63+
4764
#endif

0 commit comments

Comments
 (0)