Skip to content

Commit 49cb2fc

Browse files
adrianreber authored and Christian Brauner committed
fork: extend clone3() to support setting a PID
The main motivation to add set_tid to clone3() is CRIU.

To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support *set_tid makes it possible to restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available).

This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid.

The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest.

To create a process with the following PIDs:

    PID NS level    Requested PID
    0 (host)        31496
    1               42
    2               1

For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be:

    set_tid[0] = 1;
    set_tid[1] = 42;
    set_tid[2] = 31496;
    set_tid_size = 3;

If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this:

    set_tid[0] = 1;
    set_tid[1] = 42;
    set_tid_size = 2;

The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1.

The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces.

set_tid_size cannot be larger than the number of currently nested PID namespaces, i.e. the current PID namespace level + 1 (in the example above the process is at level 2, so set_tid_size can be at most 3).
Signed-off-by: Adrian Reber <[email protected]> Reviewed-by: Christian Brauner <[email protected]> Reviewed-by: Oleg Nesterov <[email protected]> Reviewed-by: Dmitry Safonov <[email protected]> Acked-by: Andrei Vagin <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Christian Brauner <[email protected]>
1 parent 17a8106 commit 49cb2fc

File tree

7 files changed

+122
-37
lines changed

7 files changed

+122
-37
lines changed

include/linux/pid.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr);
124124
extern struct pid *find_get_pid(int nr);
125125
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
126126

127-
extern struct pid *alloc_pid(struct pid_namespace *ns);
127+
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
128+
size_t set_tid_size);
128129
extern void free_pid(struct pid *pid);
129130
extern void disable_pid_allocation(struct pid_namespace *ns);
130131

include/linux/pid_namespace.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
#include <linux/ns_common.h>
1313
#include <linux/idr.h>
1414

15+
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
16+
#define MAX_PID_NS_LEVEL 32
1517

1618
struct fs_pin;
1719

include/linux/sched/task.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ struct kernel_clone_args {
2626
unsigned long stack;
2727
unsigned long stack_size;
2828
unsigned long tls;
29+
pid_t *set_tid;
30+
/* Number of elements in *set_tid */
31+
size_t set_tid_size;
2932
};
3033

3134
/*

include/uapi/linux/sched.h

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -39,24 +39,38 @@
3939
#ifndef __ASSEMBLY__
4040
/**
4141
* struct clone_args - arguments for the clone3 syscall
42-
* @flags: Flags for the new process as listed above.
43-
* All flags are valid except for CSIGNAL and
44-
* CLONE_DETACHED.
45-
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
46-
* returned in this argument.
47-
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
48-
* child process will be returned in the child's
49-
* memory.
50-
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
51-
* the child process will be returned in the
52-
* parent's memory.
53-
* @exit_signal: The exit_signal the parent process will be
54-
* sent when the child exits.
55-
* @stack: Specify the location of the stack for the
56-
* child process.
57-
* @stack_size: The size of the stack for the child process.
58-
* @tls: If CLONE_SETTLS is set, the tls descriptor
59-
* is set to tls.
42+
* @flags: Flags for the new process as listed above.
43+
* All flags are valid except for CSIGNAL and
44+
* CLONE_DETACHED.
45+
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
46+
* returned in this argument.
47+
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
48+
* child process will be returned in the child's
49+
* memory.
50+
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
51+
* the child process will be returned in the
52+
* parent's memory.
53+
* @exit_signal: The exit_signal the parent process will be
54+
* sent when the child exits.
55+
* @stack: Specify the location of the stack for the
56+
* child process.
57+
* @stack_size: The size of the stack for the child process.
58+
* @tls: If CLONE_SETTLS is set, the tls descriptor
59+
* is set to tls.
60+
* @set_tid: Pointer to an array of type pid_t. The size
61+
* of the array is defined using @set_tid_size.
62+
* This array is used to select PIDs/TIDs for
63+
* newly created processes. The first element in
64+
* this defines the PID in the most nested PID
65+
* namespace. Each additional element in the array
66+
* defines the PID in the parent PID namespace of
67+
* the original PID namespace. If the array has
68+
* fewer entries than the number of currently
69+
* nested PID namespaces only the PIDs in the
70+
* corresponding namespaces are set.
71+
* @set_tid_size: This defines the size of the array referenced
72+
* in @set_tid. This cannot be larger than the
73+
* kernel's limit of nested PID namespaces.
6074
*
6175
* The structure is versioned by size and thus extensible.
6276
* New struct members must go at the end of the struct and
@@ -71,10 +85,13 @@ struct clone_args {
7185
__aligned_u64 stack;
7286
__aligned_u64 stack_size;
7387
__aligned_u64 tls;
88+
__aligned_u64 set_tid;
89+
__aligned_u64 set_tid_size;
7490
};
7591
#endif
7692

7793
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
94+
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
7895

7996
/*
8097
* Scheduling policies

kernel/fork.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process(
20872087
stackleak_task_init(p);
20882088

20892089
if (pid != &init_struct_pid) {
2090-
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
2090+
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2091+
args->set_tid_size);
20912092
if (IS_ERR(pid)) {
20922093
retval = PTR_ERR(pid);
20932094
goto bad_fork_cleanup_thread;
@@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
25902591
{
25912592
int err;
25922593
struct clone_args args;
2594+
pid_t *kset_tid = kargs->set_tid;
25932595

25942596
if (unlikely(usize > PAGE_SIZE))
25952597
return -E2BIG;
@@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
26002602
if (err)
26012603
return err;
26022604

2605+
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2606+
return -EINVAL;
2607+
2608+
if (unlikely(!args.set_tid && args.set_tid_size > 0))
2609+
return -EINVAL;
2610+
2611+
if (unlikely(args.set_tid && args.set_tid_size == 0))
2612+
return -EINVAL;
2613+
26032614
/*
26042615
* Verify that higher 32bits of exit_signal are unset and that
26052616
* it is a valid signal
@@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
26172628
.stack = args.stack,
26182629
.stack_size = args.stack_size,
26192630
.tls = args.tls,
2631+
.set_tid_size = args.set_tid_size,
26202632
};
26212633

2634+
if (args.set_tid &&
2635+
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2636+
(kargs->set_tid_size * sizeof(pid_t))))
2637+
return -EFAULT;
2638+
2639+
kargs->set_tid = kset_tid;
2640+
26222641
return 0;
26232642
}
26242643

@@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
26622681
int err;
26632682

26642683
struct kernel_clone_args kargs;
2684+
pid_t set_tid[MAX_PID_NS_LEVEL];
2685+
2686+
kargs.set_tid = set_tid;
26652687

26662688
err = copy_clone_args_from_user(&kargs, uargs, size);
26672689
if (err)

kernel/pid.c

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,8 @@ void free_pid(struct pid *pid)
157157
call_rcu(&pid->rcu, delayed_put_pid);
158158
}
159159

160-
struct pid *alloc_pid(struct pid_namespace *ns)
160+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
161+
size_t set_tid_size)
161162
{
162163
struct pid *pid;
163164
enum pid_type type;
@@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
166167
struct upid *upid;
167168
int retval = -ENOMEM;
168169

170+
/*
171+
* set_tid_size contains the size of the set_tid array. Starting at
172+
* the most nested currently active PID namespace it tells alloc_pid()
173+
* which PID to set for a process in that most nested PID namespace
174+
* up to set_tid_size PID namespaces. It does not have to set the PID
175+
* for a process in all nested PID namespaces but set_tid_size must
176+
* never be greater than the current ns->level + 1.
177+
*/
178+
if (set_tid_size > ns->level + 1)
179+
return ERR_PTR(-EINVAL);
180+
169181
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
170182
if (!pid)
171183
return ERR_PTR(retval);
@@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns)
174186
pid->level = ns->level;
175187

176188
for (i = ns->level; i >= 0; i--) {
177-
int pid_min = 1;
189+
int tid = 0;
190+
191+
if (set_tid_size) {
192+
tid = set_tid[ns->level - i];
193+
194+
retval = -EINVAL;
195+
if (tid < 1 || tid >= pid_max)
196+
goto out_free;
197+
/*
198+
* Also fail if a PID != 1 is requested and
199+
* no PID 1 exists.
200+
*/
201+
if (tid != 1 && !tmp->child_reaper)
202+
goto out_free;
203+
retval = -EPERM;
204+
if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
205+
goto out_free;
206+
set_tid_size--;
207+
}
178208

179209
idr_preload(GFP_KERNEL);
180210
spin_lock_irq(&pidmap_lock);
181211

182-
/*
183-
* init really needs pid 1, but after reaching the maximum
184-
* wrap back to RESERVED_PIDS
185-
*/
186-
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
187-
pid_min = RESERVED_PIDS;
188-
189-
/*
190-
* Store a null pointer so find_pid_ns does not find
191-
* a partially initialized PID (see below).
192-
*/
193-
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
194-
pid_max, GFP_ATOMIC);
212+
if (tid) {
213+
nr = idr_alloc(&tmp->idr, NULL, tid,
214+
tid + 1, GFP_ATOMIC);
215+
/*
216+
* If ENOSPC is returned it means that the PID is
217+
* alreay in use. Return EEXIST in that case.
218+
*/
219+
if (nr == -ENOSPC)
220+
nr = -EEXIST;
221+
} else {
222+
int pid_min = 1;
223+
/*
224+
* init really needs pid 1, but after reaching the
225+
* maximum wrap back to RESERVED_PIDS
226+
*/
227+
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
228+
pid_min = RESERVED_PIDS;
229+
230+
/*
231+
* Store a null pointer so find_pid_ns does not find
232+
* a partially initialized PID (see below).
233+
*/
234+
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
235+
pid_max, GFP_ATOMIC);
236+
}
195237
spin_unlock_irq(&pidmap_lock);
196238
idr_preload_end();
197239

kernel/pid_namespace.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626

2727
static DEFINE_MUTEX(pid_caches_mutex);
2828
static struct kmem_cache *pid_ns_cachep;
29-
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
30-
#define MAX_PID_NS_LEVEL 32
3129
/* Write once array, filled from the beginning. */
3230
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
3331

0 commit comments

Comments
 (0)