// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 Benjamin Berg <[email protected]>
 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */
56
2425#include <kern_util.h>
2526#include <mem_user.h>
2627#include <ptrace_user.h>
28+ #include <stdbool.h>
29+ #include <stub-data.h>
30+ #include <sys/prctl.h>
31+ #include <linux/seccomp.h>
32+ #include <linux/filter.h>
33+ #include <sysdep/mcontext.h>
34+ #include <sysdep/stub.h>
2735#include <registers.h>
2836#include <skas.h>
2937#include "internal.h"
@@ -224,6 +232,140 @@ static void __init check_ptrace(void)
224232 check_sysemu ();
225233}
226234
235+ extern unsigned long host_fp_size ;
236+ extern unsigned long exec_regs [MAX_REG_NR ];
237+ extern unsigned long * exec_fp_regs ;
238+
239+ __initdata static struct stub_data * seccomp_test_stub_data ;
240+
241+ static void __init sigsys_handler (int sig , siginfo_t * info , void * p )
242+ {
243+ ucontext_t * uc = p ;
244+
245+ /* Stow away the location of the mcontext in the stack */
246+ seccomp_test_stub_data -> mctx_offset = (unsigned long )& uc -> uc_mcontext -
247+ (unsigned long )& seccomp_test_stub_data -> sigstack [0 ];
248+
249+ /* Prevent libc from clearing memory (mctx_offset in particular) */
250+ syscall (__NR_exit , 0 );
251+ }
252+
253+ static int __init seccomp_helper (void * data )
254+ {
255+ static struct sock_filter filter [] = {
256+ BPF_STMT (BPF_LD | BPF_W | BPF_ABS ,
257+ offsetof(struct seccomp_data , nr )),
258+ BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K , __NR_clock_nanosleep , 1 , 0 ),
259+ BPF_STMT (BPF_RET | BPF_K , SECCOMP_RET_ALLOW ),
260+ BPF_STMT (BPF_RET | BPF_K , SECCOMP_RET_TRAP ),
261+ };
262+ static struct sock_fprog prog = {
263+ .len = ARRAY_SIZE (filter ),
264+ .filter = filter ,
265+ };
266+ struct sigaction sa ;
267+
268+ set_sigstack (seccomp_test_stub_data -> sigstack ,
269+ sizeof (seccomp_test_stub_data -> sigstack ));
270+
271+ sa .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO ;
272+ sa .sa_sigaction = (void * ) sigsys_handler ;
273+ sa .sa_restorer = NULL ;
274+ if (sigaction (SIGSYS , & sa , NULL ) < 0 )
275+ exit (1 );
276+
277+ prctl (PR_SET_NO_NEW_PRIVS , 1 , 0 , 0 , 0 );
278+ if (syscall (__NR_seccomp , SECCOMP_SET_MODE_FILTER ,
279+ SECCOMP_FILTER_FLAG_TSYNC , & prog ) != 0 )
280+ exit (2 );
281+
282+ sleep (0 );
283+
284+ /* Never reached. */
285+ _exit (3 );
286+ }
287+
288+ static bool __init init_seccomp (void )
289+ {
290+ int pid ;
291+ int status ;
292+ int n ;
293+ unsigned long sp ;
294+
295+ /* doesn't work on 32-bit right now */
296+ if (!IS_ENABLED (CONFIG_64BIT ))
297+ return false;
298+
299+ /*
300+ * We check that we can install a seccomp filter and then exit(0)
301+ * from a trapped syscall.
302+ *
303+ * Note that we cannot verify that no seccomp filter already exists
304+ * for a syscall that results in the process/thread to be killed.
305+ */
306+
307+ os_info ("Checking that seccomp filters can be installed..." );
308+
309+ seccomp_test_stub_data = mmap (0 , sizeof (* seccomp_test_stub_data ),
310+ PROT_READ | PROT_WRITE ,
311+ MAP_SHARED | MAP_ANON , 0 , 0 );
312+
313+ /* Use the syscall data area as stack, we just need something */
314+ sp = (unsigned long )& seccomp_test_stub_data -> syscall_data +
315+ sizeof (seccomp_test_stub_data -> syscall_data ) -
316+ sizeof (void * );
317+ pid = clone (seccomp_helper , (void * )sp , CLONE_VFORK | CLONE_VM , NULL );
318+
319+ if (pid < 0 )
320+ fatal_perror ("check_seccomp : clone failed" );
321+
322+ CATCH_EINTR (n = waitpid (pid , & status , __WCLONE ));
323+ if (n < 0 )
324+ fatal_perror ("check_seccomp : waitpid failed" );
325+
326+ if (WIFEXITED (status ) && WEXITSTATUS (status ) == 0 ) {
327+ struct uml_pt_regs * regs ;
328+ unsigned long fp_size ;
329+ int r ;
330+
331+ /* Fill in the host_fp_size from the mcontext. */
332+ regs = calloc (1 , sizeof (struct uml_pt_regs ));
333+ get_stub_state (regs , seccomp_test_stub_data , & fp_size );
334+ host_fp_size = fp_size ;
335+ free (regs );
336+
337+ /* Repeat with the correct size */
338+ regs = calloc (1 , sizeof (struct uml_pt_regs ) + host_fp_size );
339+ r = get_stub_state (regs , seccomp_test_stub_data , NULL );
340+
341+ /* Store as the default startup registers */
342+ exec_fp_regs = malloc (host_fp_size );
343+ memcpy (exec_regs , regs -> gp , sizeof (exec_regs ));
344+ memcpy (exec_fp_regs , regs -> fp , host_fp_size );
345+
346+ munmap (seccomp_test_stub_data , sizeof (* seccomp_test_stub_data ));
347+
348+ free (regs );
349+
350+ if (r ) {
351+ os_info ("failed to fetch registers: %d\n" , r );
352+ return false;
353+ }
354+
355+ os_info ("OK\n" );
356+ return true;
357+ }
358+
359+ if (WIFEXITED (status ) && WEXITSTATUS (status ) == 2 )
360+ os_info ("missing\n" );
361+ else
362+ os_info ("error\n" );
363+
364+ munmap (seccomp_test_stub_data , sizeof (* seccomp_test_stub_data ));
365+ return false;
366+ }
367+
368+
227369static void __init check_coredump_limit (void )
228370{
229371 struct rlimit lim ;
@@ -278,6 +420,44 @@ void __init get_host_cpu_features(
278420 }
279421}
280422
423+ static int seccomp_config __initdata ;
424+
425+ static int __init uml_seccomp_config (char * line , int * add )
426+ {
427+ * add = 0 ;
428+
429+ if (strcmp (line , "off" ) == 0 )
430+ seccomp_config = 0 ;
431+ else if (strcmp (line , "auto" ) == 0 )
432+ seccomp_config = 1 ;
433+ else if (strcmp (line , "on" ) == 0 )
434+ seccomp_config = 2 ;
435+ else
436+ fatal ("Invalid seccomp option '%s', expected on/auto/off\n" ,
437+ line );
438+
439+ return 0 ;
440+ }
441+
442+ __uml_setup ("seccomp=" , uml_seccomp_config ,
443+ "seccomp=<on/auto/off>\n"
444+ " Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
445+ " processes work collaboratively with the kernel instead of being\n"
446+ " traced using ptrace. All syscalls from the application are caught and\n"
447+ " redirected using a signal. This signal handler in turn is permitted to\n"
448+ " do the selected set of syscalls to communicate with the UML kernel and\n"
449+ " do the required memory management.\n"
450+ "\n"
451+ " This method is overall faster than the ptrace based userspace, primarily\n"
452+ " because it reduces the number of context switches for (minor) page faults.\n"
453+ "\n"
454+ " However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
455+ " userspace from reading and writing all physical memory. Userspace\n"
456+ " processes could also trick the stub into disabling SIGALRM which\n"
457+ " prevents it from being interrupted for scheduling purposes.\n"
458+ "\n"
459+ " This is insecure and should only be used with a trusted userspace\n\n"
460+ );
281461
282462void __init os_early_checks (void )
283463{
@@ -286,13 +466,24 @@ void __init os_early_checks(void)
286466 /* Print out the core dump limits early */
287467 check_coredump_limit ();
288468
289- check_ptrace ();
290-
291469 /* Need to check this early because mmapping happens before the
292470 * kernel is running.
293471 */
294472 check_tmpexec ();
295473
474+ if (seccomp_config ) {
475+ if (init_seccomp ()) {
476+ using_seccomp = 1 ;
477+ return ;
478+ }
479+
480+ if (seccomp_config == 2 )
481+ fatal ("SECCOMP userspace requested but not functional!\n" );
482+ }
483+
484+ using_seccomp = 0 ;
485+ check_ptrace ();
486+
296487 pid = start_ptraced_child ();
297488 if (init_pid_registers (pid ))
298489 fatal ("Failed to initialize default registers" );
0 commit comments