Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <4728eeae-8f1b-4541-b05a-4a0f35a459f7@www.fastmail.com>
Date: Fri, 26 Nov 2021 10:58:26 -0800
From: "Andy Lutomirski" <luto@...nel.org>
To: "Florian Weimer" <fweimer@...hat.com>, linux-arch@...r.kernel.org,
 "Linux API" <linux-api@...r.kernel.org>, linux-x86_64@...r.kernel.org,
 kernel-hardening@...ts.openwall.com
Cc: linux-mm@...ck.org, "the arch/x86 maintainers" <x86@...nel.org>,
 musl@...ts.openwall.com,
 "Dave Hansen via Libc-alpha" <libc-alpha@...rceware.org>,
 "Linux Kernel Mailing List" <linux-kernel@...r.kernel.org>,
 "Dave Hansen" <dave.hansen@...el.com>, "Kees Cook" <keescook@...omium.org>
Subject: Re: [PATCH] x86: Implement arch_prctl(ARCH_VSYSCALL_LOCKOUT) to disable
 vsyscall



On Fri, Nov 26, 2021, at 5:47 AM, Florian Weimer wrote:
> Distributions struggle with changing the default for vsyscall
> emulation because it is a clear break of userspace ABI, something
> that should not happen.
>
> The legacy vsyscall interface is supposed to be used by libcs only,
> not by applications.  This commit adds a new arch_prctl request,
> ARCH_VSYSCALL_LOCKOUT.  Newer libcs can adopt this request to signal
> to the kernel that the process does not need vsyscall emulation.
> The kernel can then disable it for the remaining lifetime of the
> process.  Legacy libcs do not perform this call, so vsyscall remains
> enabled for them.  This approach should achieves backwards
> compatibility (perfect compatibility if the assumption that only libcs
> use vsyscall is accurate), and it provides full hardening for new
> binaries.

Why is a lockout needed instead of just a toggle?  By the time an attacker can issue prctls, an emulated vsyscall seems like a pretty minor exploit technique.  And programs that load legacy modules or instrument other programs might need to re-enable them.

Also, the interaction with emulate mode is somewhat complex. For now, let’s support this in xonly mode only. A complete implementation will require nontrivial mm work.  I had that implemented pre-KPTI, but KPTI made it more complicated.

Finally, /proc/self/maps should be wired up via the gate_area code.

>
> The chosen value of ARCH_VSYSCALL_LOCKOUT should avoid conflicts
> with outher x86-64 arch_prctl requests.
>
> Future arch_prctls requests commonly used at process startup can imply
> vsyscall lockout, so that a separate system call for the lockout is
> not needed.
>
> Signed-off-by: Florian Weimer <fweimer@...hat.com>
>
> ---
>  arch/x86/entry/vsyscall/vsyscall_64.c          |   6 +
>  arch/x86/include/asm/mmu.h                     |   6 +
>  arch/x86/include/uapi/asm/prctl.h              |   2 +
>  arch/x86/kernel/process_64.c                   |   5 +
>  tools/arch/x86/include/uapi/asm/prctl.h        |   2 +
>  tools/testing/selftests/x86/Makefile           |  13 +-
>  tools/testing/selftests/x86/vsyscall_lockout.c | 431 +++++++++++++++++++++++++
>  7 files changed, 462 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c 
> b/arch/x86/entry/vsyscall/vsyscall_64.c
> index 0b6b277ee050..ac176481cbdf 100644
> --- a/arch/x86/entry/vsyscall/vsyscall_64.c
> +++ b/arch/x86/entry/vsyscall/vsyscall_64.c
> @@ -174,6 +174,12 @@ bool emulate_vsyscall(unsigned long error_code,
> 
>  	tsk = current;
> 
> +	if (tsk->mm->context.vsyscall_lockout) {
> +		warn_bad_vsyscall(KERN_WARNING, regs,
> +				  "vsyscall after lockout (exploit attempt?)");
> +		goto sigsegv;
> +	}
> +
>  	/*
>  	 * Check for access_ok violations and find the syscall nr.
>  	 *
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 5d7494631ea9..59ddac5ad2e7 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -41,6 +41,12 @@ typedef struct {
>  #ifdef CONFIG_X86_64
>  	unsigned short flags;
>  #endif
> +#ifdef CONFIG_X86_VSYSCALL_EMULATION
> +	/*
> +	 * Set to true by arch_prctl(ARCH_VSYSCALL_LOCKOUT).
> +	 */
> +	bool vsyscall_lockout;
> +#endif
> 
>  	struct mutex lock;
>  	void __user *vdso;			/* vdso base address */
> diff --git a/arch/x86/include/uapi/asm/prctl.h 
> b/arch/x86/include/uapi/asm/prctl.h
> index 754a07856817..6f2b17ec4798 100644
> --- a/arch/x86/include/uapi/asm/prctl.h
> +++ b/arch/x86/include/uapi/asm/prctl.h
> @@ -18,4 +18,6 @@
>  #define ARCH_MAP_VDSO_32	0x2002
>  #define ARCH_MAP_VDSO_64	0x2003
> 
> +#define ARCH_VSYSCALL_LOCKOUT	0x5001
> +
>  #endif /* _ASM_X86_PRCTL_H */
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 3402edec236c..eaabd365aa63 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -816,6 +816,11 @@ long do_arch_prctl_64(struct task_struct *task, 
> int option, unsigned long arg2)
>  		ret = put_user(base, (unsigned long __user *)arg2);
>  		break;
>  	}
> +	case ARCH_VSYSCALL_LOCKOUT:
> +#ifdef CONFIG_X86_VSYSCALL_EMULATION
> +		current->mm->context.vsyscall_lockout = true;
> +#endif
> +		break;
> 
>  #ifdef CONFIG_CHECKPOINT_RESTORE
>  # ifdef CONFIG_X86_X32_ABI
> diff --git a/tools/arch/x86/include/uapi/asm/prctl.h 
> b/tools/arch/x86/include/uapi/asm/prctl.h
> index 754a07856817..6f2b17ec4798 100644
> --- a/tools/arch/x86/include/uapi/asm/prctl.h
> +++ b/tools/arch/x86/include/uapi/asm/prctl.h
> @@ -18,4 +18,6 @@
>  #define ARCH_MAP_VDSO_32	0x2002
>  #define ARCH_MAP_VDSO_64	0x2003
> 
> +#define ARCH_VSYSCALL_LOCKOUT	0x5001
> +
>  #endif /* _ASM_X86_PRCTL_H */
> diff --git a/tools/testing/selftests/x86/Makefile 
> b/tools/testing/selftests/x86/Makefile
> index 8a1f62ab3c8e..2269429b77e0 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 
> test_syscall_vdso unwind_vdso \
>  			test_FCMOV test_FCOMI test_FISTTP \
>  			vdso_restorer
>  TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \
> -			corrupt_xstate_header amx
> +			corrupt_xstate_header amx vsyscall_lockout
>  # Some selftests require 32bit support enabled also on 64bit systems
>  TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
> 
> @@ -72,10 +72,12 @@ all_64: $(BINARIES_64)
>  EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64)
> 
>  $(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h
> -	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
> +	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ \
> +		$(or $(LIBS), -lrt -ldl -lm)
> 
>  $(BINARIES_64): $(OUTPUT)/%_64: %.c helpers.h
> -	$(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
> +	$(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ \
> +		$(or $(LIBS), -lrt -ldl -lm)
> 
>  # x86_64 users should be encouraged to install 32-bit libraries
>  ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
> @@ -105,3 +107,8 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
>  # state.
>  $(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start 
> -static
>  $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start 
> -static
> +
> +# This test does not link against anything (neither libc nor libgcc).
> +$(OUTPUT)/vsyscall_lockout_64: \
> +	LIBS := -Wl,-no-pie -static -nostdlib -nostartfiles
> +	CFLAGS += -fno-pie -fno-stack-protector -fno-builtin -ffreestanding
> diff --git a/tools/testing/selftests/x86/vsyscall_lockout.c 
> b/tools/testing/selftests/x86/vsyscall_lockout.c
> new file mode 100644
> index 000000000000..88669b4907ee
> --- /dev/null
> +++ b/tools/testing/selftests/x86/vsyscall_lockout.c
> @@ -0,0 +1,431 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * vsyscall_lockout.c - check that disabling vsyscall works
> + * Copyright (C) 2021 Red Hat, Inc.
> + */
> +
> +#include <stddef.h>
> +
> +#include <asm/prctl.h>
> +#include <asm/vsyscall.h>
> +#include <linux/signal.h>
> +#include <linux/time.h>
> +#include <linux/types.h>
> +#include <linux/unistd.h>
> +
> +#ifndef ARCH_VSYSCALL_LOCKOUT
> +#define ARCH_VSYSCALL_LOCKOUT   0x5001
> +#elif ARCH_VSYSCALL_LOCKOUT != 0x5001
> +#error wrong vlaue for ARCH_VSYSCALL_LOCKOUT
> +#endif
> +
> +static inline long syscall0(int nr)
> +{
> +        unsigned long result;
> +
> +        __asm__ volatile ("syscall"
> +                          : "=a" (result)
> +                          : "0" (nr)
> +                          : "memory", "cc", "r11", "cx");
> +        return result;
> +}
> +
> +static inline long syscall1(int nr, long arg0)
> +{
> +        register long rdi __asm__ ("rdi") = arg0;
> +        unsigned long result;
> +
> +        __asm__ volatile ("syscall"
> +                          : "=a" (result)
> +                          : "0" (nr), "r" (rdi)
> +                          : "memory", "cc", "r11", "cx");
> +        return result;
> +}
> +
> +static inline long syscall2(int nr, long arg0, long arg1)
> +{
> +        register long rdi __asm__ ("rdi") = arg0;
> +        register long rsi __asm__ ("rsi") = arg1;
> +        unsigned long result;
> +
> +        __asm__ volatile ("syscall"
> +                          : "=a" (result)
> +                          : "0" (nr), "r" (rdi), "r" (rsi)
> +                          : "memory", "cc", "r11", "cx");
> +        return result;
> +}
> +
> +static inline long syscall3(int nr, long arg0, long arg1, long arg2)
> +{
> +        register long rdi __asm__ ("rdi") = arg0;
> +        register long rsi __asm__ ("rsi") = arg1;
> +        register long rdx __asm__ ("rdx") = arg2;
> +        unsigned long result;
> +
> +        __asm__ volatile ("syscall"
> +                          : "=a" (result)
> +                          : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx)
> +                          : "memory", "cc", "r11", "cx");
> +        return result;
> +}
> +
> +static inline long syscall4(int nr, long arg0, long arg1, long arg2, 
> long arg3)
> +{
> +        register long rdi __asm__ ("rdi") = arg0;
> +        register long rsi __asm__ ("rsi") = arg1;
> +        register long rdx __asm__ ("rdx") = arg2;
> +        register long r10 __asm__ ("r10") = arg2;
> +        unsigned long result;
> +
> +        __asm__ volatile ("syscall"
> +                          : "=a" (result)
> +                          : "0" (nr), "r" (rdi), "r" (rsi), "r" (rdx),
> +                            "r" (r10)
> +                          : "memory", "cc", "r11", "cx");
> +        return result;
> +}
> +
> +static inline long vsyscall1(long addr, long arg0)
> +{
> +        register long rdi __asm__ ("rdi") = arg0;
> +        unsigned long result;
> +
> +        __asm__ volatile ("callq *%%rax"
> +                          : "=a" (result)
> +                          : "0" (addr), "r" (rdi)
> +                          : "memory", "cc", "r11", "cx");
> +        return result;
> +}
> +
> +static void __attribute__ ((noreturn)) sys_exit(int status)
> +{
> +        syscall1(__NR_exit, status);
> +        __builtin_unreachable();
> +}
> +
> +static void sigabrt(void)
> +{
> +        syscall2(__NR_kill, syscall0(__NR_getpid), SIGABRT);
> +}
> +
> +static void print_char(char byte)
> +{
> +        if (syscall3(__NR_write, 1L, (long) &byte, 1L) < 0)
> +                sigabrt();
> +}
> +
> +static void print_string(const char *p)
> +{
> +        while (*p) {
> +                print_char(*p);
> +                ++p;
> +        }
> +}
> +
> +static void print_dec_1(unsigned long val)
> +{
> +        if (val != 0) {
> +                print_dec_1(val / 10);
> +                print_char('0' + (val % 10));
> +        }
> +}
> +
> +static void print_dec(unsigned long val)
> +{
> +        if (val == 0)
> +                print_char('0');
> +        else
> +                print_dec_1(val);
> +}
> +
> +static void print_signed_dec(long val)
> +{
> +        if (val < 0) {
> +                print_char('-');
> +                print_dec(-(unsigned long)val);
> +        } else
> +                print_dec(val);
> +}
> +
> +static void print_time(const char *label, struct timeval tv)
> +{
> +        print_string(label);
> +        print_string(": ");
> +        print_dec(tv.tv_sec);
> +        print_char(' ');
> +        print_dec(tv.tv_usec);
> +        print_char('\n');
> +}
> +
> +static void print_failure(const char *label, long ret)
> +{
> +        print_string("error: ");
> +        print_string(label);
> +        print_string(" failed: ");
> +        print_signed_dec(ret);
> +        print_char('\n');
> +}
> +
> +static void xgettimeofday(struct timeval *tv)
> +{
> +        long ret = syscall1(__NR_gettimeofday, (long) tv);
> +
> +        if (ret != 0) {
> +                print_failure("gettimeofday", ret);
> +                sigabrt();
> +        }
> +}
> +
> +static void xvgettimeofday(struct timeval *tv)
> +{
> +        long ret = vsyscall1(VSYSCALL_ADDR, (long) tv);
> +
> +        if (ret) {
> +                print_failure("vgettimeofday", ret);
> +                sigabrt();
> +        }
> +}
> +
> +static int sys_arch_prctl(int code, unsigned long addr)
> +{
> +        return syscall2(__NR_arch_prctl, code, addr);
> +}
> +
> +static __kernel_pid_t xfork(void)
> +{
> +        long ret = syscall0(__NR_fork);
> +
> +        if (ret < 0) {
> +                print_failure("fork", ret);
> +                sigabrt();
> +        }
> +        return ret;
> +}
> +
> +static void xexecve(const char *pathname, char **argv, char **envp)
> +{
> +        long ret;
> +
> +        ret = syscall3(__NR_execve, (long) pathname, (long) argv, 
> (long) envp);
> +        print_failure("execve", ret);
> +        sigabrt();
> +}
> +
> +static __kernel_pid_t xwaitpid(__kernel_pid_t pid, int *status, int 
> options)
> +{
> +        long ret = syscall4(__NR_wait4, pid, (long) status, options, 
> 0);
> +
> +        if (ret < 0) {
> +                print_failure("wait4", ret);
> +                sigabrt();
> +        }
> +        return ret;
> +}
> +
> +static int
> +do_lockout(void)
> +{
> +        long ret = sys_arch_prctl(ARCH_VSYSCALL_LOCKOUT, 0);
> +        if (ret < 0)
> +                print_failure("arch_prctl(ARCH_VSYSCALL_LOCKOUT)", 
> ret);
> +        return ret;
> +}
> +
> +static long difftime(struct timeval first, struct timeval second)
> +{
> +        return second.tv_usec - first.tv_usec +
> +                (second.tv_sec - first.tv_sec) * 1000 * 1000;
> +}
> +
> +/*
> + * Second stage: Check that the lockout is not inherited across execve.
> + */
> +static int main_2(void)
> +{
> +        struct timeval vsyscall_time = { -1, -1 };
> +        int status = 0;
> +
> +        xvgettimeofday(&vsyscall_time);
> +        print_time("vsyscall gettimeofday after fork", vsyscall_time);
> +        if (vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0)
> +                status = 1;
> +
> +        return status;
> +}
> +
> +static void check_lockout_after_fork(int *status, int twice)
> +{
> +        __kernel_pid_t pid;
> +        struct timeval vsyscall_time;
> +        int wstatus;
> +
> +        if (twice) {
> +                __kernel_pid_t pid_outer;
> +
> +                print_string("checking that lockout is inherited by 
> fork\n");
> +
> +                pid_outer = xfork();
> +                if (pid_outer == 0) {
> +                        if (do_lockout())
> +                                sys_exit(1);
> +                        /*
> +                         * Logic for the subprocess follows below.
> +                         */
> +                } else {
> +                        xwaitpid(pid_outer, &wstatus, 0);
> +                        if (wstatus != 0) {
> +                                print_string("error: unexpected exit 
> status: ");
> +                                print_signed_dec(wstatus);
> +                                print_char('\n');
> +                                *status = 1;
> +                        }
> +                        return;
> +                }
> +        } else
> +                print_string("checking that lockout works after one 
> fork\n");
> +
> +        pid = xfork();
> +        if (pid == 0) {
> +                if (!twice && do_lockout())
> +                        sys_exit(1);
> +                /*
> +                 * This should trigger a fault.
> +                 */
> +                xvgettimeofday(&vsyscall_time);
> +                sys_exit(0);
> +        }
> +        xwaitpid(pid, &wstatus, 0);
> +        switch (wstatus) {
> +        case 0:
> +                print_string("error: no crash after lockout\n");
> +                *status = 1;
> +                break;
> +        case 0x0100:
> +                *status = 1;
> +                break;
> +        case SIGSEGV:
> +                print_string("termination after lockout\n");
> +                break;
> +        default:
> +                print_string("error: unexpected exit status: ");
> +                print_signed_dec(wstatus);
> +                print_char('\n');
> +                *status = 1;
> +        }
> +
> +        if (twice)
> +                sys_exit(*status);
> +
> +        /*
> +         * Status in the parent process should be unaffected.
> +         */
> +        xvgettimeofday(&vsyscall_time);
> +}
> +
> +static void check_no_lockout_after_execve(char **argv, int *status)
> +{
> +        __kernel_pid_t pid;
> +        int wstatus;
> +
> +        print_string("checking that lockout is not inherited by 
> execve\n");
> +        pid = xfork();
> +        if (pid == 0) {
> +                struct timeval vsyscall_time;
> +                char *new_argv[] = { argv[0], "2", NULL };
> +
> +                xvgettimeofday(&vsyscall_time);
> +                if (do_lockout())
> +                        sys_exit(1);
> +
> +                /*
> +                 * Re-exec the second stage.  See main_2 above.
> +                 */
> +                xexecve(argv[0], new_argv, new_argv + 2);
> +        }
> +
> +        xwaitpid(pid, &wstatus, 0);
> +        if (wstatus != 0) {
> +                print_string("error: unexpected exit status: ");
> +                print_signed_dec(wstatus);
> +                print_char('\n');
> +                *status = 1;
> +        }
> +}
> +
> +static int main(int argc, char **argv)
> +{
> +        struct timeval initial_time = { -1, -1 };
> +        struct timeval vsyscall_time = { -1, -1 };
> +        struct timeval final_time = { -1, -1 };
> +        long vsyscall_diff, final_diff;
> +        int status = 0;
> +
> +        if (argc > 1)
> +                switch (*argv[1]) {
> +                case '2':
> +                        return main_2();
> +                default:
> +                        print_string("usage: ");
> +                        print_string(argv[0]);
> +                        print_string("\n");
> +                        return 1;
> +                }
> +
> +
> +        xgettimeofday(&initial_time);
> +        xvgettimeofday(&vsyscall_time);
> +        xgettimeofday(&final_time);
> +        vsyscall_diff = difftime(initial_time, vsyscall_time);
> +        final_diff = difftime(vsyscall_time, final_time);
> +
> +        print_time("initial gettimeofday", initial_time);
> +        print_time("vsyscall gettimeofday", vsyscall_time);
> +        print_time("final gettimeofday", final_time);
> +
> +        if (initial_time.tv_sec < 0 || initial_time.tv_usec < 0 ||
> +            vsyscall_time.tv_sec < 0 || vsyscall_time.tv_usec < 0 ||
> +            final_time.tv_sec < 0 || final_time.tv_usec < 0) {
> +                print_string("error: negative time\n");
> +                status = 1;
> +        }
> +
> +        print_string("differences: ");
> +        print_signed_dec(vsyscall_diff);
> +        print_char(' ');
> +        print_signed_dec(final_diff);
> +        print_char('\n');
> +
> +        if (vsyscall_diff < 0 || final_diff < 0) {
> +                /*
> +                 * This may produce false positives if there is an 
> active NTP.
> +                 */
> +                print_string("error: time went backwards\n");
> +                status = 1;
> +        }
> +
> +        check_lockout_after_fork(&status, 0);
> +        check_lockout_after_fork(&status, 1);
> +        check_no_lockout_after_execve(argv, &status);
> +
> +        print_string("testing done, exit status: ");
> +        print_signed_dec(status);
> +        print_char('\n');
> +        return status;
> +}
> +
> +static void __attribute__ ((used)) main_trampoline(long *rsp)
> +{
> +        sys_exit(main(*rsp, (char **) (rsp + 1)));
> +}
> +
> +__asm__ (".text\n\t"
> +         ".globl _start\n"
> +         "_start:\n\t"
> +         ".cfi_startproc\n\t"
> +         ".cfi_undefined rip\n\t"
> +         "movq %rsp, %rdi\n\t"
> +         "callq main_trampoline\n\t" /* Results in psABI %rsp 
> alignment.  */
> +         ".cfi_endproc\n\t"
> +         ".type _start, @function\n\t"
> +         ".size _start, . - _start\n\t"
> +         ".previous");

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.