|
Message-ID: <CAMzpN2ggqbwkfNjHtYg3KU5oeerU6NPa0AqzG7PFtJuWy3dyKA@mail.gmail.com> Date: Mon, 27 Jun 2016 11:12:41 -0400 From: Brian Gerst <brgerst@...il.com> To: Andy Lutomirski <luto@...nel.org> Cc: "the arch/x86 maintainers" <x86@...nel.org>, Linux Kernel Mailing List <linux-kernel@...r.kernel.org>, linux-arch <linux-arch@...r.kernel.org>, Borislav Petkov <bp@...en8.de>, Nadav Amit <nadav.amit@...il.com>, Kees Cook <keescook@...omium.org>, "kernel-hardening@...ts.openwall.com" <kernel-hardening@...ts.openwall.com>, Linus Torvalds <torvalds@...ux-foundation.org>, Josh Poimboeuf <jpoimboe@...hat.com>, Jann Horn <jann@...jh.net>, Heiko Carstens <heiko.carstens@...ibm.com> Subject: Re: [PATCH v4 15/29] x86/mm/64: Enable vmapped stacks On Mon, Jun 27, 2016 at 11:01 AM, Brian Gerst <brgerst@...il.com> wrote: > On Sun, Jun 26, 2016 at 5:55 PM, Andy Lutomirski <luto@...nel.org> wrote: >> This allows x86_64 kernels to enable vmapped stacks. There are a >> couple of interesting bits. >> >> First, x86 lazily faults in top-level paging entries for the vmalloc >> area. This won't work if we get a page fault while trying to access >> the stack: the CPU will promote it to a double-fault and we'll die. >> To avoid this problem, probe the new stack when switching stacks and >> forcibly populate the pgd entry for the stack when switching mms. >> >> Second, once we have guard pages around the stack, we'll want to >> detect and handle stack overflow. >> >> I didn't enable it on x86_32. We'd need to rework the double-fault >> code a bit and I'm concerned about running out of vmalloc virtual >> addresses under some workloads. >> >> This patch, by itself, will behave somewhat erratically when the >> stack overflows while RSP is still more than a few tens of bytes >> above the bottom of the stack. Specifically, we'll get #PF and make >> it to no_context and an oops without triggering a double-fault, and >> no_context doesn't know about stack overflows. 
The next patch will >> improve that case. >> >> Signed-off-by: Andy Lutomirski <luto@...nel.org> >> --- >> arch/x86/Kconfig | 1 + >> arch/x86/include/asm/switch_to.h | 28 +++++++++++++++++++++++++++- >> arch/x86/kernel/traps.c | 32 ++++++++++++++++++++++++++++++++ >> arch/x86/mm/tlb.c | 15 +++++++++++++++ >> 4 files changed, 75 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig >> index d9a94da0c29f..afdcf96ef109 100644 >> --- a/arch/x86/Kconfig >> +++ b/arch/x86/Kconfig >> @@ -92,6 +92,7 @@ config X86 >> select HAVE_ARCH_TRACEHOOK >> select HAVE_ARCH_TRANSPARENT_HUGEPAGE >> select HAVE_EBPF_JIT if X86_64 >> + select HAVE_ARCH_VMAP_STACK if X86_64 >> select HAVE_CC_STACKPROTECTOR >> select HAVE_CMPXCHG_DOUBLE >> select HAVE_CMPXCHG_LOCAL >> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h >> index 8f321a1b03a1..14e4b20f0aaf 100644 >> --- a/arch/x86/include/asm/switch_to.h >> +++ b/arch/x86/include/asm/switch_to.h >> @@ -8,6 +8,28 @@ struct tss_struct; >> void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, >> struct tss_struct *tss); >> >> +/* This runs on the previous thread's stack. */ >> +static inline void prepare_switch_to(struct task_struct *prev, >> + struct task_struct *next) >> +{ >> +#ifdef CONFIG_VMAP_STACK >> + /* >> + * If we switch to a stack that has a top-level paging entry >> + * that is not present in the current mm, the resulting #PF >> + * will be promoted to a double-fault and we'll panic. Probe >> + * the new stack now so that vmalloc_fault can fix up the page >> + * tables if needed. This can only happen if we use a stack >> + * in vmap space. >> + * >> + * We assume that the stack is aligned so that it never spans >> + * more than one top-level paging entry. >> + * >> + * To minimize cache pollution, just follow the stack pointer. 
>> + */ >> + READ_ONCE(*(unsigned char *)next->thread.sp); >> +#endif >> +} >> + >> #ifdef CONFIG_X86_32 >> >> #ifdef CONFIG_CC_STACKPROTECTOR >> @@ -39,6 +61,8 @@ do { \ >> */ \ >> unsigned long ebx, ecx, edx, esi, edi; \ >> \ >> + prepare_switch_to(prev, next); \ >> + \ >> asm volatile("pushl %%ebp\n\t" /* save EBP */ \ >> "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ >> "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ >> @@ -103,7 +127,9 @@ do { \ >> * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL >> * has no effect. >> */ >> -#define switch_to(prev, next, last) \ >> +#define switch_to(prev, next, last) \ >> + prepare_switch_to(prev, next); \ >> + \ >> asm volatile(SAVE_CONTEXT \ >> "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ >> "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ >> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c >> index 00f03d82e69a..9cb7ea781176 100644 >> --- a/arch/x86/kernel/traps.c >> +++ b/arch/x86/kernel/traps.c >> @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) >> DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) >> DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) >> >> +#ifdef CONFIG_VMAP_STACK >> +static void __noreturn handle_stack_overflow(const char *message, >> + struct pt_regs *regs, >> + unsigned long fault_address) >> +{ >> + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", >> + (void *)fault_address, current->stack, >> + (char *)current->stack + THREAD_SIZE - 1); >> + die(message, regs, 0); >> + >> + /* Be absolutely certain we don't return. 
*/ >> + panic(message); >> +} >> +#endif >> + >> #ifdef CONFIG_X86_64 >> /* Runs on IST stack */ >> dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) >> { >> static const char str[] = "double fault"; >> struct task_struct *tsk = current; >> +#ifdef CONFIG_VMAP_STACK >> + unsigned long cr2; >> +#endif >> >> #ifdef CONFIG_X86_ESPFIX64 >> extern unsigned char native_irq_return_iret[]; >> @@ -332,6 +350,20 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) >> tsk->thread.error_code = error_code; >> tsk->thread.trap_nr = X86_TRAP_DF; >> >> +#ifdef CONFIG_VMAP_STACK >> + /* >> + * If we overflow the stack into a guard page, the CPU will fail >> + * to deliver #PF and will send #DF instead. CR2 will contain >> + * the linear address of the second fault, which will be in the >> + * guard page below the bottom of the stack. >> + */ >> + cr2 = read_cr2(); >> + if ((unsigned long)tsk->stack - 1 - cr2 < PAGE_SIZE) >> + handle_stack_overflow( >> + "kernel stack overflow (double-fault)", >> + regs, cr2); >> +#endif > > Is there any other way to tell if this was from a page fault? If it > wasn't a page fault then CR2 is undefined. I guess it doesn't really matter, since the fault is fatal either way. The error message might be incorrect though. -- Brian Gerst
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.