|
Message-ID: <1e30ffd0-b35c-fcc2-ec8f-4495aefa7f6f@linux.com> Date: Thu, 22 Feb 2018 00:49:44 +0300 From: Alexander Popov <alex.popov@...ux.com> To: Borislav Petkov <bp@...en8.de> Cc: kernel-hardening@...ts.openwall.com, Kees Cook <keescook@...omium.org>, PaX Team <pageexec@...email.hu>, Brad Spengler <spender@...ecurity.net>, Ingo Molnar <mingo@...nel.org>, Andy Lutomirski <luto@...nel.org>, Tycho Andersen <tycho@...ho.ws>, Laura Abbott <labbott@...hat.com>, Mark Rutland <mark.rutland@....com>, Ard Biesheuvel <ard.biesheuvel@...aro.org>, Thomas Gleixner <tglx@...utronix.de>, "H . Peter Anvin" <hpa@...or.com>, Peter Zijlstra <a.p.zijlstra@...llo.nl>, "Dmitry V . Levin" <ldv@...linux.org>, x86@...nel.org Subject: Re: [PATCH RFC v8 1/6] x86/entry: Add STACKLEAK erasing the kernel stack at the end of syscalls Hello Borislav, Thanks a lot for your review. Please see my comments below. On 21.02.2018 16:24, Borislav Petkov wrote: > On Fri, Feb 16, 2018 at 09:10:52PM +0300, Alexander Popov wrote: >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig >> index 63bf349..62748bd 100644 >> --- a/arch/x86/Kconfig >> +++ b/arch/x86/Kconfig >> @@ -119,6 +119,7 @@ config X86 >> select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT >> select HAVE_ARCH_SECCOMP_FILTER >> select HAVE_ARCH_THREAD_STRUCT_WHITELIST >> + select HAVE_ARCH_STACKLEAK >> select HAVE_ARCH_TRACEHOOK >> select HAVE_ARCH_TRANSPARENT_HUGEPAGE >> select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 >> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S >> index 16c2c02..4a7365a 100644 >> --- a/arch/x86/entry/entry_32.S >> +++ b/arch/x86/entry/entry_32.S >> @@ -77,6 +77,90 @@ >> #endif >> .endm >> >> +.macro erase_kstack >> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK >> + call erase_kstack > > Hmm, why do we need a macro *and* a function of the same name? Why not do > > call erase_kstack > > everywhere we need to? > > And then do > > ENTRY(erase_kstack) > #ifdef CONFIG_GCC_PLUGIN_STACKLEAK > ... > > to tone down the ifdeffery. This approach doesn't work. There is the diff: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b418d3a..0a05755 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -66,14 +66,8 @@ END(native_usergs_sysret64) TRACE_IRQS_FLAGS EFLAGS(%rsp) .endm -.macro erase_kstack -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK - call erase_kstack -#endif -.endm - -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK ENTRY(erase_kstack) +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK pushq %rdi pushq %rcx pushq %rax @@ -173,8 +167,8 @@ ENTRY(erase_kstack) popq %rcx popq %rdi ret -ENDPROC(erase_kstack) #endif +ENDPROC(erase_kstack) /* * When dynamic function tracer is enabled it will add a breakpoint @@ -455,7 +449,7 @@ syscall_return_via_sysret: * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ - erase_kstack + call erase_kstack SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi @@ -765,7 +759,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ - erase_kstack + call erase_kstack SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi It crashes the kernel with STACKLEAK disabled: [ 1.709081] VFS: Mounted root (ext4 filesystem) readonly on device 8:0. [ 1.710202] devtmpfs: mounted [ 1.711741] Freeing unused kernel memory: 1276K [ 1.712391] Kernel memory protection disabled. [ 1.760050] PANIC: double fault, error_code: 0x0 [ 1.760627] CPU: 0 PID: 1 Comm: init Not tainted 4.16.0-rc1+ #8 [ 1.761005] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 1.761005] RIP: 0010:update_curr+0x7/0x160 [ 1.761005] RSP: 0000:fffffe0000002000 EFLAGS: 00010002 [ 1.761005] RAX: ffff88007c890100 RBX: ffff88007fc20900 RCX: 000000003da0c28c [ 1.761005] RDX: ffff88007fc20900 RSI: ffff88007c890100 RDI: ffff88007fc20900 [ 1.761005] RBP: ffff88007c890100 R08: ffffffffffffffda R09: 0000000000000001 [ 1.761005] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88007fc20900 [ 1.761005] R13: ffff88007b529d00 R14: ffff88007c890100 R15: 0000000000000000 [ 1.761005] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 [ 1.761005] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.761005] CR2: fffffe0000001ff8 CR3: 000000007b090000 CR4: 00000000000006f0 [ 1.761005] Call Trace: [ 1.761005] <ENTRY_TRAMPOLINE> [ 1.761005] put_prev_entity+0x31/0x580 [ 1.761005] pick_next_task_fair+0x454/0x470 [ 1.761005] __schedule+0x14b/0x6f0 [ 1.761005] schedule+0x23/0x80 [ 1.761005] exit_to_usermode_loop+0x5c/0x90 [ 1.761005] do_syscall_64+0xfa/0x110 [ 1.761005] entry_SYSCALL_64_after_hwframe+0x21/0x86 [ 1.761005] RIP: 81a00935:swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] RSP: 81a00935:ffffffff81a00935 EFLAGS: ffffffff81a00935 ORIG_RAX: ffffffffffffffda [ 1.761005] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000000000 [ 1.761005] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81a00935 [ 1.761005] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 1.761005] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 1.761005] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1.761005] ? swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] ? swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] ? swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] ? swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] ? swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] ? swapgs_restore_regs_and_return_to_usermode+0x38/0x8c [ 1.761005] </ENTRY_TRAMPOLINE> [ 1.761005] Code: 02 00 00 8b 78 18 e8 29 ff ff ff 48 ba db 34 b6 d7 82 de 1b 43 48 f7 e2 48 89 d0 48 c1 e8 12 c3 0f 1f 40 00 41 56 41 55 41 54 55 <53> 48 8b 5f 40 48 8b 87 30 01 00 00 48 85 db 48 8b 80 68 09 00 [ 1.761005] Kernel panic - not syncing: Machine halted. Actually gcc creates a strange erase_kstack(): ffffffff81a00010 <erase_kstack>: ffffffff81a00010: 5f pop %rdi ffffffff81a00011: eb 31 jmp ffffffff81a00044 <entry_SYSCALL_64_after_hwframe> ffffffff81a00013: 0f 1f 00 nopl (%rax) ffffffff81a00016: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) ffffffff81a0001d: 00 00 00 But with the initial macro the binary doesn't have erase_kstack() at all when STACKLEAK is disabled. >> +#endif >> +.endm >> + >> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK >> +ENTRY(erase_kstack) >> + pushl %edi >> + pushl %ecx >> + pushl %eax >> + pushl %ebp >> + >> + movl PER_CPU_VAR(current_task), %ebp >> + mov TASK_lowest_stack(%ebp), %edi > > So instead of TASK_lowest_stack and adding all the ifdeffery around, why > not do this computation: > > (unsigned long)task_stack_page(p) + 2 * sizeof(unsigned long); > > in asm? > > You only need task->stack as an offset variable. And other code might > need it too, anyway so you could just as well use it instead of adding > another var to thread_struct. > > /me reads further. > > Ah. So this lowest_stack thing changes at the end: > > "Set the lowest_stack value to the top_of_stack - 256" > > Why? > > So that magic needs more explanation for slow people like me. Thanks for your question. The commit message says: "Full STACKLEAK feature also contains the gcc plugin which comes in a separate commit". And the next commit in this series introduces that plugin. Let me quote its commit message as well: This commit introduces the STACKLEAK gcc plugin. It is needed for: - tracking the lowest border of the kernel stack, which is important for the code erasing the used part of the kernel stack at the end of syscalls (comes in a separate commit); - checking that alloca calls don't cause stack overflow. So this plugin instruments the kernel code inserting: - the check_alloca() call before alloca and the track_stack() call after it; - the track_stack() call for the functions with a stack frame size greater than or equal to CONFIG_STACKLEAK_TRACK_MIN_SIZE. Tracking the lowest border of the kernel stack with the lowest_stack variable makes STACKLEAK so efficient (please see the performance statistics in the cover letter). >> + mov $STACKLEAK_POISON, %eax >> + std >> + >> + /* >> + * Let's search for the poison value in the stack. >> + * Start from the lowest_stack and go to the bottom (see std above). > > Let's write insns capitalized: "see STD above". Ok. > ... > >> + /* >> + * Check that some further qwords contain poison. If so, the part >> + * of the stack below the address in %rdi is likely to be poisoned. >> + * Otherwise we need to search deeper. >> + */ >> + mov $STACKLEAK_POISON_CHECK_DEPTH / 8, %ecx >> + repe scasq >> + jecxz 2f /* Poison the upper part of the stack */ >> + jne 1b /* Search deeper */ >> + >> +2: > > You could name that label > > .Lpoison > > and make the code more readable: > > jb .Lpoison > > and so on. Ok, thanks. >> + /* >> + * Two qwords at the bottom of the thread stack are reserved and >> + * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK). >> + */ >> + or $2 * 8, %rdi >> + >> + /* >> + * Check whether we are on the thread stack to prepare the counter >> + * for stack poisoning. >> + */ >> + mov PER_CPU_VAR(cpu_current_top_of_stack), %rcx >> + sub %rsp, %rcx >> + cmp $THREAD_SIZE_asm, %rcx >> + jb 3f >> + >> + /* >> + * We are not on the thread stack, so we can write poison between >> + * the address in %rdi and the stack top. >> + */ >> + mov PER_CPU_VAR(cpu_current_top_of_stack), %rcx >> + sub %rdi, %rcx >> + jmp 4f >> + >> +3: >> + /* >> + * We are on the thread stack, so we can write poison between the >> + * address in %rdi and the address in %rsp (not included, used memory). >> + */ >> + mov %rsp, %rcx >> + sub %rdi, %rcx >> + >> +4: >> + /* Check that the counter value is sane */ >> + cmp $THREAD_SIZE_asm, %rcx >> + jb 5f >> + ud2 >> + >> +5: >> + /* >> + * So let's write the poison value to the kernel stack. Start from the >> + * address in %rdi and move up (see cld). >> + */ >> + cld >> + shr $3, %ecx >> + rep stosq > > Hohumm, that's >2K loops per syscall here and stack is > > 0xffffc90000008010: 0xffffffffffff4111 0xffffffffffff4111 > 0xffffc90000008020: 0xffffffffffff4111 0xffffffffffff4111 > 0xffffc90000008030: 0xffffffffffff4111 0xffffffffffff4111 > 0xffffc90000008040: 0xffffffffffff4111 0xffffffffffff4111 > 0xffffc90000008050: 0xffffffffffff4111 0xffffffffffff4111 > 0xffffc90000008060: 0xffffffffffff4111 0xffffffffffff4111 Yes, so the stack is erased. That is the actual goal. > ... >> +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK >> +/* Poison value points to the unused hole in the virtual memory map */ > > There are a bunch of those there now. Please document it in > Documentation/x86/x86_64/mm.txt > >> +# define STACKLEAK_POISON -0xBEEF The mm.txt already has this line: ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole Excuse me, I didn't get what to document. Thanks again. Best regards, Alexander
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.