Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2f24df9e-64b1-ffef-ddbb-08f7fe8d2a96@redhat.com>
Date: Fri, 14 Jul 2017 13:51:24 -0700
From: Laura Abbott <labbott@...hat.com>
To: Mark Rutland <mark.rutland@....com>
Cc: Kees Cook <keescook@...omium.org>, Alex Popov <alex.popov@...ux.com>,
 kernel-hardening@...ts.openwall.com,
 Ard Biesheuvel <ard.biesheuvel@...aro.org>
Subject: Re: [RFC][PATCH 2/2] arm64: Clear the stack

On 07/11/2017 12:51 PM, Mark Rutland wrote:
> Hi Laura,
> 
> On Mon, Jul 10, 2017 at 03:04:43PM -0700, Laura Abbott wrote:
>> Implementation of stackleak based heavily on the x86 version
> 
> Nice!
> 
>> Signed-off-by: Laura Abbott <labbott@...hat.com>
>> ---
>> The biggest points that need review here:
>> - Is the extra offsetting (subtracting/adding from the stack) correct?
> 
> I think it's a little off; more on that below.
> 
>> - Where else do we need to clear the stack?
> 
> I guess we might need to clear (all of the remainder of) the stack after
> invoking EFI runtime services -- those can run in task context, might
> leave sensitive values on the stack, and they're uninstrumented. The
> same would apply for x86.
> 
> I think we can ignore garbage left on the stack by idle/hotplug, since
> that happens in the idle thread, so we shouldn't be doing uaccess
> transfers on those stacks.
> 
>> - The assembly can almost certainly be optimized. I tried to keep the
>>   behavior of the x86 'repe scasq' and the like where possible. I'm also a
>>   terrible register allocator.
> 
> I tried to steer clear of code golf here, but I didn't entirely manage.
> 
> I also don't know x86 asm, so it's very possible I've just managed to
> confuse myself.
> 

I know enough x86 asm to confuse myself as you can see below

>> ---
>>  arch/arm64/Kconfig                    |  1 +
>>  arch/arm64/include/asm/processor.h    |  3 ++
>>  arch/arm64/kernel/asm-offsets.c       |  3 ++
>>  arch/arm64/kernel/entry.S             | 92 +++++++++++++++++++++++++++++++++++
>>  arch/arm64/kernel/process.c           | 18 +++++++
>>  drivers/firmware/efi/libstub/Makefile |  3 +-
>>  scripts/Makefile.gcc-plugins          |  5 +-
>>  7 files changed, 123 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index 8addb85..0b65bfc 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -17,6 +17,7 @@ config ARM64
>>  	select ARCH_HAS_KCOV
>>  	select ARCH_HAS_SET_MEMORY
>>  	select ARCH_HAS_SG_CHAIN
>> +	select ARCH_HAS_STACKLEAK
>>  	select ARCH_HAS_STRICT_KERNEL_RWX
>>  	select ARCH_HAS_STRICT_MODULE_RWX
>>  	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
>> index 64c9e78..76f2738 100644
>> --- a/arch/arm64/include/asm/processor.h
>> +++ b/arch/arm64/include/asm/processor.h
>> @@ -88,6 +88,9 @@ struct thread_struct {
>>  	unsigned long		fault_address;	/* fault info */
>>  	unsigned long		fault_code;	/* ESR_EL1 value */
>>  	struct debug_info	debug;		/* debugging */
>> +#ifdef CONFIG_STACKLEAK
>> +	unsigned long           lowest_stack;
>> +#endif
>>  };
>>  
>>  #ifdef CONFIG_COMPAT
>> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
>> index b3bb7ef..e0a5ae2 100644
>> --- a/arch/arm64/kernel/asm-offsets.c
>> +++ b/arch/arm64/kernel/asm-offsets.c
>> @@ -43,6 +43,9 @@ int main(void)
>>    DEFINE(TSK_TI_TTBR0,		offsetof(struct task_struct, thread_info.ttbr0));
>>  #endif
>>    DEFINE(TSK_STACK,		offsetof(struct task_struct, stack));
>> +#ifdef CONFIG_STACKLEAK
>> +  DEFINE(TSK_TI_LOWEST_STACK,	offsetof(struct task_struct, thread.lowest_stack));
>> +#endif
>>    BLANK();
>>    DEFINE(THREAD_CPU_CONTEXT,	offsetof(struct task_struct, thread.cpu_context));
>>    BLANK();
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index b738880..e573633 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -744,6 +744,7 @@ ENDPROC(cpu_switch_to)
>>   */
>>  ret_fast_syscall:
>>  	disable_irq				// disable interrupts
>> +	bl	erase_kstack
>>  	str	x0, [sp, #S_X0]			// returned x0
>>  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
>>  	and	x2, x1, #_TIF_SYSCALL_WORK
>> @@ -772,6 +773,7 @@ work_pending:
>>   */
>>  ret_to_user:
>>  	disable_irq				// disable interrupts
>> +	bl	erase_kstack
>>  	ldr	x1, [tsk, #TSK_TI_FLAGS]
>>  	and	x2, x1, #_TIF_WORK_MASK
>>  	cbnz	x2, work_pending
>> @@ -865,3 +867,93 @@ ENTRY(sys_rt_sigreturn_wrapper)
>>  	mov	x0, sp
>>  	b	sys_rt_sigreturn
>>  ENDPROC(sys_rt_sigreturn_wrapper)
>> +
>> +#ifdef CONFIG_STACKLEAK
>> +ENTRY(erase_kstack)
>> +	/* save x0 for the fast path */
>> +	mov	x10, x0
>> +
>> +	get_thread_info	x0
>> +	ldr	x1, [x0, #TSK_TI_LOWEST_STACK]
>> +
>> +	/* get the number of bytes to check for lowest stack */
>> +	mov	x3, x1
>> +	and	x3, x3, #THREAD_SIZE - 1
>> +	lsr	x3, x3, #3
>> +
>> +	/* now generate the start of the stack */
>> +	mov	x4, sp
>> +	movn	x2, #THREAD_SIZE - 1
>> +	and	x1, x4, x2
>> +
>> +	mov	x2, #-0xBEEF	/* stack poison */
>> +
>> +	cmp     x3, #0
>> +	b.eq    4f /* Found nothing, go poison */
>> +
>> +1:
>> +	ldr     x4, [x1, x3, lsl #3]
>> +	cmp     x4, x2  /* did we find the poison? */
>> +	b.eq    2f /* yes we did, go check it */
>> +
>> +	sub     x3, x3, #1
>> +	cmp     x3, #0
>> +	b.eq    4f /* Found nothing, go poison */
>> +	b       1b /* loop again */
>> +
>> +2:
> 
> IIUC, at this point, x1 was the start (lowest addr) of the stack, and x3
> was the quadword index of the first poison on that stack.
> 
> The x86 asm implicitly held that [x1, x3, lsl #3] address in RDI, but we
> don't have a copy...
> 
>> +	cmp     x3, #16
>> +	b.ls    4f /* Go poison if there are less than 16 things left? */
>> +
>> +	mov     x3, #16
> 
> ... and here we blat the index without saving it, meaning we always jump
> to close to the start of the stack, which I don't think was intentional.
> 

Urgh. You are right. I had an index register when I first
started out and then a different bug caused me to erroneously
remove it.

> So then we fall though to the below...
> 
>> +3:
>> +	ldr     x4, [x1, x3, lsl #3]
>> +	cmp     x4, x2  /* did we find the poison? */
>> +	b.ne    1b /* nope we have to check deeper */
>> +
>> +	sub     x3, x3, #1
>> +	cmp     x3, #0
>> +	b.eq    4f /* Found nothing, go poison */
>> +	b       3b /* loop again */
> 
> ... where we either:
> 
> * find 16 contiguous poison entries, leaving x3 == 0, or:
> 
> * we immediately find a non-poison value, and jump back to 1b. If there
>   are 16 non-poison values, we're left with x3 == 0, otherwise we bail
>   out and jump to 4f with x3 in the interval [0,15].
> 
> .... or I've completely confused myself, as suggested above.
> 
> We might not have x86's fancy string instructions, but we could do
> something equally impenetrable:
> 
> 	mov	x5, #0
> 1:
> 	cbz	x3, 4f
> 	ldr	x4, [x1, x3, lsl #3]
> 	cmp	x4, x2
> 	csinc	x5, xzr, x5, ne
> 	tbz	x5, #4, 4f		// found 16 poisons?
> 	sub	x3, x3, #1
> 	b	1b
> 
> That doesn't stop 16 slots early, so it can return any value of x3 all
> the way down to zero.
> 

Seems to work for my testing.

>> +
>> +	/* The poison function */
>> +4:
>> +	/* Generate the address we found */
>> +	add     x5, x1, x3, lsl #3
>> +	orr     x5, x5, #16
> 
> Have you figured out what the forced 16 byte offset is for?
> 
> I've not managed to figure out why it's there, and it looks like
> Alexander couldn't either, judging by his comments in the x86 asm.
>

>From get_wchan in arch/x86/kernel/process.c, it might be be to
account for the start of the frame correctly? I don't have a
definitive answer though and plan on just removing this for the
next version.
 
>> >> +	mov	x4, sp
>> +	/* subtrace the current pointer */
>> +	sub     x8, x4, x5
>> +
>> +	/* subtract one more so we don't touch the current sp */
>> +	sub	x8, x8, #1
>> +
>> +	/* sanity check */
>> +	cmp     x8, #THREAD_SIZE
>> +	b.lo    5f
>> +999:
>> +	b       999b
>> +
>> +5:
>> +	lsr     x8, x8, #3
>> +	mov     x3, x8
>> +6:
>> +	cmp     x3, #0
>> +	b.eq    7f
>> +
>> +	str     x2, [x1, x3, lsl #3]
>> +	sub     x3, x3, #1
>> +	b	6b
>> +
>> +	/* Reset the lowest stack to the top of the stack */
>> +7:
>> +	ldr	x1, [x0, TSK_STACK]
>> +	add	x1, x1, #THREAD_SIZE
>> +	sub	x1, x1, #256
>> +	str	x1, [x0, #TSK_TI_LOWEST_STACK]
> 
> I take it this is the offsetting you were querying?
> 
> I don't think it's quite right. Our stack looks like:
> 
> +---+ <- task_stack_page(p) + THREAD_SIZE
> |   |
> +---+ <- task_stack_page(p) + THREAD_START_SP
> |   |
> |   |
> +---+ <- task_pt_regs(p)
> |   |
> |   |
> |   |
> ~~~~~
> 
> ~~~~~
> |   |
> |   |
> |   |
> +---+ <- task_stack_page(p)
> 
> At the point we return to userspace, sp == task_pt_regs(p).
> 
> Judging by a generated asm-offsets.h, sizeof(struct_pt_regs) is 304
> bytes currently. THREAD_SIZE - THREAD_START_SP == 16.
> 
> We probably want to give that 16 a mnemonic (e.g FRAME_PADDING), and
> have something like:
> 
> 	ldr     x1, [x0, TSK_STACK]
> 	add	x1, x1, #THREAD_SIZE
> 	sub	x1, x1, #(S_FRAME_SIZE + FRAME_PADDING)
> 	str	x1, [x0, #TSK_TI_LOWEST_STACK]
> 

Yes, that looks cleaner. I suspect that even though we weren't
subtracting enough, it still worked because the lowest stack
would get overwritten in track_stack later.

>> +
>> +	mov	x0, x10
>> +	ret
>> +ENDPROC(erase_kstack)
>> +#endif
>> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
>> index 659ae80..1b6cca2 100644
>> --- a/arch/arm64/kernel/process.c
>> +++ b/arch/arm64/kernel/process.c
>> @@ -293,6 +293,12 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
>>  	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
>>  	p->thread.cpu_context.sp = (unsigned long)childregs;
>>  
>> +#ifdef CONFIG_STACKLEAK
>> +	p->thread.lowest_stack = (unsigned long)task_stack_page(p) +
>> +					2 * sizeof(unsigned long);
>> +#endif
> 
> I see this follows the x86 logic, but I don't see why it's necessary to
> offset this end of the stack.
> 
> Do you have an idea as to what this is for?
> 

Again, no and I think I'll again remove it.

>> +
>> +
>>  	ptrace_hw_copy_thread(p);
>>  
>>  	return 0;
>> @@ -417,3 +423,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
>>  	else
>>  		return randomize_page(mm->brk, SZ_1G);
>>  }
>> +
>> +#ifdef CONFIG_STACKLEAK
>> +void __used check_alloca(unsigned long size)
>> +{
>> +	unsigned long sp = (unsigned long)&sp, stack_left;
> 
> Nit: please use current_stack_pointer.
> 

Ack. This should also be cleaned up in the the track_stack
function as well.

> Thanks,
> Mark.
> 

Thanks,
Laura

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.