Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALCETrVvoui0vksdt0Y9rdGL5ipEn_FtSXVVUFdH03ZC93cy_A@mail.gmail.com>
Date: Wed, 5 Dec 2018 15:13:56 -0800
From: Andy Lutomirski <luto@...nel.org>
To: Igor Stoppa <igor.stoppa@...il.com>, linux-arch <linux-arch@...r.kernel.org>, 
	linux-s390 <linux-s390@...r.kernel.org>, Martin Schwidefsky <schwidefsky@...ibm.com>, 
	Heiko Carstens <heiko.carstens@...ibm.com>, 
	Benjamin Herrenschmidt <benh@...nel.crashing.org>
Cc: Kees Cook <keescook@...omium.org>, Matthew Wilcox <willy@...radead.org>, 
	Igor Stoppa <igor.stoppa@...wei.com>, Nadav Amit <nadav.amit@...il.com>, 
	Peter Zijlstra <peterz@...radead.org>, Dave Hansen <dave.hansen@...ux.intel.com>, 
	linux-integrity <linux-integrity@...r.kernel.org>, 
	Kernel Hardening <kernel-hardening@...ts.openwall.com>, Linux-MM <linux-mm@...ck.org>, 
	LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 2/6] __wr_after_init: write rare for static allocation

I added some s390 and powerpc people.

On Tue, Dec 4, 2018 at 4:18 AM Igor Stoppa <igor.stoppa@...il.com> wrote:
>
> Implementation of write rare for statically allocated data, located in a
> specific memory section through the use of the __write_rare label.
>
> The basic functions are:
> - wr_memset(): write rare counterpart of memset()
> - wr_memcpy(): write rare counterpart of memcpy()
> - wr_assign(): write rare counterpart of the assignment ('=') operator
> - wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()
>
> The implementation is based on code from Andy Lutomirski and Nadav Amit
> for patching the text on x86 [here goes reference to commits, once merged]
>
> The modification of write protected data is done through an alternate
> mapping of the same pages, as writable.
> This mapping is local to each core and is active only for the duration
> of each write operation.
> Local interrupts are disabled, while the alternate mapping is active.
>
> In theory, it could introduce a non-predictable delay, in a preemptible
> system, however the amount of data to be altered is likely to be far
> smaller than a page.
>
> Signed-off-by: Igor Stoppa <igor.stoppa@...wei.com>
>
> CC: Andy Lutomirski <luto@...capital.net>
> CC: Nadav Amit <nadav.amit@...il.com>
> CC: Matthew Wilcox <willy@...radead.org>
> CC: Peter Zijlstra <peterz@...radead.org>
> CC: Kees Cook <keescook@...omium.org>
> CC: Dave Hansen <dave.hansen@...ux.intel.com>
> CC: linux-integrity@...r.kernel.org
> CC: kernel-hardening@...ts.openwall.com
> CC: linux-mm@...ck.org
> CC: linux-kernel@...r.kernel.org
> ---
>  include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
>  init/main.c           |   2 +
>  mm/Kconfig            |   4 ++
>  mm/Makefile           |   1 +
>  mm/prmem.c            | 124 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 264 insertions(+)
>  create mode 100644 include/linux/prmem.h
>  create mode 100644 mm/prmem.c
>
> diff --git a/include/linux/prmem.h b/include/linux/prmem.h
> new file mode 100644
> index 000000000000..b0131c1f5dc0
> --- /dev/null
> +++ b/include/linux/prmem.h
> @@ -0,0 +1,133 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * prmem.h: Header for memory protection library
> + *
> + * (C) Copyright 2018 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@...wei.com>
> + *
> + * Support for:
> + * - statically allocated write rare data
> + */
> +
> +#ifndef _LINUX_PRMEM_H
> +#define _LINUX_PRMEM_H
> +
> +#include <linux/set_memory.h>
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/string.h>
> +#include <linux/slab.h>
> +#include <linux/mutex.h>
> +#include <linux/compiler.h>
> +#include <linux/irqflags.h>
> +
> +/**
> + * memtst() - test n bytes of the source to match the c value
> + * @p: beginning of the memory to test
> + * @c: byte to compare against
> + * @len: amount of bytes to test
> + *
> + * Returns 0 on success, non-zero otherwise.
> + */
> +static inline int memtst(void *p, int c, __kernel_size_t len)
> +{
> +       __kernel_size_t i;
> +
> +       for (i = 0; i < len; i++) {
> +               u8 d =  *(i + (u8 *)p) - (u8)c;
> +
> +               if (unlikely(d))
> +                       return d;
> +       }
> +       return 0;
> +}
> +
> +
> +#ifndef CONFIG_PRMEM
> +
> +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> +{
> +       return memset(p, c, len);
> +}
> +
> +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> +{
> +       return memcpy(p, q, size);
> +}
> +
> +#define wr_assign(var, val)    ((var) = (val))
> +
> +#define wr_rcu_assign_pointer(p, v)    \
> +       rcu_assign_pointer(p, v)
> +
> +#else
> +
> +enum wr_op_type {
> +       WR_MEMCPY,
> +       WR_MEMSET,
> +       WR_RCU_ASSIGN_PTR,
> +       WR_OPS_NUMBER,
> +};
> +
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> +             enum wr_op_type op);
> +
> +/**
> + * wr_memset() - sets n bytes of the destination to the c value
> + * @p: beginning of the memory to write to
> + * @c: byte to replicate
> + * @len: amount of bytes to copy
> + *
> + * Returns true on success, false otherwise.
> + */
> +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> +{
> +       return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
> +}
> +
> +/**
> + * wr_memcpy() - copyes n bytes from source to destination
> + * @dst: beginning of the memory to write to
> + * @src: beginning of the memory to read from
> + * @n_bytes: amount of bytes to copy
> + *
> + * Returns pointer to the destination
> + */
> +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> +{
> +       return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
> +}
> +
> +/**
> + * wr_assign() - sets a write-rare variable to a specified value
> + * @var: the variable to set
> + * @val: the new value
> + *
> + * Returns: the variable
> + *
> + * Note: it might be possible to optimize this, to use wr_memset in some
> + * cases (maybe with NULL?).
> + */
> +
> +#define wr_assign(var, val) ({                 \
> +       typeof(var) tmp = (typeof(var))val;     \
> +                                               \
> +       wr_memcpy(&var, &tmp, sizeof(var));     \
> +       var;                                    \
> +})
> +
> +/**
> + * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
> + * @p: the rcu pointer
> + * @v: the new value
> + *
> + * Returns the value assigned to the rcu pointer.
> + *
> + * It is provided as macro, to match rcu_assign_pointer()
> + */
> +#define wr_rcu_assign_pointer(p, v) ({                                 \
> +       __wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR);    \
> +       p;                                                              \
> +})
> +#endif
> +#endif
> diff --git a/init/main.c b/init/main.c
> index a461150adfb1..a36f2e54f937 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -498,6 +498,7 @@ void __init __weak thread_stack_cache_init(void)
>  void __init __weak mem_encrypt_init(void) { }
>
>  void __init __weak poking_init(void) { }
> +void __init __weak wr_poking_init(void) { }
>
>  bool initcall_debug;
>  core_param(initcall_debug, initcall_debug, bool, 0644);
> @@ -734,6 +735,7 @@ asmlinkage __visible void __init start_kernel(void)
>         delayacct_init();
>
>         poking_init();
> +       wr_poking_init();
>         check_bugs();
>
>         acpi_subsystem_init();
> diff --git a/mm/Kconfig b/mm/Kconfig
> index d85e39da47ae..9b09339c027f 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -142,6 +142,10 @@ config ARCH_DISCARD_MEMBLOCK
>  config MEMORY_ISOLATION
>         bool
>
> +config PRMEM
> +       def_bool n
> +       depends on STRICT_KERNEL_RWX && X86_64
> +
>  #
>  # Only be set on architectures that have completely implemented memory hotplug
>  # feature. If you are not sure, don't touch it.
> diff --git a/mm/Makefile b/mm/Makefile
> index d210cc9d6f80..ef3867c16ce0 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -58,6 +58,7 @@ obj-$(CONFIG_SPARSEMEM)       += sparse.o
>  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
>  obj-$(CONFIG_SLOB) += slob.o
>  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> +obj-$(CONFIG_PRMEM) += prmem.o
>  obj-$(CONFIG_KSM) += ksm.o
>  obj-$(CONFIG_PAGE_POISONING) += page_poison.o
>  obj-$(CONFIG_SLAB) += slab.o
> diff --git a/mm/prmem.c b/mm/prmem.c
> new file mode 100644
> index 000000000000..e8ab76701831
> --- /dev/null
> +++ b/mm/prmem.c
> @@ -0,0 +1,124 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * prmem.c: Memory Protection Library
> + *
> + * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@...wei.com>
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/string.h>
> +#include <linux/compiler.h>
> +#include <linux/slab.h>
> +#include <linux/mmu_context.h>
> +#include <linux/rcupdate.h>
> +#include <linux/prmem.h>
> +
> +static __ro_after_init bool wr_ready;
> +static __ro_after_init struct mm_struct *wr_poking_mm;
> +static __ro_after_init unsigned long wr_poking_base;
> +
> +/*
> + * The following two variables are statically allocated by the linker
> + * script at the the boundaries of the memory region (rounded up to
> + * multiples of PAGE_SIZE) reserved for __wr_after_init.
> + */
> +extern long __start_wr_after_init;
> +extern long __end_wr_after_init;
> +
> +static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
> +{
> +       unsigned long start = (unsigned long)&__start_wr_after_init;
> +       unsigned long end = (unsigned long)&__end_wr_after_init;
> +       unsigned long low = ptr;
> +       unsigned long high = ptr + size;
> +
> +       return likely(start <= low && low <= high && high <= end);
> +}
> +
> +
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> +             enum wr_op_type op)
> +{

You might end up wanting something like:

#ifdef __arch_wr_op
return __arch_wr_op(...);
#endif

if an arch (s390? powerpc?) decides to have a totally different
implementation of this.

Hi s390 and powerpc people: it would be nice if this generic
implementation *worked* on your architectures and that it will allow
you to add some straightforward way to add a better arch-specific
implementation if you think that would be better.

--Andy

> +       temporary_mm_state_t prev;
> +       unsigned long flags;
> +       unsigned long offset;
> +       unsigned long wr_poking_addr;
> +
> +       /* Confirm that the writable mapping exists. */
> +       BUG_ON(!wr_ready);
> +
> +       if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> +           WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> +               return (void *)dst;
> +
> +       offset = dst - (unsigned long)&__start_wr_after_init;
> +       wr_poking_addr = wr_poking_base + offset;
> +       local_irq_save(flags);
> +       prev = use_temporary_mm(wr_poking_mm);
> +
> +       kasan_disable_current();
> +       if (op == WR_MEMCPY)
> +               memcpy((void *)wr_poking_addr, (void *)src, len);
> +       else if (op == WR_MEMSET)
> +               memset((u8 *)wr_poking_addr, (u8)src, len);
> +       else if (op == WR_RCU_ASSIGN_PTR)
> +               /* generic version of rcu_assign_pointer */
> +               smp_store_release((void **)wr_poking_addr,
> +                                 RCU_INITIALIZER((void **)src));
> +       kasan_enable_current();

Hmm.  I suspect this will explode quite badly on sane architectures
like s390.  (In my book, despite how weird s390 is, it has a vastly
nicer model of "user" memory than any other architecture I know
of...).  I think you should use copy_to_user(), etc, instead.  I'm not
entirely sure what the best smp_store_release() replacement is.
Making this change may also mean you can get rid of the
kasan_disable_current().

> +
> +       barrier(); /* XXX redundant? */

I think it's redundant.  If unuse_temporary_mm() allows earlier stores
to hit the wrong address space, then something is very very wrong, and
something is also very very wrong if the optimizer starts moving
stores across a function call that is most definitely a barrier.

> +
> +       unuse_temporary_mm(prev);
> +       /* XXX make the verification optional? */
> +       if (op == WR_MEMCPY)
> +               BUG_ON(memcmp((void *)dst, (void *)src, len));
> +       else if (op == WR_MEMSET)
> +               BUG_ON(memtst((void *)dst, (u8)src, len));
> +       else if (op == WR_RCU_ASSIGN_PTR)
> +               BUG_ON(*(unsigned long *)dst != src);

Hmm.  If you allowed cmpxchg or even plain xchg, then these bug_ons
would be thoroughly buggy, but maybe they're okay.  But they should,
at most, be WARN_ON_ONCE(), given that you can trigger them by writing
the same addresses from two threads at once, and this isn't even
entirely obviously bogus given the presence of smp_store_release().

> +       local_irq_restore(flags);
> +       return (void *)dst;
> +}
> +
> +struct mm_struct *copy_init_mm(void);
> +void __init wr_poking_init(void)
> +{
> +       unsigned long start = (unsigned long)&__start_wr_after_init;
> +       unsigned long end = (unsigned long)&__end_wr_after_init;
> +       unsigned long i;
> +       unsigned long wr_range;
> +
> +       wr_poking_mm = copy_init_mm();
> +       BUG_ON(!wr_poking_mm);
> +
> +       /* XXX What if it's too large to fit in the task unmapped mem? */
> +       wr_range = round_up(end - start, PAGE_SIZE);
> +
> +       /* Randomize the poking address base*/
> +       wr_poking_base = TASK_UNMAPPED_BASE +
> +               (kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> +               (TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));
> +
> +       /* Create alternate mapping for the entire wr_after_init range. */
> +       for (i = start; i < end; i += PAGE_SIZE) {
> +               struct page *page;
> +               spinlock_t *ptl;
> +               pte_t pte;
> +               pte_t *ptep;
> +               unsigned long wr_poking_addr;
> +
> +               BUG_ON(!(page = virt_to_page(i)));
> +               wr_poking_addr = i - start + wr_poking_base;
> +
> +               /* The lock is not needed, but avoids open-coding. */
> +               ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
> +               VM_BUG_ON(!ptep);
> +
> +               pte = mk_pte(page, PAGE_KERNEL);
> +               set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
> +               spin_unlock(ptl);
> +       }
> +       wr_ready = true;
> +}
> --
> 2.19.1
>

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.