Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKv+Gu86oVvGb75S2x-44ct4sdQt4zT-3HbPqMJo_X+Cm3EPVQ@mail.gmail.com>
Date: Wed, 13 Jan 2016 10:58:55 +0100
From: Ard Biesheuvel <ard.biesheuvel@...aro.org>
To: Mark Rutland <mark.rutland@....com>
Cc: "linux-arm-kernel@...ts.infradead.org" <linux-arm-kernel@...ts.infradead.org>, 
	kernel-hardening@...ts.openwall.com, Will Deacon <will.deacon@....com>, 
	Catalin Marinas <catalin.marinas@....com>, Leif Lindholm <leif.lindholm@...aro.org>, 
	Kees Cook <keescook@...omium.org>, 
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>, Stuart Yoder <stuart.yoder@...escale.com>, 
	Sharma Bhupesh <bhupesh.sharma@...escale.com>, Arnd Bergmann <arnd@...db.de>, 
	Marc Zyngier <marc.zyngier@....com>, Christoffer Dall <christoffer.dall@...aro.org>
Subject: Re: [PATCH v3 07/21] arm64: move kernel image to base of vmalloc area

On 13 January 2016 at 09:39, Ard Biesheuvel <ard.biesheuvel@...aro.org> wrote:
> On 12 January 2016 at 19:14, Mark Rutland <mark.rutland@....com> wrote:
>> On Mon, Jan 11, 2016 at 02:19:00PM +0100, Ard Biesheuvel wrote:
>>> This moves the module area to right before the vmalloc area, and
>>> moves the kernel image to the base of the vmalloc area. This is
>>> an intermediate step towards implementing kASLR, where the kernel
>>> image can be located anywhere in the vmalloc area.
>>>
>>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@...aro.org>
>>> ---
>>>  arch/arm64/include/asm/kasan.h          | 20 ++++---
>>>  arch/arm64/include/asm/kernel-pgtable.h |  5 +-
>>>  arch/arm64/include/asm/memory.h         | 18 ++++--
>>>  arch/arm64/include/asm/pgtable.h        |  7 ---
>>>  arch/arm64/kernel/setup.c               | 12 ++++
>>>  arch/arm64/mm/dump.c                    | 12 ++--
>>>  arch/arm64/mm/init.c                    | 20 +++----
>>>  arch/arm64/mm/kasan_init.c              | 21 +++++--
>>>  arch/arm64/mm/mmu.c                     | 62 ++++++++++++++------
>>>  9 files changed, 118 insertions(+), 59 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h
>>> index de0d21211c34..2c583dbf4746 100644
>>> --- a/arch/arm64/include/asm/kasan.h
>>> +++ b/arch/arm64/include/asm/kasan.h
>>> @@ -1,20 +1,16 @@
>>>  #ifndef __ASM_KASAN_H
>>>  #define __ASM_KASAN_H
>>>
>>> -#ifndef __ASSEMBLY__
>>> -
>>>  #ifdef CONFIG_KASAN
>>>
>>>  #include <linux/linkage.h>
>>> -#include <asm/memory.h>
>>> -#include <asm/pgtable-types.h>
>>>
>>>  /*
>>>   * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
>>>   * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses.
>>>   */
>>> -#define KASAN_SHADOW_START      (VA_START)
>>> -#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1UL << (VA_BITS - 3)))
>>> +#define KASAN_SHADOW_START   (VA_START)
>>> +#define KASAN_SHADOW_END     (KASAN_SHADOW_START + (_AC(1, UL) << (VA_BITS - 3)))
>>>
>>>  /*
>>>   * This value is used to map an address to the corresponding shadow
>>> @@ -26,16 +22,22 @@
>>>   * should satisfy the following equation:
>>>   *      KASAN_SHADOW_OFFSET = KASAN_SHADOW_END - (1ULL << 61)
>>>   */
>>> -#define KASAN_SHADOW_OFFSET     (KASAN_SHADOW_END - (1ULL << (64 - 3)))
>>> +#define KASAN_SHADOW_OFFSET  (KASAN_SHADOW_END - (_AC(1, ULL) << (64 - 3)))
>>> +
>>
>> I couldn't immediately spot where KASAN_SHADOW_* were used in assembly.
>> I guess there's some other definition built atop of them that I've
>> missed.
>>
>> Where should I be looking?
>>
>
> Well, the problem is that KIMAGE_VADDR will be defined in terms of
> KASAN_SHADOW_END if KASAN is enabled. But since KASAN always uses the
> first 1/8 of that VA space, I am going to rework this so that the
> non-KASAN constants never depend on the actual values but only on
> CONFIG_KASAN
>
>>> +#ifndef __ASSEMBLY__
>>> +#include <asm/pgtable-types.h>
>>>
>>>  void kasan_init(void);
>>>  void kasan_copy_shadow(pgd_t *pgdir);
>>>  asmlinkage void kasan_early_init(void);
>>> +#endif
>>>
>>>  #else
>>> +
>>> +#ifndef __ASSEMBLY__
>>>  static inline void kasan_init(void) { }
>>>  static inline void kasan_copy_shadow(pgd_t *pgdir) { }
>>>  #endif
>>>
>>> -#endif
>>> -#endif
>>> +#endif /* CONFIG_KASAN */
>>> +#endif /* __ASM_KASAN_H */
>>> diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
>>> index a459714ee29e..daa8a7b9917a 100644
>>> --- a/arch/arm64/include/asm/kernel-pgtable.h
>>> +++ b/arch/arm64/include/asm/kernel-pgtable.h
>>> @@ -70,8 +70,9 @@
>>>  /*
>>>   * Initial memory map attributes.
>>>   */
>>> -#define SWAPPER_PTE_FLAGS    (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
>>> -#define SWAPPER_PMD_FLAGS    (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
>>> +#define SWAPPER_PTE_FLAGS    (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED | PTE_UXN)
>>> +#define SWAPPER_PMD_FLAGS    (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S | \
>>> +                              PMD_SECT_UXN)
>>
>> This will only affect the tables created in head.S. Before we start
>> userspace we'll have switched over to a new set of tables using
>> PAGE_KERNEL (including UXN).
>>
>> Given that, this doesn't look necessary for the vmalloc area changes. Am
>> I missing something?
>>
>
> No, this was carried over from an older version of the series, when
> the kernel mapping, after having been moved below PAGE_OFFSET, would
> not be overridden by the memblock based linear mapping routines, and
> so would be missing the UXN bit. But with your changes, this can indeed
> be dropped.
>
>>>  #if ARM64_SWAPPER_USES_SECTION_MAPS
>>>  #define SWAPPER_MM_MMUFLAGS  (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
>>> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
>>> index bea9631b34a8..e45d3141ad98 100644
>>> --- a/arch/arm64/include/asm/memory.h
>>> +++ b/arch/arm64/include/asm/memory.h
>>> @@ -51,14 +51,24 @@
>>>  #define VA_BITS                      (CONFIG_ARM64_VA_BITS)
>>>  #define VA_START             (UL(0xffffffffffffffff) << VA_BITS)
>>>  #define PAGE_OFFSET          (UL(0xffffffffffffffff) << (VA_BITS - 1))
>>> -#define KIMAGE_VADDR         (PAGE_OFFSET)
>>> -#define MODULES_END          (KIMAGE_VADDR)
>>> -#define MODULES_VADDR                (MODULES_END - SZ_64M)
>>> -#define PCI_IO_END           (MODULES_VADDR - SZ_2M)
>>> +#define PCI_IO_END           (PAGE_OFFSET - SZ_2M)
>>>  #define PCI_IO_START         (PCI_IO_END - PCI_IO_SIZE)
>>>  #define FIXADDR_TOP          (PCI_IO_START - SZ_2M)
>>>  #define TASK_SIZE_64         (UL(1) << VA_BITS)
>>>
>>> +#ifndef CONFIG_KASAN
>>> +#define MODULES_VADDR                (VA_START)
>>> +#else
>>> +#include <asm/kasan.h>
>>> +#define MODULES_VADDR                (KASAN_SHADOW_END)
>>> +#endif
>>> +
>>> +#define MODULES_VSIZE                (SZ_64M)
>>> +#define MODULES_END          (MODULES_VADDR + MODULES_VSIZE)
>>> +
>>> +#define KIMAGE_VADDR         (MODULES_END)
>>> +#define VMALLOC_START                (MODULES_END)
>>> +
>>>  #ifdef CONFIG_COMPAT
>>>  #define TASK_SIZE_32         UL(0x100000000)
>>>  #define TASK_SIZE            (test_thread_flag(TIF_32BIT) ? \
>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>> index 7b4e16068c9f..a910a44d7ab3 100644
>>> --- a/arch/arm64/include/asm/pgtable.h
>>> +++ b/arch/arm64/include/asm/pgtable.h
>>> @@ -42,13 +42,6 @@
>>>   */
>>>  #define VMEMMAP_SIZE         ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
>>>
>>> -#ifndef CONFIG_KASAN
>>> -#define VMALLOC_START                (VA_START)
>>> -#else
>>> -#include <asm/kasan.h>
>>> -#define VMALLOC_START                (KASAN_SHADOW_END + SZ_64K)
>>> -#endif
>>> -
>>>  #define VMALLOC_END          (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
>>
>> It's a shame VMALLOC_START and VMALLOC_END are now in different headers.
>> It would be nice if we could keep them together.
>>
>> As VMEMMAP_SIZE depends on sizeof(struct page), it's not just a simple
>> move. We could either place that in the !__ASSEMBLY__ portion of
>> memory.h, or we could add S_PAGE to asm-offsets.
>>
>> If that's too painful now, we can leave that for subsequent cleanup;
>> there's other stuff in that area I'd like to unify at some point (e.g.
>> the mem_init and dump.c section boundary descriptions).
>>
>
> No, I think I can probably do a bit better than this. I will address it in v4
>
>>>
>>>  #define vmemmap                      ((struct page *)(VMALLOC_END + SZ_64K))
>>> diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
>>> index cfed56f0ad26..c67ba4453ec6 100644
>>> --- a/arch/arm64/kernel/setup.c
>>> +++ b/arch/arm64/kernel/setup.c
>>> @@ -53,6 +53,7 @@
>>>  #include <asm/cpufeature.h>
>>>  #include <asm/cpu_ops.h>
>>>  #include <asm/kasan.h>
>>> +#include <asm/kernel-pgtable.h>
>>>  #include <asm/sections.h>
>>>  #include <asm/setup.h>
>>>  #include <asm/smp_plat.h>
>>> @@ -291,6 +292,17 @@ u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
>>>
>>>  void __init setup_arch(char **cmdline_p)
>>>  {
>>> +     static struct vm_struct vmlinux_vm;
>>> +
>>> +     vmlinux_vm.addr         = (void *)KIMAGE_VADDR;
>>> +     vmlinux_vm.size         = round_up((u64)_end - KIMAGE_VADDR,
>>> +                                        SWAPPER_BLOCK_SIZE);
>>
>> With the fine grained tables we should only need to round up to
>> PAGE_SIZE (though _end is implicitly page-aligned anyway). Given that,
>> is the SWAPPER_BLOCK_SIZE rounding necessary?
>>
>
> No, probably not.
>
>>> +     vmlinux_vm.phys_addr    = __pa(KIMAGE_VADDR);
>>> +     vmlinux_vm.flags        = VM_MAP;
>>
>> I was going to say we should set VM_KASAN also per its description in
>> include/vmalloc.h, though per its uses its not clear if it will ever
>> matter.
>>
>
> No, we shouldn't. Even if we are never going to unmap this vma,
> setting the flag will result in the shadow area being freed using
> vfree(), while it was not allocated via vmalloc() so that is likely to
> cause trouble.
>
>>> +     vmlinux_vm.caller       = setup_arch;
>>> +
>>> +     vm_area_add_early(&vmlinux_vm);
>>
>> Do we need to register the kernel VA range quite this early, or could we
>> do this around paging_init/map_kernel time?
>>
>
> No. Locally, I moved it into map_kernel_chunk, so that we have
> separate areas for _text, _init and _data, and we can unmap the _init
> entirely rather than only stripping the exec bit. I haven't quite
> figured out how to get rid of the vma area, but perhaps it make sense
> to keep it reserved, so that modules don't end up there later (which
> is possible with the module region randomization I have implemented
> for v4) since I don't know how well things like kallsyms etc cope with
> that.
>
>>> +
>>>       pr_info("Boot CPU: AArch64 Processor [%08x]\n", read_cpuid_id());
>>>
>>>       sprintf(init_utsname()->machine, ELF_PLATFORM);
>>> diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
>>> index 5a22a119a74c..e83ffb00560c 100644
>>> --- a/arch/arm64/mm/dump.c
>>> +++ b/arch/arm64/mm/dump.c
>>> @@ -35,7 +35,9 @@ struct addr_marker {
>>>  };
>>>
>>>  enum address_markers_idx {
>>> -     VMALLOC_START_NR = 0,
>>> +     MODULES_START_NR = 0,
>>> +     MODULES_END_NR,
>>> +     VMALLOC_START_NR,
>>>       VMALLOC_END_NR,
>>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>>       VMEMMAP_START_NR,
>>> @@ -45,12 +47,12 @@ enum address_markers_idx {
>>>       FIXADDR_END_NR,
>>>       PCI_START_NR,
>>>       PCI_END_NR,
>>> -     MODULES_START_NR,
>>> -     MODUELS_END_NR,
>>>       KERNEL_SPACE_NR,
>>>  };
>>>
>>>  static struct addr_marker address_markers[] = {
>>> +     { MODULES_VADDR,        "Modules start" },
>>> +     { MODULES_END,          "Modules end" },
>>>       { VMALLOC_START,        "vmalloc() Area" },
>>>       { VMALLOC_END,          "vmalloc() End" },
>>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>> @@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = {
>>>       { FIXADDR_TOP,          "Fixmap end" },
>>>       { PCI_IO_START,         "PCI I/O start" },
>>>       { PCI_IO_END,           "PCI I/O end" },
>>> -     { MODULES_VADDR,        "Modules start" },
>>> -     { MODULES_END,          "Modules end" },
>>> -     { PAGE_OFFSET,          "Kernel Mapping" },
>>> +     { PAGE_OFFSET,          "Linear Mapping" },
>>>       { -1,                   NULL },
>>>  };
>>>
>>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>>> index f3b061e67bfe..baa923bda651 100644
>>> --- a/arch/arm64/mm/init.c
>>> +++ b/arch/arm64/mm/init.c
>>> @@ -302,22 +302,26 @@ void __init mem_init(void)
>>>  #ifdef CONFIG_KASAN
>>>                 "    kasan   : 0x%16lx - 0x%16lx   (%6ld GB)\n"
>>>  #endif
>>> +               "    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n"
>>>                 "    vmalloc : 0x%16lx - 0x%16lx   (%6ld GB)\n"
>>> +               "      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n"
>>> +               "      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n"
>>> +               "      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n"
>>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>>                 "    vmemmap : 0x%16lx - 0x%16lx   (%6ld GB maximum)\n"
>>>                 "              0x%16lx - 0x%16lx   (%6ld MB actual)\n"
>>>  #endif
>>>                 "    fixed   : 0x%16lx - 0x%16lx   (%6ld KB)\n"
>>>                 "    PCI I/O : 0x%16lx - 0x%16lx   (%6ld MB)\n"
>>> -               "    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n"
>>> -               "    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n"
>>> -               "      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n"
>>> -               "      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n"
>>> -               "      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n",
>>> +               "    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n",
>>>  #ifdef CONFIG_KASAN
>>>                 MLG(KASAN_SHADOW_START, KASAN_SHADOW_END),
>>>  #endif
>>> +               MLM(MODULES_VADDR, MODULES_END),
>>>                 MLG(VMALLOC_START, VMALLOC_END),
>>> +               MLK_ROUNDUP(__init_begin, __init_end),
>>> +               MLK_ROUNDUP(_text, _etext),
>>> +               MLK_ROUNDUP(_sdata, _edata),
>>>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>>>                 MLG((unsigned long)vmemmap,
>>>                     (unsigned long)vmemmap + VMEMMAP_SIZE),
>>> @@ -326,11 +330,7 @@ void __init mem_init(void)
>>>  #endif
>>>                 MLK(FIXADDR_START, FIXADDR_TOP),
>>>                 MLM(PCI_IO_START, PCI_IO_END),
>>> -               MLM(MODULES_VADDR, MODULES_END),
>>> -               MLM(PAGE_OFFSET, (unsigned long)high_memory),
>>> -               MLK_ROUNDUP(__init_begin, __init_end),
>>> -               MLK_ROUNDUP(_text, _etext),
>>> -               MLK_ROUNDUP(_sdata, _edata));
>>> +               MLM(PAGE_OFFSET, (unsigned long)high_memory));
>>>
>>>  #undef MLK
>>>  #undef MLM
>>> diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
>>> index 0ca411fc5ea3..acdd1ac166ec 100644
>>> --- a/arch/arm64/mm/kasan_init.c
>>> +++ b/arch/arm64/mm/kasan_init.c
>>> @@ -17,9 +17,11 @@
>>>  #include <linux/start_kernel.h>
>>>
>>>  #include <asm/mmu_context.h>
>>> +#include <asm/kernel-pgtable.h>
>>>  #include <asm/page.h>
>>>  #include <asm/pgalloc.h>
>>>  #include <asm/pgtable.h>
>>> +#include <asm/sections.h>
>>>  #include <asm/tlbflush.h>
>>>
>>>  static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
>>> @@ -33,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
>>>       if (pmd_none(*pmd))
>>>               pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
>>>
>>> -     pte = pte_offset_kernel(pmd, addr);
>>> +     pte = pte_offset_kimg(pmd, addr);
>>>       do {
>>>               next = addr + PAGE_SIZE;
>>>               set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page),
>>> @@ -51,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud,
>>>       if (pud_none(*pud))
>>>               pud_populate(&init_mm, pud, kasan_zero_pmd);
>>>
>>> -     pmd = pmd_offset(pud, addr);
>>> +     pmd = pmd_offset_kimg(pud, addr);
>>>       do {
>>>               next = pmd_addr_end(addr, end);
>>>               kasan_early_pte_populate(pmd, addr, next);
>>> @@ -68,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd,
>>>       if (pgd_none(*pgd))
>>>               pgd_populate(&init_mm, pgd, kasan_zero_pud);
>>>
>>> -     pud = pud_offset(pgd, addr);
>>> +     pud = pud_offset_kimg(pgd, addr);
>>>       do {
>>>               next = pud_addr_end(addr, end);
>>>               kasan_early_pmd_populate(pud, addr, next);
>>> @@ -126,8 +128,14 @@ static void __init clear_pgds(unsigned long start,
>>>
>>>  void __init kasan_init(void)
>>>  {
>>> +     u64 kimg_shadow_start, kimg_shadow_end;
>>>       struct memblock_region *reg;
>>>
>>> +     kimg_shadow_start = round_down((u64)kasan_mem_to_shadow(_text),
>>> +                                    SWAPPER_BLOCK_SIZE);
>>> +     kimg_shadow_end = round_up((u64)kasan_mem_to_shadow(_end),
>>> +                                SWAPPER_BLOCK_SIZE);
>>
>> This rounding looks suspect to me, given it's applied to the shadow
>> addresses rather than the kimage addresses. That's roughly equivalent to
>> kasan_mem_to_shadow(round_up(_end, 8 * SWAPPER_BLOCK_SIZE).
>>
>> I don't think we need any rounding for the kimage addresses. The image
>> end is page-granular (and the fine-grained mapping will reflect that).
>> Any accesses between _end and roud_up(_end, SWAPPER_BLOCK_SIZE) would be
>> bugs (and would most likely fault) regardless of KASAN.
>>
>> Or am I just being thick here?
>>
>
> Well, the problem here is that vmemmap_populate() is used as a
> surrogate vmalloc() since that is not available yet, and
> vmemmap_populate() allocates in SWAPPER_BLOCK_SIZE granularity.
> If I remove the rounding, I get false positive kasan errors which I
> have not quite diagnosed yet, but are probably due to the fact that
> the rounding performed by vmemmap_populate() goes in the wrong
> direction.
>
> I do wonder what that means for memblocks that are not multiples of 16
> MB, though (below)
>
>>> +
>>>       /*
>>>        * We are going to perform proper setup of shadow memory.
>>>        * At first we should unmap early shadow (clear_pgds() call bellow).
>>> @@ -141,8 +149,13 @@ void __init kasan_init(void)
>>>
>>>       clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
>>>
>>> +     vmemmap_populate(kimg_shadow_start, kimg_shadow_end,
>>> +                      pfn_to_nid(virt_to_pfn(kimg_shadow_start)));
>>
>> That virt_to_pfn doesn't look right -- kimg_shadow_start is neither a
>> linear address nor an image address. As pfn_to_nid is hard-coded to 0
>> for !NUMA this happens to be ok for us for the moment.
>>
>> I think we should follow the x86 KASAN code and use NUMA_NO_NODE for
>> this for now.
>>
>
> Ack
>
>>> +
>>>       kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
>>> -                     kasan_mem_to_shadow((void *)MODULES_VADDR));
>>> +                                (void *)kimg_shadow_start);
>>> +     kasan_populate_zero_shadow((void *)kimg_shadow_end,
>>> +                                kasan_mem_to_shadow((void *)PAGE_OFFSET));
>>>
>>>       for_each_memblock(memory, reg) {
>>>               void *start = (void *)__phys_to_virt(reg->base);
>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>> index 75b5f0dc3bdc..0b28f1469f9b 100644
>>> --- a/arch/arm64/mm/mmu.c
>>> +++ b/arch/arm64/mm/mmu.c
>>> @@ -53,6 +53,10 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
>>>  unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
>>>  EXPORT_SYMBOL(empty_zero_page);
>>>
>>> +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
>>> +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
>>> +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
>>> +
>>>  pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>>                             unsigned long size, pgprot_t vma_prot)
>>>  {
>>> @@ -349,14 +353,14 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
>>>  {
>>>
>>>       unsigned long kernel_start = __pa(_stext);
>>> -     unsigned long kernel_end = __pa(_end);
>>> +     unsigned long kernel_end = __pa(_etext);
>>>
>>>       /*
>>> -      * The kernel itself is mapped at page granularity. Map all other
>>> -      * memory, making sure we don't overwrite the existing kernel mappings.
>>> +      * Take care not to create a writable alias for the
>>> +      * read-only text and rodata sections of the kernel image.
>>>        */
>>>
>>> -     /* No overlap with the kernel. */
>>> +     /* No overlap with the kernel text */
>>>       if (end < kernel_start || start >= kernel_end) {
>>>               __create_pgd_mapping(pgd, start, __phys_to_virt(start),
>>>                                    end - start, PAGE_KERNEL,
>>> @@ -365,7 +369,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
>>>       }
>>>
>>>       /*
>>> -      * This block overlaps the kernel mapping. Map the portion(s) which
>>> +      * This block overlaps the kernel text mapping. Map the portion(s) which
>>>        * don't overlap.
>>>        */
>>>       if (start < kernel_start)
>>> @@ -438,12 +442,29 @@ static void __init map_kernel(pgd_t *pgd)
>>>       map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC);
>>>       map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL);
>>>
>>> -     /*
>>> -      * The fixmap falls in a separate pgd to the kernel, and doesn't live
>>> -      * in the carveout for the swapper_pg_dir. We can simply re-use the
>>> -      * existing dir for the fixmap.
>>> -      */
>>> -     set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START));
>>> +     if (pgd_index(FIXADDR_START) != pgd_index((u64)_end)) {
>>
>> To match the style of early_fixmap_init, and given we already mapped the
>> kernel image, this could be:
>>
>>         if (pgd_none(pgd_offset_raw(pgd, FIXADDR_START))) {
>>
>> Which also serves as a run-time check that the pgd entry really was
>> clear.
>>
>
> Yes, that looks better. I will steal that :-)
>

OK, that doesn't work. pgd_none() is hardcoded to 'false' when running
with fewer than 4 pgtable levels, and so we always hit the BUG() here.


>> Other than that, this looks good to me!
>>
>
> Thanks!
>
>>> +             /*
>>> +              * The fixmap falls in a separate pgd to the kernel, and doesn't
>>> +              * live in the carveout for the swapper_pg_dir. We can simply
>>> +              * re-use the existing dir for the fixmap.
>>> +              */
>>> +             set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
>>> +                     *pgd_offset_k(FIXADDR_START));
>>> +     } else if (CONFIG_PGTABLE_LEVELS > 3) {
>>> +             /*
>>> +              * The fixmap shares its top level pgd entry with the kernel
>>> +              * mapping. This can really only occur when we are running
>>> +              * with 16k/4 levels, so we can simply reuse the pud level
>>> +              * entry instead.
>>> +              */
>>> +             BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
>>> +
>>> +             set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
>>> +                     __pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
>>> +             pud_clear_fixmap();
>>> +     } else {
>>> +             BUG();
>>> +     }
>>>
>>>       kasan_copy_shadow(pgd);
>>>  }
>>> @@ -569,10 +590,6 @@ void vmemmap_free(unsigned long start, unsigned long end)
>>>  }
>>>  #endif       /* CONFIG_SPARSEMEM_VMEMMAP */
>>>
>>> -static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
>>> -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
>>> -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
>>> -
>>>  static inline pud_t * fixmap_pud(unsigned long addr)
>>>  {
>>>       return (CONFIG_PGTABLE_LEVELS > 3) ? &bm_pud[pud_index(addr)]
>>> @@ -598,8 +615,19 @@ void __init early_fixmap_init(void)
>>>       unsigned long addr = FIXADDR_START;
>>>
>>>       pgd = pgd_offset_k(addr);
>>> -     pgd_populate(&init_mm, pgd, bm_pud);
>>> -     pud = fixmap_pud(addr);
>>> +     if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) {
>>> +             /*
>>> +              * We only end up here if the kernel mapping and the fixmap
>>> +              * share the top level pgd entry, which should only happen on
>>> +              * 16k/4 levels configurations.
>>> +              */
>>> +             BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
>>> +             pud = pud_offset_kimg(pgd, addr);
>>> +             memblock_free(__pa(bm_pud), sizeof(bm_pud));
>>> +     } else {
>>> +             pgd_populate(&init_mm, pgd, bm_pud);
>>> +             pud = fixmap_pud(addr);
>>> +     }
>>>       pud_populate(&init_mm, pud, bm_pmd);
>>>       pmd = fixmap_pmd(addr);
>>>       pmd_populate_kernel(&init_mm, pmd, bm_pte);
>>> --
>>> 2.5.0
>>>

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.