fix vmm(4) race resulting in #UD in VMM_IOC_CREATE paths
Syzkaller found another[1] juicy one! I love syzkaller. <3
There's a race between a caller in the VMM_IOC_CREATE path and another
in the VMM_IOC_TERM path. A window exists where one caller can be
initializing a vcpu, which requires things like VMX mode to be active
on the host cpu, while the other caller is tearing down the only
currently active vm, triggering the disabling of virtualization mode on
the host cpus.
On an Intel machine, this means the caller in the VMM_IOC_CREATE path
has its cpu interrupted via IPI and executes VMXOFF, leaving VMX mode.
When it then initializes the vcpu and issues a VMPTRLD instruction to
load the VMCS, a #UD fault occurs.
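The interleaving looks roughly like this:

  creator (VMM_IOC_CREATE)              terminator (VMM_IOC_TERM)
  ------------------------              -------------------------
  passes the vcpu capacity check
  (vm_ct/vcpu_ct not yet bumped)
                                        tears down the last vm;
                                        global vm count hits zero
                                        IPI broadcast: VMXOFF on all cpus
  vcpu_init(): VMPTRLD -> #UD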
The root cause is how vmm(4) uses a global count of vm's to decide
"there are no more vm's, so let's disable virtualization extensions"
and does the IPI broadcast dance to issue things like VMXOFF, etc. This
count is incremented too late in the creation path. Locks *are* used to
guard mutations of these counters, but this isn't a locking issue.
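For context, the teardown side amounts to the classic "last one out
turns off the lights" pattern, roughly like this (a distilled sketch,
not the literal vmm(4) code; the helper name is hypothetical):

	rw_enter_write(&vmm_softc->vm_lock);
	if (--vmm_softc->vm_ct == 0)
		disable_virt_on_all_cpus();	/* IPI broadcast; VMXOFF etc. */
	rw_exit_write(&vmm_softc->vm_lock);

Since the creation path only bumps the count after its vcpus are
initialized, an in-flight creator is invisible to this check.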
Reproduction is trivial with a strategic sleep and a small program
(sketched after the steps below):
1. Insert a tsleep_nsec(9) call for 2-3 seconds in vmm.c's vm_create()
right before vcpu_init() is called. Build/install/boot kernel.
2. In a test program, with vmd(8) not running and no vm's active:
a. Create a vm via VMM_IOC_CREATE.
b. Fork the program or create a thread.
c. Have one thread issue a 2nd vm creation while the other issues
a VMM_IOC_TERM for the first vm.
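For reference, the small program can look like this (a hedged sketch:
VMM_IOC_CREATE/VMM_IOC_TERM and the vcp_*/vmr_* names match the diff,
but the vm_terminate_params member name and the memory range setup are
assumptions that may need adjusting):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmmvar.h>

#include <err.h>
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <unistd.h>

static int fd;

static void
init_params(struct vm_create_params *vcp, const char *name)
{
	memset(vcp, 0, sizeof(*vcp));
	strlcpy(vcp->vcp_name, name, sizeof(vcp->vcp_name));
	vcp->vcp_ncpus = 1;
	/* Single small range; exact gpa/va constraints are an assumption. */
	vcp->vcp_nmemranges = 1;
	vcp->vcp_memranges[0].vmr_gpa = 0;
	vcp->vcp_memranges[0].vmr_va = 0x10000000UL;
	vcp->vcp_memranges[0].vmr_size = 1024 * 1024;
}

static void *
create_second_vm(void *arg)
{
	struct vm_create_params vcp;

	init_params(&vcp, "race2");
	/* Blocks in the injected tsleep_nsec() before vcpu_init(). */
	if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
		warn("second VMM_IOC_CREATE");
	return (NULL);
}

int
main(void)
{
	struct vm_create_params vcp;
	struct vm_terminate_params vtp;
	pthread_t t;

	if ((fd = open("/dev/vmm", O_RDWR)) == -1)
		err(1, "open");

	/* a. Create the first (and only) vm. */
	init_params(&vcp, "race1");
	if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
		err(1, "VMM_IOC_CREATE");

	/* b./c. One thread starts a 2nd creation... */
	if (pthread_create(&t, NULL, create_second_vm, NULL))
		errx(1, "pthread_create");
	sleep(1);	/* let it reach the injected sleep */

	/* ...while we terminate the first vm, dropping the vm count to
	 * zero and triggering VMXOFF. (vtp_vm_id is an assumed name.) */
	memset(&vtp, 0, sizeof(vtp));
	vtp.vtp_vm_id = vcp.vcp_id;
	if (ioctl(fd, VMM_IOC_TERM, &vtp) == -1)
		err(1, "VMM_IOC_TERM");

	pthread_join(t, NULL);
	return (0);
}

Build with cc -pthread; with the kernel from step 1, the second
creation should hit the #UD and drop into ddb.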
The approach this diff takes is to pull the vcpu and vm count
incrementing up to the top of vm_create(). This effectively "reserves"
capacity early instead of just checking whether we're maxed out. A
consequence is that the error handling paths must decrement these
counters.
ok?
(side note: ddb wrongly decodes this as RDRAND as they share instruction
bytes, but VMPTRLD doesn't use a REX.W prefix...a different issue to fix
later.)
[1]: https://syzkaller.appspot.com/bug?extid=2d0f1b4cf10a3ff5cebb
diffstat refs/heads/master refs/heads/vm-terminate-race
M sys/dev/vmm/vmm.c | 33+ 35-
1 file changed, 33 insertions(+), 35 deletions(-)
diff refs/heads/master refs/heads/vm-terminate-race
commit - 83ba03cfbffe1c8f7a1de7e5cbd9b2a810c0eec0
commit + 62c01369fa1012e5538c8a67aa4117224ae11f58
blob - 38ab8473e8670ae871d850e3b28bc3d09cfefb22
blob + 6e123e1b197f20679dfa9fdedbf14cef93661d95
--- sys/dev/vmm/vmm.c
+++ sys/dev/vmm/vmm.c
@@ -355,13 +355,14 @@ vm_find_vcpu(struct vm *vm, uint32_t id)
int
vm_create(struct vm_create_params *vcp, struct proc *p)
{
- int i, ret;
+ int i, ret = EINVAL;
size_t memsize;
struct vm *vm;
struct vcpu *vcpu;
struct uvm_object *uao;
struct vm_mem_range *vmr;
unsigned int uvmflags = 0;
+ uint32_t vm_id = 0;
memsize = vm_create_check_mem_ranges(vcp);
if (memsize == 0)
@@ -371,20 +372,27 @@ vm_create(struct vm_create_params *vcp, struct proc *p
if (vcp->vcp_ncpus != 1)
return (EINVAL);
- /* Bail early if we're already at vcpu capacity. */
- rw_enter_read(&vmm_softc->vm_lock);
+ /*
+ * Increment global counts early to see if the capacity limits
+ * would be violated and prevent vmm(4) from disabling any
+ * virtualization extensions on the host while creating a vm.
+ */
+ rw_enter_write(&vmm_softc->vm_lock);
if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) {
DPRINTF("%s: maximum vcpus (%lu) reached\n", __func__,
vmm_softc->vcpu_max);
- rw_exit_read(&vmm_softc->vm_lock);
+ rw_exit_write(&vmm_softc->vm_lock);
return (ENOMEM);
}
- rw_exit_read(&vmm_softc->vm_lock);
+ vmm_softc->vcpu_ct += vcp->vcp_ncpus;
+ vm_id = vmm_softc->vm_ct++;
+ rw_exit_write(&vmm_softc->vm_lock);
/* Instantiate and configure the new vm. */
vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
/* Create the VM's identity. */
+ vm->vm_id = vm_id;
vm->vm_creator_pid = p->p_p->ps_pid;
strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
@@ -409,8 +417,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
if (uao == NULL) {
printf("%s: failed to initialize memory slot\n",
__func__);
- vm_teardown(&vm);
- return (ENOMEM);
+ ret = ENOMEM;
+ goto err;
}
/* Map the UVM aobj into the process. It owns this reference. */
@@ -419,8 +427,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
if (ret) {
printf("%s: uvm_map failed: %d\n", __func__, ret);
uao_detach(uao);
- vm_teardown(&vm);
- return (ENOMEM);
+ ret = ENOMEM;
+ goto err;
}
/* Make this mapping immutable so userland cannot change it. */
@@ -431,8 +439,7 @@ vm_create(struct vm_create_params *vcp, struct proc *p
ret);
uvm_unmap(&p->p_vmspace->vm_map, vmr->vmr_va,
vmr->vmr_va + vmr->vmr_size);
- vm_teardown(&vm);
- return (ret);
+ goto err;
}
uao_reference(uao); /* Take a reference for vmm. */
@@ -441,8 +448,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
if (vm_impl_init(vm, p)) {
printf("failed to init arch-specific features for vm %p\n", vm);
- vm_teardown(&vm);
- return (ENOMEM);
+ ret = ENOMEM;
+ goto err;
}
vm->vm_vcpu_ct = 0;
@@ -455,47 +462,38 @@ vm_create(struct vm_create_params *vcp, struct proc *p
vcpu->vc_parent = vm;
vcpu->vc_id = vm->vm_vcpu_ct;
vm->vm_vcpu_ct++;
+
if ((ret = vcpu_init(vcpu, vcp)) != 0) {
printf("failed to init vcpu %d for vm %p\n", i, vm);
pool_put(&vcpu_pool, vcpu);
- vm_teardown(&vm);
- return (ret);
+ goto err;
}
/* Publish vcpu to list, inheriting the reference. */
SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link);
}
- /* Attempt to register the vm now that it's configured. */
- rw_enter_write(&vmm_softc->vm_lock);
-
- if (vmm_softc->vcpu_ct + vm->vm_vcpu_ct > vmm_softc->vcpu_max) {
- /* Someone already took our capacity. */
- printf("%s: maximum vcpus (%lu) reached\n", __func__,
- vmm_softc->vcpu_max);
- rw_exit_write(&vmm_softc->vm_lock);
- vm_teardown(&vm);
- return (ENOMEM);
- }
-
- /* Update the global index and identify the vm. */
- vmm_softc->vm_idx++;
- vm->vm_id = vmm_softc->vm_idx;
+ /* Identify the vm. */
vcp->vcp_id = vm->vm_id;
/* Publish the vm into the list and update counts. */
- vm->vm_dying = 0;
+ rw_enter_write(&vmm_softc->vm_lock);
refcnt_init(&vm->vm_refcnt);
SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link);
- vmm_softc->vm_ct++;
- vmm_softc->vcpu_ct += vm->vm_vcpu_ct;
+ rw_exit_write(&vmm_softc->vm_lock);
/* Update the userland process's view of guest memory. */
memcpy(vcp->vcp_memranges, vm->vm_memranges,
vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0]));
- rw_exit_write(&vmm_softc->vm_lock);
-
return (0);
+
+err:
+ vm_teardown(&vm);
+ rw_enter_write(&vmm_softc->vm_lock);
+ vmm_softc->vm_ct--;
+ vmm_softc->vcpu_ct -= vcp->vcp_ncpus;
+ rw_exit_write(&vmm_softc->vm_lock);
+ return (ret);
}
/*