From: Dave Voutila Subject: fix vmm(4) race resulting in #UD in VMM_IOC_CREATE paths To: tech@openbsd.org Cc: mlarkin@openbsd.org Date: Thu, 18 Dec 2025 10:45:57 -0500 Syzkaller found another[1] juicy one! I love syzkaller. <3 There's a race between callers in the VMM_IOC_CREATE and VMM_IOC_TERM paths. A window exists where one caller can be initializing a vcpu, requiring things like VMX mode to be active on the host cpu, while another caller is tearing down the only currently active vm, triggering a disabling of virtualization mode on host cpus. On an intel machine, this means the caller in the VMM_IOC_CREATE path has the current cpu interrupted via ipi and it executes VMXOFF, leaving VMX mode. When initializing the vcpu and issuing a VMPTRLD instruction to load the VMCS, a #UD occurs. The root cause is how vmm(4) uses a global count of vm's to decide "there are no more vm's, so let's disable virtualization extensions" and does the IPI broadcast dance to issue things like VMXOFF, etc. This value is incremented too late in the creation path. Locks *are* used to guard mutating these counters together, but this isn't a locking issue. Reproduction is trivial with a strategic sleep and a small program: 1. Insert a tsleep_nsec(9) call for 2-3 seconds in vmm.c's vm_create() right before vcpu_init() is called. Build/install/boot kernel. 2. In a test program and without vmd(8) running / no vm's: a. Create a vm via VMM_IOC_CREATE. b. Fork the program or create a thread. c. Have one thread issue a 2nd vm creation while the other issues a VMM_IOC_TERM for the first vm. The approach this diff takes is to pull the vcpu and vm count incrementing up to the top of vm_create(). This effectively "reserves" capacity early instead of just checking if we're maxed out. A consequence is the error handling paths need to decrement these counters. ok? 
(side note: ddb wrongly decodes this as RDRAND as they share instruction bytes, but VMPTRLD doesn't use a REX.W prefix...a different issue to fix later.) [1]: https://syzkaller.appspot.com/bug?extid=2d0f1b4cf10a3ff5cebb diffstat refs/heads/master refs/heads/vm-terminate-race M sys/dev/vmm/vmm.c | 33+ 35- 1 file changed, 33 insertions(+), 35 deletions(-) diff refs/heads/master refs/heads/vm-terminate-race commit - 83ba03cfbffe1c8f7a1de7e5cbd9b2a810c0eec0 commit + 62c01369fa1012e5538c8a67aa4117224ae11f58 blob - 38ab8473e8670ae871d850e3b28bc3d09cfefb22 blob + 6e123e1b197f20679dfa9fdedbf14cef93661d95 --- sys/dev/vmm/vmm.c +++ sys/dev/vmm/vmm.c @@ -355,13 +355,14 @@ vm_find_vcpu(struct vm *vm, uint32_t id) int vm_create(struct vm_create_params *vcp, struct proc *p) { - int i, ret; + int i, ret = EINVAL; size_t memsize; struct vm *vm; struct vcpu *vcpu; struct uvm_object *uao; struct vm_mem_range *vmr; unsigned int uvmflags = 0; + uint32_t vm_id = 0; memsize = vm_create_check_mem_ranges(vcp); if (memsize == 0) @@ -371,20 +372,27 @@ vm_create(struct vm_create_params *vcp, struct proc *p if (vcp->vcp_ncpus != 1) return (EINVAL); - /* Bail early if we're already at vcpu capacity. */ - rw_enter_read(&vmm_softc->vm_lock); + /* + * Increment global counts early to see if the capacity limits + * would be violated and prevent vmm(4) from disabling any + * virtualization extensions on the host while creating a vm. + */ + rw_enter_write(&vmm_softc->vm_lock); if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) { DPRINTF("%s: maximum vcpus (%lu) reached\n", __func__, vmm_softc->vcpu_max); - rw_exit_read(&vmm_softc->vm_lock); + rw_exit_write(&vmm_softc->vm_lock); return (ENOMEM); } - rw_exit_read(&vmm_softc->vm_lock); + vmm_softc->vcpu_ct += vcp->vcp_ncpus; + vm_id = vmm_softc->vm_ct++; + rw_exit_write(&vmm_softc->vm_lock); /* Instantiate and configure the new vm. */ vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO); /* Create the VM's identity. 
*/ + vm->vm_id = vm_id; vm->vm_creator_pid = p->p_p->ps_pid; strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1); @@ -409,8 +417,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p if (uao == NULL) { printf("%s: failed to initialize memory slot\n", __func__); - vm_teardown(&vm); - return (ENOMEM); + ret = ENOMEM; + goto err; } /* Map the UVM aobj into the process. It owns this reference. */ @@ -419,8 +427,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p if (ret) { printf("%s: uvm_map failed: %d\n", __func__, ret); uao_detach(uao); - vm_teardown(&vm); - return (ENOMEM); + ret = ENOMEM; + goto err; } /* Make this mapping immutable so userland cannot change it. */ @@ -431,8 +439,7 @@ vm_create(struct vm_create_params *vcp, struct proc *p ret); uvm_unmap(&p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_va + vmr->vmr_size); - vm_teardown(&vm); - return (ret); + goto err; } uao_reference(uao); /* Take a reference for vmm. */ @@ -441,8 +448,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p if (vm_impl_init(vm, p)) { printf("failed to init arch-specific features for vm %p\n", vm); - vm_teardown(&vm); - return (ENOMEM); + ret = ENOMEM; + goto err; } vm->vm_vcpu_ct = 0; @@ -455,47 +462,38 @@ vm_create(struct vm_create_params *vcp, struct proc *p vcpu->vc_parent = vm; vcpu->vc_id = vm->vm_vcpu_ct; vm->vm_vcpu_ct++; + if ((ret = vcpu_init(vcpu, vcp)) != 0) { printf("failed to init vcpu %d for vm %p\n", i, vm); pool_put(&vcpu_pool, vcpu); - vm_teardown(&vm); - return (ret); + goto err; } /* Publish vcpu to list, inheriting the reference. */ SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link); } - /* Attempt to register the vm now that it's configured. */ - rw_enter_write(&vmm_softc->vm_lock); - - if (vmm_softc->vcpu_ct + vm->vm_vcpu_ct > vmm_softc->vcpu_max) { - /* Someone already took our capacity. 
*/ - printf("%s: maximum vcpus (%lu) reached\n", __func__, - vmm_softc->vcpu_max); - rw_exit_write(&vmm_softc->vm_lock); - vm_teardown(&vm); - return (ENOMEM); - } - - /* Update the global index and identify the vm. */ - vmm_softc->vm_idx++; - vm->vm_id = vmm_softc->vm_idx; + /* Identify the vm. */ vcp->vcp_id = vm->vm_id; /* Publish the vm into the list and update counts. */ - vm->vm_dying = 0; + rw_enter_write(&vmm_softc->vm_lock); refcnt_init(&vm->vm_refcnt); SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link); - vmm_softc->vm_ct++; - vmm_softc->vcpu_ct += vm->vm_vcpu_ct; + rw_exit_write(&vmm_softc->vm_lock); /* Update the userland process's view of guest memory. */ memcpy(vcp->vcp_memranges, vm->vm_memranges, vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0])); - rw_exit_write(&vmm_softc->vm_lock); - return (0); + +err: + vm_teardown(&vm); + rw_enter_write(&vmm_softc->vm_lock); + vmm_softc->vm_ct--; + vmm_softc->vcpu_ct -= vcp->vcp_ncpus; + rw_exit_write(&vmm_softc->vm_lock); + return (ret); } /*