From: Mike Larkin Subject: Re: fix vmm(4) race resulting in #UD in VMM_IOC_CREATE paths To: Dave Voutila Cc: tech@openbsd.org, mlarkin@openbsd.org Date: Thu, 18 Dec 2025 08:03:11 -0800 On Thu, Dec 18, 2025 at 10:45:57AM -0500, Dave Voutila wrote: > Syzkaller found another[1] juicy one! I love syzkaller. <3 > > There's a race between a caller in the VMM_IOC_CREATE and VMM_IOC_TERM > paths. A window exists where one caller can be initializing a vcpu, > requiring things like VMX mode to be active on the host cpu, while > another caller is tearing down the only currently active vm triggering a > disabling of virtualization mode on host cpus. > > On an intel machine, this means the caller in the VMM_IOC_CREATE path > has the current cpu interrupted via ipi and it executes VMXOFF, leaving > VMX mode. When initializing the vcpu and issuing a VMPTRLD instruction > to load the VMCS, a #UD occurs. > > The root cause is how vmm(4) uses a global count of vm's to decide > "there are no more vm's, so let's disable virtualization extensions" and > does the IPI broadcast dance to issue things like VMXOFF, etc. This > value is incremented too late in the creation path. Locks *are* used to > guard mutating these counters together, but this isn't a locking issue. > > Reproduction is trivial with a strategic sleep and a small program: > > 1. Insert a tsleep_nsec(9) call for 2-3 seconds in vmm.c's vm_create() > right before vcpu_init() is called. Build/install/boot kernel. > > 2. In a test program and without vmd(8) running / no vm's: > a. Create a vm via VMM_IOC_CREATE. > b. Fork the program or create a thread. > c. Have one thread issue a 2nd vm creation while the other issues > a VMM_IOC_TERM for the first vm. > > The approach this diff takes is to pull the vcpu and vm count > incrementing up to the top of vm_create(). This effectively "reserves" > capacity early instead of just checking if we're maxxed out. A > consequence is the error handling paths need to decrement these > counters. > > ok? > > (side note: ddb wrongly decodes this as RDRAND as they share instruction > bytes, but VMPTRLD does't use a REX.W prefix...a different issue to fix > later.) > > [1]: https://syzkaller.appspot.com/bug?extid=2d0f1b4cf10a3ff5cebb > ok mlarkin, thanks! > > diffstat refs/heads/master refs/heads/vm-terminate-race > M sys/dev/vmm/vmm.c | 33+ 35- > > 1 file changed, 33 insertions(+), 35 deletions(-) > > diff refs/heads/master refs/heads/vm-terminate-race > commit - 83ba03cfbffe1c8f7a1de7e5cbd9b2a810c0eec0 > commit + 62c01369fa1012e5538c8a67aa4117224ae11f58 > blob - 38ab8473e8670ae871d850e3b28bc3d09cfefb22 > blob + 6e123e1b197f20679dfa9fdedbf14cef93661d95 > --- sys/dev/vmm/vmm.c > +++ sys/dev/vmm/vmm.c > @@ -355,13 +355,14 @@ vm_find_vcpu(struct vm *vm, uint32_t id) > int > vm_create(struct vm_create_params *vcp, struct proc *p) > { > - int i, ret; > + int i, ret = EINVAL; > size_t memsize; > struct vm *vm; > struct vcpu *vcpu; > struct uvm_object *uao; > struct vm_mem_range *vmr; > unsigned int uvmflags = 0; > + uint32_t vm_id = 0; > > memsize = vm_create_check_mem_ranges(vcp); > if (memsize == 0) > @@ -371,20 +372,27 @@ vm_create(struct vm_create_params *vcp, struct proc *p > if (vcp->vcp_ncpus != 1) > return (EINVAL); > > - /* Bail early if we're already at vcpu capacity. */ > - rw_enter_read(&vmm_softc->vm_lock); > + /* > + * Increment global counts early to see if the capacity limits > + * would be violated and prevent vmm(4) from disabling any > + * virtualization extensions on the host while creating a vm. > + */ > + rw_enter_write(&vmm_softc->vm_lock); > if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) { > DPRINTF("%s: maximum vcpus (%lu) reached\n", __func__, > vmm_softc->vcpu_max); > - rw_exit_read(&vmm_softc->vm_lock); > + rw_exit_write(&vmm_softc->vm_lock); > return (ENOMEM); > } > - rw_exit_read(&vmm_softc->vm_lock); > + vmm_softc->vcpu_ct += vcp->vcp_ncpus; > + vm_id = vmm_softc->vm_ct++; > + rw_exit_write(&vmm_softc->vm_lock); > > /* Instantiate and configure the new vm. */ > vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO); > > /* Create the VM's identity. */ > + vm->vm_id = vm_id; > vm->vm_creator_pid = p->p_p->ps_pid; > strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1); > > @@ -409,8 +417,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p > if (uao == NULL) { > printf("%s: failed to initialize memory slot\n", > __func__); > - vm_teardown(&vm); > - return (ENOMEM); > + ret = ENOMEM; > + goto err; > } > > /* Map the UVM aobj into the process. It owns this reference. */ > @@ -419,8 +427,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p > if (ret) { > printf("%s: uvm_map failed: %d\n", __func__, ret); > uao_detach(uao); > - vm_teardown(&vm); > - return (ENOMEM); > + ret = ENOMEM; > + goto err; > } > > /* Make this mapping immutable so userland cannot change it. */ > @@ -431,8 +439,7 @@ vm_create(struct vm_create_params *vcp, struct proc *p > ret); > uvm_unmap(&p->p_vmspace->vm_map, vmr->vmr_va, > vmr->vmr_va + vmr->vmr_size); > - vm_teardown(&vm); > - return (ret); > + goto err; > } > > uao_reference(uao); /* Take a reference for vmm. */ > @@ -441,8 +448,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p > > if (vm_impl_init(vm, p)) { > printf("failed to init arch-specific features for vm %p\n", vm); > - vm_teardown(&vm); > - return (ENOMEM); > + ret = ENOMEM; > + goto err; > } > > vm->vm_vcpu_ct = 0; > @@ -455,47 +462,38 @@ vm_create(struct vm_create_params *vcp, struct proc *p > vcpu->vc_parent = vm; > vcpu->vc_id = vm->vm_vcpu_ct; > vm->vm_vcpu_ct++; > + > if ((ret = vcpu_init(vcpu, vcp)) != 0) { > printf("failed to init vcpu %d for vm %p\n", i, vm); > pool_put(&vcpu_pool, vcpu); > - vm_teardown(&vm); > - return (ret); > + goto err; > } > /* Publish vcpu to list, inheriting the reference. */ > SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link); > } > > - /* Attempt to register the vm now that it's configured. */ > - rw_enter_write(&vmm_softc->vm_lock); > - > - if (vmm_softc->vcpu_ct + vm->vm_vcpu_ct > vmm_softc->vcpu_max) { > - /* Someone already took our capacity. */ > - printf("%s: maximum vcpus (%lu) reached\n", __func__, > - vmm_softc->vcpu_max); > - rw_exit_write(&vmm_softc->vm_lock); > - vm_teardown(&vm); > - return (ENOMEM); > - } > - > - /* Update the global index and identify the vm. */ > - vmm_softc->vm_idx++; > - vm->vm_id = vmm_softc->vm_idx; > + /* Identify the vm. */ > vcp->vcp_id = vm->vm_id; > > /* Publish the vm into the list and update counts. */ > - vm->vm_dying = 0; > + rw_enter_write(&vmm_softc->vm_lock); > refcnt_init(&vm->vm_refcnt); > SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link); > - vmm_softc->vm_ct++; > - vmm_softc->vcpu_ct += vm->vm_vcpu_ct; > + rw_exit_write(&vmm_softc->vm_lock); > > /* Update the userland process's view of guest memory. */ > memcpy(vcp->vcp_memranges, vm->vm_memranges, > vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0])); > > - rw_exit_write(&vmm_softc->vm_lock); > - > return (0); > + > +err: > + vm_teardown(&vm); > + rw_enter_write(&vmm_softc->vm_lock); > + vmm_softc->vm_idx--; > + vmm_softc->vcpu_ct -= vcp->vcp_ncpus; > + rw_exit_write(&vmm_softc->vm_lock); > + return (ret); > } > > /*