Download raw body.
fix vmm(4) race resulting in #UD in VMM_IOC_CREATE paths
On Thu, Dec 18, 2025 at 10:45:57AM -0500, Dave Voutila wrote:
> Syzkaller found another[1] juicy one! I love syzkaller. <3
>
> There's a race between a caller in the VMM_IOC_CREATE and VMM_IOC_TERM
> paths. A window exists where one caller can be initializing a vcpu,
> requiring things like VMX mode to be active on the host cpu, while
> another caller is tearing down the only currently active vm, triggering a
> disabling of virtualization mode on host cpus.
>
> On an intel machine, this means the caller in the VMM_IOC_CREATE path
> has the current cpu interrupted via ipi and it executes VMXOFF, leaving
> VMX mode. When initializing the vcpu and issuing a VMPTRLD instruction
> to load the VMCS, a #UD occurs.
>
> The root cause is how vmm(4) uses a global count of vm's to decide
> "there are no more vm's, so let's disable virtualization extensions" and
> does the IPI broadcast dance to issue things like VMXOFF, etc. This
> value is incremented too late in the creation path. Locks *are* used to
> guard mutating these counters together, but this isn't a locking issue.
>
> Reproduction is trivial with a strategic sleep and a small program:
>
> 1. Insert a tsleep_nsec(9) call for 2-3 seconds in vmm.c's vm_create()
> right before vcpu_init() is called. Build/install/boot kernel.
>
> 2. In a test program and without vmd(8) running / no vm's:
> a. Create a vm via VMM_IOC_CREATE.
> b. Fork the program or create a thread.
> c. Have one thread issue a 2nd vm creation while the other issues
> a VMM_IOC_TERM for the first vm.
>
> The approach this diff takes is to pull the vcpu and vm count
> incrementing up to the top of vm_create(). This effectively "reserves"
> capacity early instead of just checking if we're maxed out. A
> consequence is the error handling paths need to decrement these
> counters.
>
> ok?
>
> (side note: ddb wrongly decodes this as RDRAND as they share instruction
> bytes, but VMPTRLD doesn't use a REX.W prefix...a different issue to fix
> later.)
>
> [1]: https://syzkaller.appspot.com/bug?extid=2d0f1b4cf10a3ff5cebb
>
ok mlarkin, thanks!
>
> diffstat refs/heads/master refs/heads/vm-terminate-race
> M sys/dev/vmm/vmm.c | 33+ 35-
>
> 1 file changed, 33 insertions(+), 35 deletions(-)
>
> diff refs/heads/master refs/heads/vm-terminate-race
> commit - 83ba03cfbffe1c8f7a1de7e5cbd9b2a810c0eec0
> commit + 62c01369fa1012e5538c8a67aa4117224ae11f58
> blob - 38ab8473e8670ae871d850e3b28bc3d09cfefb22
> blob + 6e123e1b197f20679dfa9fdedbf14cef93661d95
> --- sys/dev/vmm/vmm.c
> +++ sys/dev/vmm/vmm.c
> @@ -355,13 +355,14 @@ vm_find_vcpu(struct vm *vm, uint32_t id)
> int
> vm_create(struct vm_create_params *vcp, struct proc *p)
> {
> - int i, ret;
> + int i, ret = EINVAL;
> size_t memsize;
> struct vm *vm;
> struct vcpu *vcpu;
> struct uvm_object *uao;
> struct vm_mem_range *vmr;
> unsigned int uvmflags = 0;
> + uint32_t vm_id = 0;
>
> memsize = vm_create_check_mem_ranges(vcp);
> if (memsize == 0)
> @@ -371,20 +372,27 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> if (vcp->vcp_ncpus != 1)
> return (EINVAL);
>
> - /* Bail early if we're already at vcpu capacity. */
> - rw_enter_read(&vmm_softc->vm_lock);
> + /*
> + * Increment global counts early to see if the capacity limits
> + * would be violated and prevent vmm(4) from disabling any
> + * virtualization extensions on the host while creating a vm.
> + */
> + rw_enter_write(&vmm_softc->vm_lock);
> if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) {
> DPRINTF("%s: maximum vcpus (%lu) reached\n", __func__,
> vmm_softc->vcpu_max);
> - rw_exit_read(&vmm_softc->vm_lock);
> + rw_exit_write(&vmm_softc->vm_lock);
> return (ENOMEM);
> }
> - rw_exit_read(&vmm_softc->vm_lock);
> + vmm_softc->vcpu_ct += vcp->vcp_ncpus;
> + vm_id = vmm_softc->vm_ct++;
> + rw_exit_write(&vmm_softc->vm_lock);
>
> /* Instantiate and configure the new vm. */
> vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
>
> /* Create the VM's identity. */
> + vm->vm_id = vm_id;
> vm->vm_creator_pid = p->p_p->ps_pid;
> strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
>
> @@ -409,8 +417,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> if (uao == NULL) {
> printf("%s: failed to initialize memory slot\n",
> __func__);
> - vm_teardown(&vm);
> - return (ENOMEM);
> + ret = ENOMEM;
> + goto err;
> }
>
> /* Map the UVM aobj into the process. It owns this reference. */
> @@ -419,8 +427,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> if (ret) {
> printf("%s: uvm_map failed: %d\n", __func__, ret);
> uao_detach(uao);
> - vm_teardown(&vm);
> - return (ENOMEM);
> + ret = ENOMEM;
> + goto err;
> }
>
> /* Make this mapping immutable so userland cannot change it. */
> @@ -431,8 +439,7 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> ret);
> uvm_unmap(&p->p_vmspace->vm_map, vmr->vmr_va,
> vmr->vmr_va + vmr->vmr_size);
> - vm_teardown(&vm);
> - return (ret);
> + goto err;
> }
>
> uao_reference(uao); /* Take a reference for vmm. */
> @@ -441,8 +448,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>
> if (vm_impl_init(vm, p)) {
> printf("failed to init arch-specific features for vm %p\n", vm);
> - vm_teardown(&vm);
> - return (ENOMEM);
> + ret = ENOMEM;
> + goto err;
> }
>
> vm->vm_vcpu_ct = 0;
> @@ -455,47 +462,38 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> vcpu->vc_parent = vm;
> vcpu->vc_id = vm->vm_vcpu_ct;
> vm->vm_vcpu_ct++;
> +
> if ((ret = vcpu_init(vcpu, vcp)) != 0) {
> printf("failed to init vcpu %d for vm %p\n", i, vm);
> pool_put(&vcpu_pool, vcpu);
> - vm_teardown(&vm);
> - return (ret);
> + goto err;
> }
> /* Publish vcpu to list, inheriting the reference. */
> SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link);
> }
>
> - /* Attempt to register the vm now that it's configured. */
> - rw_enter_write(&vmm_softc->vm_lock);
> -
> - if (vmm_softc->vcpu_ct + vm->vm_vcpu_ct > vmm_softc->vcpu_max) {
> - /* Someone already took our capacity. */
> - printf("%s: maximum vcpus (%lu) reached\n", __func__,
> - vmm_softc->vcpu_max);
> - rw_exit_write(&vmm_softc->vm_lock);
> - vm_teardown(&vm);
> - return (ENOMEM);
> - }
> -
> - /* Update the global index and identify the vm. */
> - vmm_softc->vm_idx++;
> - vm->vm_id = vmm_softc->vm_idx;
> + /* Identify the vm. */
> vcp->vcp_id = vm->vm_id;
>
> /* Publish the vm into the list and update counts. */
> - vm->vm_dying = 0;
> + rw_enter_write(&vmm_softc->vm_lock);
> refcnt_init(&vm->vm_refcnt);
> SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link);
> - vmm_softc->vm_ct++;
> - vmm_softc->vcpu_ct += vm->vm_vcpu_ct;
> + rw_exit_write(&vmm_softc->vm_lock);
>
> /* Update the userland process's view of guest memory. */
> memcpy(vcp->vcp_memranges, vm->vm_memranges,
> vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0]));
>
> - rw_exit_write(&vmm_softc->vm_lock);
> -
> return (0);
> +
> +err:
> + vm_teardown(&vm);
> + rw_enter_write(&vmm_softc->vm_lock);
> + vmm_softc->vm_idx--;
> + vmm_softc->vcpu_ct -= vcp->vcp_ncpus;
> + rw_exit_write(&vmm_softc->vm_lock);
> + return (ret);
> }
>
> /*
fix vmm(4) race resulting in #UD in VMM_IOC_CREATE paths