From: Mike Larkin <mlarkin@nested.page>
Subject: Re: fix vmm(4) race resulting in #UD in VMM_IOC_CREATE paths
To: Dave Voutila <dv@sisu.io>
Cc: tech@openbsd.org, mlarkin@openbsd.org
Date: Thu, 18 Dec 2025 08:03:11 -0800

On Thu, Dec 18, 2025 at 10:45:57AM -0500, Dave Voutila wrote:
> Syzkaller found another[1] juicy one! I love syzkaller. <3
>
> There's a race between callers in the VMM_IOC_CREATE and VMM_IOC_TERM
> paths. A window exists where one caller can be initializing a vcpu,
> requiring things like VMX mode to be active on the host cpu, while
> another caller is tearing down the only currently active vm, which
> triggers disabling virtualization mode on the host cpus.
>
> On an Intel machine, this means the cpu running the VMM_IOC_CREATE
> caller is interrupted via IPI and executes VMXOFF, leaving VMX mode.
> When the creating thread then initializes the vcpu and issues a
> VMPTRLD instruction to load the VMCS, a #UD occurs.
>
> The root cause is how vmm(4) uses a global count of vm's to decide
> "there are no more vm's, so let's disable virtualization extensions" and
> does the IPI broadcast dance to issue things like VMXOFF, etc. This
> value is incremented too late in the creation path. Locks *are* used
> to guard mutation of these counters, but this isn't a locking issue.
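>
> To make the window concrete, here's a rough sketch of the interleaving
> with the current ordering (simplified; names as in vmm.c and the diff
> below):
>
> 	/* thread A: VMM_IOC_CREATE (2nd vm)    thread B: VMM_IOC_TERM (1st vm) */
> 	vm_create()
> 	  checks vcpu_ct against vcpu_max (read lock), ok
> 	  allocates and configures the vm
> 	                                        terminate/teardown of 1st vm
> 	                                          vm_ct drops to 0
> 	                                          "no more vm's": IPI broadcast,
> 	                                          each cpu executes VMXOFF
> 	  vcpu_init()
> 	    VMPTRLD with VMX now disabled -> #UD
> 	  (only here would vm_ct/vcpu_ct have been bumped)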
>
> Reproduction is trivial with a strategic sleep and a small program:
>
>   1. Insert a tsleep_nsec(9) call for 2-3 seconds in vmm.c's vm_create()
>      right before vcpu_init() is called. Build/install/boot kernel.
>
>   2. In a test program, with vmd(8) not running and no existing
>      vm's (a rough sketch of the program follows the steps):
>      a. Create a vm via VMM_IOC_CREATE.
>      b. Fork the program or create a thread.
>      c. Have one thread issue a 2nd vm creation while the other issues
>         a VMM_IOC_TERM for the first vm.
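>
> A rough, untested sketch of such a program (struct and field names for
> the terminate ioctl are from memory, so check vmmvar.h; the memory-range
> setup is elided and needs at least one valid range to get past
> vm_create_check_mem_ranges()):
>
> 	#include <sys/types.h>
> 	#include <sys/ioctl.h>
> 	#include <machine/vmmvar.h>
> 	#include <err.h>
> 	#include <fcntl.h>
> 	#include <pthread.h>
> 	#include <string.h>
> 	#include <unistd.h>
>
> 	static int vmm_fd;
>
> 	/* Second creation; this one parks in the strategic tsleep_nsec(9). */
> 	static void *
> 	create_second_vm(void *arg)
> 	{
> 		struct vm_create_params vcp;
>
> 		memset(&vcp, 0, sizeof(vcp));
> 		strlcpy(vcp.vcp_name, "race-b", sizeof(vcp.vcp_name));
> 		vcp.vcp_ncpus = 1;
> 		/* ...fill in vcp_nmemranges/vcp_memranges here... */
> 		if (ioctl(vmm_fd, VMM_IOC_CREATE, &vcp) == -1)
> 			warn("second VMM_IOC_CREATE");
> 		return (NULL);
> 	}
>
> 	int
> 	main(void)
> 	{
> 		struct vm_create_params vcp;
> 		struct vm_terminate_params vtp;
> 		pthread_t t;
>
> 		if ((vmm_fd = open("/dev/vmm", O_RDWR)) == -1)
> 			err(1, "open");
>
> 		/* First vm: the only one, so terminating it pulls VMX/SVM down. */
> 		memset(&vcp, 0, sizeof(vcp));
> 		strlcpy(vcp.vcp_name, "race-a", sizeof(vcp.vcp_name));
> 		vcp.vcp_ncpus = 1;
> 		/* ...fill in vcp_nmemranges/vcp_memranges here... */
> 		if (ioctl(vmm_fd, VMM_IOC_CREATE, &vcp) == -1)
> 			err(1, "VMM_IOC_CREATE");
>
> 		pthread_create(&t, NULL, create_second_vm, NULL);
> 		sleep(1);	/* let the second create reach the tsleep */
>
> 		/* Tear down the only vm while the other thread is parked. */
> 		memset(&vtp, 0, sizeof(vtp));
> 		vtp.vtp_vm_id = vcp.vcp_id;
> 		if (ioctl(vmm_fd, VMM_IOC_TERM, &vtp) == -1)
> 			err(1, "VMM_IOC_TERM");
>
> 		pthread_join(t, NULL);
> 		return (0);
> 	}
>
> (Build with -lpthread and run it with access to /dev/vmm on the
> patched kernel from step 1.)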
>
> The approach this diff takes is to pull the vcpu and vm count
> incrementing up to the top of vm_create(). This effectively "reserves"
> capacity early instead of just checking if we're maxed out. A
> consequence is that the error-handling paths need to decrement these
> counters.
>
> ok?
>
> (side note: ddb wrongly decodes this as RDRAND since they share
> instruction bytes, but VMPTRLD doesn't use a REX.W prefix...a
> different issue to fix later.)
>
> [1]: https://syzkaller.appspot.com/bug?extid=2d0f1b4cf10a3ff5cebb
>

ok mlarkin, thanks!

>
> diffstat refs/heads/master refs/heads/vm-terminate-race
>  M  sys/dev/vmm/vmm.c  |  33+  35-
>
> 1 file changed, 33 insertions(+), 35 deletions(-)
>
> diff refs/heads/master refs/heads/vm-terminate-race
> commit - 83ba03cfbffe1c8f7a1de7e5cbd9b2a810c0eec0
> commit + 62c01369fa1012e5538c8a67aa4117224ae11f58
> blob - 38ab8473e8670ae871d850e3b28bc3d09cfefb22
> blob + 6e123e1b197f20679dfa9fdedbf14cef93661d95
> --- sys/dev/vmm/vmm.c
> +++ sys/dev/vmm/vmm.c
> @@ -355,13 +355,14 @@ vm_find_vcpu(struct vm *vm, uint32_t id)
>  int
>  vm_create(struct vm_create_params *vcp, struct proc *p)
>  {
> -	int i, ret;
> +	int i, ret = EINVAL;
>  	size_t memsize;
>  	struct vm *vm;
>  	struct vcpu *vcpu;
>  	struct uvm_object *uao;
>  	struct vm_mem_range *vmr;
>  	unsigned int uvmflags = 0;
> +	uint32_t vm_id = 0;
>
>  	memsize = vm_create_check_mem_ranges(vcp);
>  	if (memsize == 0)
> @@ -371,20 +372,27 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  	if (vcp->vcp_ncpus != 1)
>  		return (EINVAL);
>
> -	/* Bail early if we're already at vcpu capacity. */
> -	rw_enter_read(&vmm_softc->vm_lock);
> +	/*
> +	 * Increment global counts early to see if the capacity limits
> +	 * would be violated and prevent vmm(4) from disabling any
> +	 * virtualization extensions on the host while creating a vm.
> +	 */
> +	rw_enter_write(&vmm_softc->vm_lock);
>  	if (vmm_softc->vcpu_ct + vcp->vcp_ncpus > vmm_softc->vcpu_max) {
>  		DPRINTF("%s: maximum vcpus (%lu) reached\n", __func__,
>  		    vmm_softc->vcpu_max);
> -		rw_exit_read(&vmm_softc->vm_lock);
> +		rw_exit_write(&vmm_softc->vm_lock);
>  		return (ENOMEM);
>  	}
> -	rw_exit_read(&vmm_softc->vm_lock);
> +	vmm_softc->vcpu_ct += vcp->vcp_ncpus;
> +	vm_id = vmm_softc->vm_ct++;
> +	rw_exit_write(&vmm_softc->vm_lock);
>
>  	/* Instantiate and configure the new vm. */
>  	vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
>
>  	/* Create the VM's identity. */
> +	vm->vm_id = vm_id;
>  	vm->vm_creator_pid = p->p_p->ps_pid;
>  	strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
>
> @@ -409,8 +417,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  		if (uao == NULL) {
>  			printf("%s: failed to initialize memory slot\n",
>  			    __func__);
> -			vm_teardown(&vm);
> -			return (ENOMEM);
> +			ret = ENOMEM;
> +			goto err;
>  		}
>
>  		/* Map the UVM aobj into the process. It owns this reference. */
> @@ -419,8 +427,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  		if (ret) {
>  			printf("%s: uvm_map failed: %d\n", __func__, ret);
>  			uao_detach(uao);
> -			vm_teardown(&vm);
> -			return (ENOMEM);
> +			ret = ENOMEM;
> +			goto err;
>  		}
>
>  		/* Make this mapping immutable so userland cannot change it. */
> @@ -431,8 +439,7 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  			    ret);
>  			uvm_unmap(&p->p_vmspace->vm_map, vmr->vmr_va,
>  			    vmr->vmr_va + vmr->vmr_size);
> -			vm_teardown(&vm);
> -			return (ret);
> +			goto err;
>  		}
>
>  		uao_reference(uao);	/* Take a reference for vmm. */
> @@ -441,8 +448,8 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>
>  	if (vm_impl_init(vm, p)) {
>  		printf("failed to init arch-specific features for vm %p\n", vm);
> -		vm_teardown(&vm);
> -		return (ENOMEM);
> +		ret = ENOMEM;
> +		goto err;
>  	}
>
>  	vm->vm_vcpu_ct = 0;
> @@ -455,47 +462,38 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  		vcpu->vc_parent = vm;
>  		vcpu->vc_id = vm->vm_vcpu_ct;
>  		vm->vm_vcpu_ct++;
> +
>  		if ((ret = vcpu_init(vcpu, vcp)) != 0) {
>  			printf("failed to init vcpu %d for vm %p\n", i, vm);
>  			pool_put(&vcpu_pool, vcpu);
> -			vm_teardown(&vm);
> -			return (ret);
> +			goto err;
>  		}
>  		/* Publish vcpu to list, inheriting the reference. */
>  		SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link);
>  	}
>
> -	/* Attempt to register the vm now that it's configured. */
> -	rw_enter_write(&vmm_softc->vm_lock);
> -
> -	if (vmm_softc->vcpu_ct + vm->vm_vcpu_ct > vmm_softc->vcpu_max) {
> -		/* Someone already took our capacity. */
> -		printf("%s: maximum vcpus (%lu) reached\n", __func__,
> -		    vmm_softc->vcpu_max);
> -		rw_exit_write(&vmm_softc->vm_lock);
> -		vm_teardown(&vm);
> -		return (ENOMEM);
> -	}
> -
> -	/* Update the global index and identify the vm. */
> -	vmm_softc->vm_idx++;
> -	vm->vm_id = vmm_softc->vm_idx;
> +	/* Identify the vm. */
>  	vcp->vcp_id = vm->vm_id;
>
>  	/* Publish the vm into the list and update counts. */
> -	vm->vm_dying = 0;
> +	rw_enter_write(&vmm_softc->vm_lock);
>  	refcnt_init(&vm->vm_refcnt);
>  	SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link);
> -	vmm_softc->vm_ct++;
> -	vmm_softc->vcpu_ct += vm->vm_vcpu_ct;
> +	rw_exit_write(&vmm_softc->vm_lock);
>
>  	/* Update the userland process's view of guest memory. */
>  	memcpy(vcp->vcp_memranges, vm->vm_memranges,
>  	    vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0]));
>
> -	rw_exit_write(&vmm_softc->vm_lock);
> -
>  	return (0);
> +
> +err:
> +	vm_teardown(&vm);
> +	rw_enter_write(&vmm_softc->vm_lock);
> +	vmm_softc->vm_ct--;
> +	vmm_softc->vcpu_ct -= vcp->vcp_ncpus;
> +	rw_exit_write(&vmm_softc->vm_lock);
> +	return (ret);
>  }
>
>  /*