From: Mike Larkin <mlarkin@nested.page>
Subject: Re: vmm/vmx: update cr3 and flush ept (invept) on cpu migration
To: Dave Voutila <dv@sisu.io>
Cc: OpenBSD Tech <tech@openbsd.org>, Philip Guenther <guenther@openbsd.org>
Date: Sat, 13 Jul 2024 13:46:56 -0700

On Sat, Jul 13, 2024 at 09:48:13AM -0400, Dave Voutila wrote:
> I'm going to try hard to jinx myself and say this is the white whale
> I've been hunting since Bruges two years ago. Regardless, I think this
> diff is something we want for correctness.
>
>  - We need to update the host CR3 in the VMCS to prevent VMX restoring
>    a stale CR3 if we migrate CPUs while in the vcpu run loop. (The most
>    likely reason this happens is if we sleep in uvm_fault(9).) We will
>    have a bullshit PCID if that happens.
>
>  - We also need to invalidate cached EPT mappings if we change CPUs.
>    (No need yet for a shootdown as we're not MP.) It's possible to
>    recycle the PDP page backing the EPT, so the CPU could have cached
>    EPT entries for a now-dead vm's vcpu.
>
> Since INVEPT has two main modes (single vs. global), this adds a check
> via MSR to see if we can just do a single-context flush (i.e. the given
> EPT). I liked the approach nvmm takes, so I borrowed the idea of saving
> the mode to the vcpu object. (Not all Intel CPUs support "single",
> hence the MSR check.)
>
> I've tested this enough that I'm confident it's correct. My confidence
> that this is my whale and not a corpulent fish will increase over the
> next 24 hours. (I don't use vmx_mprotect_ept, but I did update its use
> of invept to use the supported mode instead of assuming single
> context.)
>
> ok?
>

Nice! If you are happy with the results, the diff reads ok to me. Please
go ahead.
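
For anyone skimming the diff, the change boils down to roughly this (a
condensed sketch pulled together from the hunks below, not the literal
committed code):

	/* vcpu_init_vmx(): pick the INVEPT type this CPU advertises. */
	if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX;
	else
		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX;

	/*
	 * vcpu_run_vmx() loop, after noting the current cpu: flush any
	 * cached EPT mappings for this EPTP...
	 */
	vid_ept.vid_reserved = 0;
	vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
	if (invept(vcpu->vc_vmx_invept_op, &vid_ept))
		return (EINVAL);

	/*
	 * ...and rewrite the host CR3 so a migration can't leave a
	 * stale CR3 (and PCID) in the VMCS.
	 */
	cr3 = rcr3();
	if (vmwrite(VMCS_HOST_IA32_CR3, cr3))
		return (EINVAL);

vmx_mprotect_ept() now takes the vcpu too, so it flushes with the same
saved vc_vmx_invept_op instead of assuming single context.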

-ml

> diffstat ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please
>  M  sys/arch/amd64/amd64/vmm_machdep.c   |  27+  12-
>  M  sys/arch/amd64/include/specialreg.h  |   2+   0-
>  M  sys/arch/amd64/include/vmmvar.h      |   1+   0-
>
> 3 files changed, 30 insertions(+), 12 deletions(-)
>
> diff ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please
> commit - ba080721f3fe881c525a96bf32265471b142c251
> commit + f5b1b1d71e41e72923c85f5ff21e863de7b66cb6
> blob - f150569d9ea3e3cab4bbda818ac6f513d4a4014f
> blob + 3f0dfba5a5629fabe051325e6a3a67acbafba7c1
> --- sys/arch/amd64/amd64/vmm_machdep.c
> +++ sys/arch/amd64/amd64/vmm_machdep.c
> @@ -126,7 +126,7 @@ int svm_fault_page(struct vcpu *, paddr_t);
>  int vmx_fault_page(struct vcpu *, paddr_t);
>  int vmx_handle_np_fault(struct vcpu *);
>  int svm_handle_np_fault(struct vcpu *);
> -int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int);
> +int vmx_mprotect_ept(struct vcpu *, vm_map_t, paddr_t, paddr_t, int);
>  pt_entry_t *vmx_pmap_find_pte_ept(pmap_t, paddr_t);
>  int vmm_alloc_vpid(uint16_t *);
>  void vmm_free_vpid(uint16_t);
> @@ -777,7 +777,8 @@ vm_mprotect_ept(struct vm_mprotect_ept_params *vmep)
>  	}
>
>  	if (vmm_softc->mode == VMM_MODE_EPT)
> -		ret = vmx_mprotect_ept(vm->vm_map, sgpa, sgpa + size, prot);
> +		ret = vmx_mprotect_ept(vcpu, vm->vm_map, sgpa, sgpa + size,
> +		    prot);
>  	else if (vmm_softc->mode == VMM_MODE_RVI) {
>  		pmap_write_protect(vm->vm_map->pmap, sgpa, sgpa + size, prot);
>  		/* XXX requires a invlpga */
> @@ -799,7 +800,8 @@ out_nolock:
>   * required.
>   */
>  int
> -vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_t egpa, int prot)
> +vmx_mprotect_ept(struct vcpu *vcpu, vm_map_t vm_map, paddr_t sgpa, paddr_t egpa,
> +    int prot)
>  {
>  	struct vmx_invept_descriptor vid;
>  	pmap_t pmap;
> @@ -859,7 +861,7 @@ vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_
>  		vid.vid_eptp = pmap->eptp;
>  		DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
>  		    vid.vid_eptp);
> -		invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid);
> +		invept(vcpu->vc_vmx_invept_op, &vid);
>  	}
>
>  	KERNEL_UNLOCK();
> @@ -2948,6 +2950,10 @@ vcpu_init_vmx(struct vcpu *vcpu)
>  		ret = EINVAL;
>  		goto exit;
>  	}
> +	if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
> +		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX;
> +	else
> +		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX;
>
>  	if (msr & IA32_EPT_VPID_CAP_WB) {
>  		/* WB cache type supported */
> @@ -3896,6 +3902,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
>  	struct schedstate_percpu *spc;
>  	struct vmx_msr_store *msr_store;
>  	struct vmx_invvpid_descriptor vid;
> +	struct vmx_invept_descriptor vid_ept;
>  	uint64_t cr0, eii, procbased, int_st;
>  	u_long s;
>
> @@ -3940,14 +3947,6 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
>  	}
>  	memset(&vcpu->vc_exit, 0, sizeof(vcpu->vc_exit));
>
> -	/* Host CR3 */
> -	cr3 = rcr3();
> -	if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
> -		printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
> -		    VMCS_HOST_IA32_CR3, cr3);
> -		return (EINVAL);
> -	}
> -
>  	/* Handle vmd(8) injected interrupts */
>  	/* Is there an interrupt pending injection? */
>  	if (vcpu->vc_inject.vie_type == VCPU_INJECT_INTR) {
> @@ -4001,6 +4000,22 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
>  			ci = curcpu();
>  			vcpu->vc_last_pcpu = ci;
>
> +			/* Invalidate EPT cache. */
> +			vid_ept.vid_reserved = 0;
> +			vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
> +			if (invept(vcpu->vc_vmx_invept_op, &vid_ept)) {
> +				printf("%s: invept\n", __func__);
> +				return (EINVAL);
> +			}
> +
> +			/* Host CR3 */
> +			cr3 = rcr3();
> +			if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
> +				printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
> +				    VMCS_HOST_IA32_CR3, cr3);
> +				return (EINVAL);
> +			}
> +
>  			setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1);
>  			if (gdt.rd_base == 0) {
>  				printf("%s: setregion\n", __func__);
> blob - 341b1df230940bb803789d4d77566075eb886f8a
> blob + 67f6bab09ae09e768b49ea31e1a28f7defc73b64
> --- sys/arch/amd64/include/specialreg.h
> +++ sys/arch/amd64/include/specialreg.h
> @@ -1117,6 +1117,8 @@
>  #define IA32_EPT_VPID_CAP_PAGE_WALK_4		(1ULL << 6)
>  #define IA32_EPT_VPID_CAP_WB			(1ULL << 14)
>  #define IA32_EPT_VPID_CAP_AD_BITS		(1ULL << 21)
> +#define IA32_EPT_VPID_CAP_INVEPT_CONTEXT	(1ULL << 25)
> +#define IA32_EPT_VPID_CAP_INVEPT_ALL		(1ULL << 26)
>
>  #define IA32_EPT_PAGING_CACHE_TYPE_UC	0x0
>  #define IA32_EPT_PAGING_CACHE_TYPE_WB	0x6
> blob - b67d3f0a7ba6de3297e3ace0b6531d65ef121f4b
> blob + 14e9b2dbfa5d55f6dab7ca04b3722665dbd7d21f
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -886,6 +886,7 @@ struct vcpu {
>  	uint32_t vc_vmx_vmcs_state;		/* [a] */
>  #define VMCS_CLEARED	0
>  #define VMCS_LAUNCHED	1
> +	uint64_t vc_vmx_invept_op;
>
>  	/* SVM only (all requiring [v]) */
>  	vaddr_t vc_svm_hsa_va;