From: Dave Voutila <dv@sisu.io>
Subject: vmm/vmx: update cr3 and flush ept (invept) on cpu migration
To: OpenBSD Tech <tech@openbsd.org>
Cc: Mike Larkin <mlarkin@openbsd.org>, Philip Guenther <guenther@openbsd.org>
Date: Sat, 13 Jul 2024 09:48:13 -0400

I'm going to try hard to jinx myself and say this is the white whale
I've been hunting since Bruges two years ago. Regardless, I think this
diff is something we want for correctness.

 - we need to update the host cr3 in the vmcs to prevent vmx restoring
   a stale cr3 if we migrate cpus while in the vcpu run loop. (The most
   likely reason this happens is if we sleep in uvm_fault(9).) We will
   have a bullshit PCID if that happens.

 - we need to invalidate cached EPT mappings if we change the cpu as
   well. (No need yet for a shootdown as we're not MP.) It's possible to
   recycle the pdp page backing the EPT, so the cpu could have a cached
   EPT for a now-dead vm vcpu. (Both checks are sketched below.)
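
To make the shape of the fix concrete, here's a condensed sketch of the
migration check added in the vcpu run loop (names and logic follow the
diff below; error printfs trimmed):

	if (ci != curcpu()) {
		ci = curcpu();
		vcpu->vc_last_pcpu = ci;

		/* Flush any EPT mappings this cpu may have cached for
		 * a recycled pdp page. */
		vid_ept.vid_reserved = 0;
		vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
		if (invept(vcpu->vc_vmx_invept_op, &vid_ept))
			return (EINVAL);

		/* Refresh the host cr3 in the vmcs so vmx doesn't
		 * restore a stale one (and a stale PCID) on exit. */
		cr3 = rcr3();
		if (vmwrite(VMCS_HOST_IA32_CR3, cr3))
			return (EINVAL);
	}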

Since INVEPT has two main modes (single vs. global), this diff adds a
check via MSR to see if we can just do a single-context flush (i.e. of
the given ept). I liked the approach nvmm takes, so I borrowed the idea
of saving the mode in the vcpu object. (Not all Intel CPUs support
"single", hence the MSR check.)
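
A minimal sketch of that one-time check at vcpu init, assuming the
rdmsr of IA32_VMX_EPT_VPID_CAP that vcpu_init_vmx() already performs
(see the diff below):

	msr = rdmsr(IA32_VMX_EPT_VPID_CAP);
	if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX;
	else
		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX;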

I've tested this enough that I'm confident it's correct. My confidence
that this is my whale and not a corpulent fish will increase over the
next 24 hours. (I don't use vmx_mprotect_ept, but I did update its use
of invept to use the supported mode instead of assuming single context.)
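
For reference, the vmx_mprotect_ept() change just swaps the hardcoded
mode for the one saved on the vcpu, roughly:

	vid.vid_eptp = pmap->eptp;
	invept(vcpu->vc_vmx_invept_op, &vid);	/* was IA32_VMX_INVEPT_SINGLE_CTX */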

ok?

diffstat ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please
 M  sys/arch/amd64/amd64/vmm_machdep.c   |  27+  12-
 M  sys/arch/amd64/include/specialreg.h  |   2+   0-
 M  sys/arch/amd64/include/vmmvar.h      |   1+   0-

3 files changed, 30 insertions(+), 12 deletions(-)

diff ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please
commit - ba080721f3fe881c525a96bf32265471b142c251
commit + f5b1b1d71e41e72923c85f5ff21e863de7b66cb6
blob - f150569d9ea3e3cab4bbda818ac6f513d4a4014f
blob + 3f0dfba5a5629fabe051325e6a3a67acbafba7c1
--- sys/arch/amd64/amd64/vmm_machdep.c
+++ sys/arch/amd64/amd64/vmm_machdep.c
@@ -126,7 +126,7 @@ int svm_fault_page(struct vcpu *, paddr_t);
 int vmx_fault_page(struct vcpu *, paddr_t);
 int vmx_handle_np_fault(struct vcpu *);
 int svm_handle_np_fault(struct vcpu *);
-int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int);
+int vmx_mprotect_ept(struct vcpu *, vm_map_t, paddr_t, paddr_t, int);
 pt_entry_t *vmx_pmap_find_pte_ept(pmap_t, paddr_t);
 int vmm_alloc_vpid(uint16_t *);
 void vmm_free_vpid(uint16_t);
@@ -777,7 +777,8 @@ vm_mprotect_ept(struct vm_mprotect_ept_params *vmep)
 	}

 	if (vmm_softc->mode == VMM_MODE_EPT)
-		ret = vmx_mprotect_ept(vm->vm_map, sgpa, sgpa + size, prot);
+		ret = vmx_mprotect_ept(vcpu, vm->vm_map, sgpa, sgpa + size,
+		    prot);
 	else if (vmm_softc->mode == VMM_MODE_RVI) {
 		pmap_write_protect(vm->vm_map->pmap, sgpa, sgpa + size, prot);
 		/* XXX requires a invlpga */
@@ -799,7 +800,8 @@ out_nolock:
  * required.
  */
 int
-vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_t egpa, int prot)
+vmx_mprotect_ept(struct vcpu *vcpu, vm_map_t vm_map, paddr_t sgpa, paddr_t egpa,
+    int prot)
 {
 	struct vmx_invept_descriptor vid;
 	pmap_t pmap;
@@ -859,7 +861,7 @@ vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_
 		vid.vid_eptp = pmap->eptp;
 		DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
 		    vid.vid_eptp);
-		invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid);
+		invept(vcpu->vc_vmx_invept_op, &vid);
 	}

 	KERNEL_UNLOCK();
@@ -2948,6 +2950,10 @@ vcpu_init_vmx(struct vcpu *vcpu)
 		ret = EINVAL;
 		goto exit;
 	}
+	if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
+		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX;
+	else
+		vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX;

 	if (msr & IA32_EPT_VPID_CAP_WB) {
 		/* WB cache type supported */
@@ -3896,6 +3902,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
 	struct schedstate_percpu *spc;
 	struct vmx_msr_store *msr_store;
 	struct vmx_invvpid_descriptor vid;
+	struct vmx_invept_descriptor vid_ept;
 	uint64_t cr0, eii, procbased, int_st;
 	u_long s;

@@ -3940,14 +3947,6 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
 	}
 	memset(&vcpu->vc_exit, 0, sizeof(vcpu->vc_exit));

-	/* Host CR3 */
-	cr3 = rcr3();
-	if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
-		printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
-		    VMCS_HOST_IA32_CR3, cr3);
-		return (EINVAL);
-	}
-
 	/* Handle vmd(8) injected interrupts */
 	/* Is there an interrupt pending injection? */
 	if (vcpu->vc_inject.vie_type == VCPU_INJECT_INTR) {
@@ -4001,6 +4000,22 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
 			ci = curcpu();
 			vcpu->vc_last_pcpu = ci;

+			/* Invalidate EPT cache. */
+			vid_ept.vid_reserved = 0;
+			vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
+			if (invept(vcpu->vc_vmx_invept_op, &vid_ept)) {
+				printf("%s: invept\n", __func__);
+				return (EINVAL);
+			}
+
+			/* Host CR3 */
+			cr3 = rcr3();
+			if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
+				printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
+				    VMCS_HOST_IA32_CR3, cr3);
+				return (EINVAL);
+			}
+
 			setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1);
 			if (gdt.rd_base == 0) {
 				printf("%s: setregion\n", __func__);
blob - 341b1df230940bb803789d4d77566075eb886f8a
blob + 67f6bab09ae09e768b49ea31e1a28f7defc73b64
--- sys/arch/amd64/include/specialreg.h
+++ sys/arch/amd64/include/specialreg.h
@@ -1117,6 +1117,8 @@
 #define IA32_EPT_VPID_CAP_PAGE_WALK_4		(1ULL << 6)
 #define IA32_EPT_VPID_CAP_WB			(1ULL << 14)
 #define IA32_EPT_VPID_CAP_AD_BITS		(1ULL << 21)
+#define IA32_EPT_VPID_CAP_INVEPT_CONTEXT	(1ULL << 25)
+#define IA32_EPT_VPID_CAP_INVEPT_ALL		(1ULL << 26)

 #define IA32_EPT_PAGING_CACHE_TYPE_UC	0x0
 #define IA32_EPT_PAGING_CACHE_TYPE_WB	0x6
blob - b67d3f0a7ba6de3297e3ace0b6531d65ef121f4b
blob + 14e9b2dbfa5d55f6dab7ca04b3722665dbd7d21f
--- sys/arch/amd64/include/vmmvar.h
+++ sys/arch/amd64/include/vmmvar.h
@@ -886,6 +886,7 @@ struct vcpu {
 	uint32_t vc_vmx_vmcs_state;		/* [a] */
 #define VMCS_CLEARED	0
 #define VMCS_LAUNCHED	1
+	uint64_t vc_vmx_invept_op;

 	/* SVM only (all requiring [v]) */
 	vaddr_t vc_svm_hsa_va;