From: Mike Larkin Subject: Re: vmm/vmx: update cr3 and flush ept (invept) on cpu migration To: Dave Voutila Cc: OpenBSD Tech , Philip Guenther Date: Sat, 13 Jul 2024 13:46:56 -0700 On Sat, Jul 13, 2024 at 09:48:13AM -0400, Dave Voutila wrote: > I'm going to try hard to jinx myself and say this is the white whale > i've been hunting since Bruges two years ago. Regardless, I think this > diff is something we want for correctness. > > - we need to update the host cr3 in the vmcs to prevent vmx restoring > a stale cr3 if we migrate cpus while in the vcpu run loop. (most > likely reason this happens is if we sleep in uvm_fault(9).) We will > have a bullshit PCID if that happens. > > - we need to invalidate cached EPT mappings if we change the cpu as > well. (No need yet for shootdown as we're not MP.) It's possible to > recycle pdp page backing the EPT, so the cpu could have a cached EPT > for a now-dead vm vcpu. > > Since INVEPT has two main modes (single vs global), adds a check via MSR > to see if we can just do single context flush (i.e. the given ept). I > liked the approach nvmm takes, so borrowed the idea of saving the mode > to the vcpu object. (Not all Intel CPUs support "single" hence the MSR > check.) > > I've tested this enough I'm confident it's correct. My confidence this > is my whale and not a corpulent fish will increase over the next 24 > hours. (I don't use vmx_mprotect_ept, but did update its use of invept > to use the supported mode instead of assuming single context.) > > ok? > Nice! If you are happy with the results, the diff reads ok to me. Please go ahead. 
-ml > diffstat ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please > M sys/arch/amd64/amd64/vmm_machdep.c | 27+ 12- > M sys/arch/amd64/include/specialreg.h | 2+ 0- > M sys/arch/amd64/include/vmmvar.h | 1+ 0- > > 3 files changed, 30 insertions(+), 12 deletions(-) > > diff ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please > commit - ba080721f3fe881c525a96bf32265471b142c251 > commit + f5b1b1d71e41e72923c85f5ff21e863de7b66cb6 > blob - f150569d9ea3e3cab4bbda818ac6f513d4a4014f > blob + 3f0dfba5a5629fabe051325e6a3a67acbafba7c1 > --- sys/arch/amd64/amd64/vmm_machdep.c > +++ sys/arch/amd64/amd64/vmm_machdep.c > @@ -126,7 +126,7 @@ int svm_fault_page(struct vcpu *, paddr_t); > int vmx_fault_page(struct vcpu *, paddr_t); > int vmx_handle_np_fault(struct vcpu *); > int svm_handle_np_fault(struct vcpu *); > -int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int); > +int vmx_mprotect_ept(struct vcpu *, vm_map_t, paddr_t, paddr_t, int); > pt_entry_t *vmx_pmap_find_pte_ept(pmap_t, paddr_t); > int vmm_alloc_vpid(uint16_t *); > void vmm_free_vpid(uint16_t); > @@ -777,7 +777,8 @@ vm_mprotect_ept(struct vm_mprotect_ept_params *vmep) > } > > if (vmm_softc->mode == VMM_MODE_EPT) > - ret = vmx_mprotect_ept(vm->vm_map, sgpa, sgpa + size, prot); > + ret = vmx_mprotect_ept(vcpu, vm->vm_map, sgpa, sgpa + size, > + prot); > else if (vmm_softc->mode == VMM_MODE_RVI) { > pmap_write_protect(vm->vm_map->pmap, sgpa, sgpa + size, prot); > /* XXX requires a invlpga */ > @@ -799,7 +800,8 @@ out_nolock: > * required. 
> */ > int > -vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_t egpa, int prot) > +vmx_mprotect_ept(struct vcpu *vcpu, vm_map_t vm_map, paddr_t sgpa, paddr_t egpa, > + int prot) > { > struct vmx_invept_descriptor vid; > pmap_t pmap; > @@ -859,7 +861,7 @@ vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_ > vid.vid_eptp = pmap->eptp; > DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__, > vid.vid_eptp); > - invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid); > + invept(vcpu->vc_vmx_invept_op, &vid); > } > > KERNEL_UNLOCK(); > @@ -2948,6 +2950,10 @@ vcpu_init_vmx(struct vcpu *vcpu) > ret = EINVAL; > goto exit; > } > + if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT) > + vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX; > + else > + vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX; > > if (msr & IA32_EPT_VPID_CAP_WB) { > /* WB cache type supported */ > @@ -3896,6 +3902,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * > struct schedstate_percpu *spc; > struct vmx_msr_store *msr_store; > struct vmx_invvpid_descriptor vid; > + struct vmx_invept_descriptor vid_ept; > uint64_t cr0, eii, procbased, int_st; > u_long s; > > @@ -3940,14 +3947,6 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * > } > memset(&vcpu->vc_exit, 0, sizeof(vcpu->vc_exit)); > > - /* Host CR3 */ > - cr3 = rcr3(); > - if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) { > - printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__, > - VMCS_HOST_IA32_CR3, cr3); > - return (EINVAL); > - } > - > /* Handle vmd(8) injected interrupts */ > /* Is there an interrupt pending injection? */ > if (vcpu->vc_inject.vie_type == VCPU_INJECT_INTR) { > @@ -4001,6 +4000,22 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * > ci = curcpu(); > vcpu->vc_last_pcpu = ci; > > + /* Invalidate EPT cache. 
*/ > + vid_ept.vid_reserved = 0; > + vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp; > + if (invept(vcpu->vc_vmx_invept_op, &vid_ept)) { > + printf("%s: invept\n", __func__); > + return (EINVAL); > + } > + > + /* Host CR3 */ > + cr3 = rcr3(); > + if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) { > + printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__, > + VMCS_HOST_IA32_CR3, cr3); > + return (EINVAL); > + } > + > setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1); > if (gdt.rd_base == 0) { > printf("%s: setregion\n", __func__); > blob - 341b1df230940bb803789d4d77566075eb886f8a > blob + 67f6bab09ae09e768b49ea31e1a28f7defc73b64 > --- sys/arch/amd64/include/specialreg.h > +++ sys/arch/amd64/include/specialreg.h > @@ -1117,6 +1117,8 @@ > #define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6) > #define IA32_EPT_VPID_CAP_WB (1ULL << 14) > #define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21) > +#define IA32_EPT_VPID_CAP_INVEPT_CONTEXT (1ULL << 25) > +#define IA32_EPT_VPID_CAP_INVEPT_ALL (1ULL << 26) > > #define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0 > #define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6 > blob - b67d3f0a7ba6de3297e3ace0b6531d65ef121f4b > blob + 14e9b2dbfa5d55f6dab7ca04b3722665dbd7d21f > --- sys/arch/amd64/include/vmmvar.h > +++ sys/arch/amd64/include/vmmvar.h > @@ -886,6 +886,7 @@ struct vcpu { > uint32_t vc_vmx_vmcs_state; /* [a] */ > #define VMCS_CLEARED 0 > #define VMCS_LAUNCHED 1 > + uint64_t vc_vmx_invept_op; > > /* SVM only (all requiring [v]) */ > vaddr_t vc_svm_hsa_va;