vmm/vmx: update cr3 and flush ept (invept) on cpu migration
On Sat, Jul 13, 2024 at 09:48:13AM -0400, Dave Voutila wrote:
> I'm going to try hard to jinx myself and say this is the white whale
> I've been hunting since Bruges two years ago. Regardless, I think this
> diff is something we want for correctness.
>
> - we need to update the host cr3 in the vmcs to prevent vmx restoring
> a stale cr3 if we migrate cpus while in the vcpu run loop. (The most
> likely reason this happens is if we sleep in uvm_fault(9).) We will
> have a bullshit PCID if that happens.
>
> - we need to invalidate cached EPT mappings if we change the cpu as
> well. (No need yet for shootdown as we're not MP.) It's possible to
> recycle the pdp page backing the EPT, so the cpu could have cached
> EPT mappings for a now-dead vm's vcpu. (Both fixes are sketched
> below.)
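Condensed, the fix lands as the following pattern at the top of the vcpu
run loop. This is a sketch assuming the names from the diff below, with
error messages trimmed; it is not the committed code verbatim:

	/*
	 * curcpu() may have changed since the last iteration if the
	 * thread slept (e.g. in uvm_fault(9)) and migrated.
	 */
	ci = curcpu();
	vcpu->vc_last_pcpu = ci;

	/*
	 * Flush EPT translations this cpu may have cached for a
	 * recycled eptp: single-context if the cpu supports it,
	 * otherwise global (vc_vmx_invept_op is chosen at vcpu init).
	 */
	vid_ept.vid_reserved = 0;
	vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
	if (invept(vcpu->vc_vmx_invept_op, &vid_ept))
		return (EINVAL);

	/*
	 * Rewrite host cr3 so vm exit restores this cpu's current
	 * page tables rather than those of the cpu we were on when
	 * the loop was entered.
	 */
	cr3 = rcr3();
	if (vmwrite(VMCS_HOST_IA32_CR3, cr3))
		return (EINVAL);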
>
> Since INVEPT has two main modes (single vs. global), the diff adds a
> check via MSR to see if we can do a single-context flush (i.e. flush
> only the given ept). I liked the approach nvmm takes, so I borrowed
> the idea of saving the mode to the vcpu object. (Not all Intel cpus
> support "single", hence the MSR check.)
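For context on what invept() actually executes: a minimal illustrative
wrapper around the INVEPT instruction might look like the sketch below.
This assumes the encoding described in the Intel SDM; OpenBSD's real
invept() is implemented in assembly and its error handling may differ:

	/*
	 * Illustrative sketch only, not OpenBSD's implementation.
	 * INVEPT takes the invalidation type in a register (1 =
	 * single-context, 2 = global) and a 128-bit descriptor in
	 * memory: the eptp to flush, followed by a reserved field
	 * that must be zero.
	 */
	static inline int
	invept_sketch(uint64_t type, struct vmx_invept_descriptor *vid)
	{
		uint8_t ok;

		/*
		 * INVEPT sets CF on an invalid type and ZF on
		 * VMfailValid; success leaves both clear, i.e. the
		 * "above" condition.
		 */
		__asm volatile("invept %[desc], %[vtype]; seta %[ok]"
		    : [ok] "=q" (ok)
		    : [vtype] "r" (type), [desc] "m" (*vid)
		    : "cc", "memory");

		return (ok ? 0 : EINVAL);
	}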
>
> I've tested this enough that I'm confident it's correct. My confidence
> that this is my whale and not a corpulent fish will increase over the
> next 24 hours. (I don't use vmx_mprotect_ept, but did update its use
> of invept to use the supported mode instead of assuming single
> context.)
>
> ok?
>
Nice! If you are happy with the results, the diff reads ok to me. Please
go ahead.
-ml
> diffstat ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please
> M sys/arch/amd64/amd64/vmm_machdep.c | 27+ 12-
> M sys/arch/amd64/include/specialreg.h | 2+ 0-
> M sys/arch/amd64/include/vmmvar.h | 1+ 0-
>
> 3 files changed, 30 insertions(+), 12 deletions(-)
>
> diff ba080721f3fe881c525a96bf32265471b142c251 refs/heads/please
> commit - ba080721f3fe881c525a96bf32265471b142c251
> commit + f5b1b1d71e41e72923c85f5ff21e863de7b66cb6
> blob - f150569d9ea3e3cab4bbda818ac6f513d4a4014f
> blob + 3f0dfba5a5629fabe051325e6a3a67acbafba7c1
> --- sys/arch/amd64/amd64/vmm_machdep.c
> +++ sys/arch/amd64/amd64/vmm_machdep.c
> @@ -126,7 +126,7 @@ int svm_fault_page(struct vcpu *, paddr_t);
> int vmx_fault_page(struct vcpu *, paddr_t);
> int vmx_handle_np_fault(struct vcpu *);
> int svm_handle_np_fault(struct vcpu *);
> -int vmx_mprotect_ept(vm_map_t, paddr_t, paddr_t, int);
> +int vmx_mprotect_ept(struct vcpu *, vm_map_t, paddr_t, paddr_t, int);
> pt_entry_t *vmx_pmap_find_pte_ept(pmap_t, paddr_t);
> int vmm_alloc_vpid(uint16_t *);
> void vmm_free_vpid(uint16_t);
> @@ -777,7 +777,8 @@ vm_mprotect_ept(struct vm_mprotect_ept_params *vmep)
> }
>
> if (vmm_softc->mode == VMM_MODE_EPT)
> - ret = vmx_mprotect_ept(vm->vm_map, sgpa, sgpa + size, prot);
> + ret = vmx_mprotect_ept(vcpu, vm->vm_map, sgpa, sgpa + size,
> + prot);
> else if (vmm_softc->mode == VMM_MODE_RVI) {
> pmap_write_protect(vm->vm_map->pmap, sgpa, sgpa + size, prot);
> /* XXX requires a invlpga */
> @@ -799,7 +800,8 @@ out_nolock:
> * required.
> */
> int
> -vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_t egpa, int prot)
> +vmx_mprotect_ept(struct vcpu *vcpu, vm_map_t vm_map, paddr_t sgpa, paddr_t egpa,
> + int prot)
> {
> struct vmx_invept_descriptor vid;
> pmap_t pmap;
> @@ -859,7 +861,7 @@ vmx_mprotect_ept(vm_map_t vm_map, paddr_t sgpa, paddr_
> vid.vid_eptp = pmap->eptp;
> DPRINTF("%s: flushing EPT TLB for EPTP 0x%llx\n", __func__,
> vid.vid_eptp);
> - invept(IA32_VMX_INVEPT_SINGLE_CTX, &vid);
> + invept(vcpu->vc_vmx_invept_op, &vid);
> }
>
> KERNEL_UNLOCK();
> @@ -2948,6 +2950,10 @@ vcpu_init_vmx(struct vcpu *vcpu)
> ret = EINVAL;
> goto exit;
> }
> + if (msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
> + vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_SINGLE_CTX;
> + else
> + vcpu->vc_vmx_invept_op = IA32_VMX_INVEPT_GLOBAL_CTX;
>
> if (msr & IA32_EPT_VPID_CAP_WB) {
> /* WB cache type supported */
> @@ -3896,6 +3902,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
> struct schedstate_percpu *spc;
> struct vmx_msr_store *msr_store;
> struct vmx_invvpid_descriptor vid;
> + struct vmx_invept_descriptor vid_ept;
> uint64_t cr0, eii, procbased, int_st;
> u_long s;
>
> @@ -3940,14 +3947,6 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
> }
> memset(&vcpu->vc_exit, 0, sizeof(vcpu->vc_exit));
>
> - /* Host CR3 */
> - cr3 = rcr3();
> - if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
> - printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
> - VMCS_HOST_IA32_CR3, cr3);
> - return (EINVAL);
> - }
> -
> /* Handle vmd(8) injected interrupts */
> /* Is there an interrupt pending injection? */
> if (vcpu->vc_inject.vie_type == VCPU_INJECT_INTR) {
> @@ -4001,6 +4000,22 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
> ci = curcpu();
> vcpu->vc_last_pcpu = ci;
>
> + /* Invalidate EPT cache. */
> + vid_ept.vid_reserved = 0;
> + vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
> + if (invept(vcpu->vc_vmx_invept_op, &vid_ept)) {
> + printf("%s: invept\n", __func__);
> + return (EINVAL);
> + }
> +
> + /* Host CR3 */
> + cr3 = rcr3();
> + if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
> + printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
> + VMCS_HOST_IA32_CR3, cr3);
> + return (EINVAL);
> + }
> +
> setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1);
> if (gdt.rd_base == 0) {
> printf("%s: setregion\n", __func__);
> blob - 341b1df230940bb803789d4d77566075eb886f8a
> blob + 67f6bab09ae09e768b49ea31e1a28f7defc73b64
> --- sys/arch/amd64/include/specialreg.h
> +++ sys/arch/amd64/include/specialreg.h
> @@ -1117,6 +1117,8 @@
> #define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6)
> #define IA32_EPT_VPID_CAP_WB (1ULL << 14)
> #define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21)
> +#define IA32_EPT_VPID_CAP_INVEPT_CONTEXT (1ULL << 25)
> +#define IA32_EPT_VPID_CAP_INVEPT_ALL (1ULL << 26)
>
> #define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0
> #define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6
> blob - b67d3f0a7ba6de3297e3ace0b6531d65ef121f4b
> blob + 14e9b2dbfa5d55f6dab7ca04b3722665dbd7d21f
> --- sys/arch/amd64/include/vmmvar.h
> +++ sys/arch/amd64/include/vmmvar.h
> @@ -886,6 +886,7 @@ struct vcpu {
> uint32_t vc_vmx_vmcs_state; /* [a] */
> #define VMCS_CLEARED 0
> #define VMCS_LAUNCHED 1
> + uint64_t vc_vmx_invept_op;
>
> /* SVM only (all requiring [v]) */
> vaddr_t vc_svm_hsa_va;