From: Dave Voutila
Subject: teach EPT pmaps how to use pv_lists
To: tech@openbsd.org
Cc: Mike Larkin
Date: Fri, 04 Oct 2024 10:27:55 -0400

Looking for some testing (of vmm/vmd on Intel hosts) and review of the
following diff. In short, the objective is to incrementally bring EPT
pmap operations more in line with regular amd64 and AMD RVI pmaps.

This diff:

- updates pmap_enter_ept() to add pv_entry's when inserting a pte in an
  EPT pmap

- updates pmap_do_remove_ept() to support removing pv_entry's when
  removing a pte from an EPT pmap

- updates pmap_page_remove() to support zapping an EPT pte if a given
  UVM vm_page is being removed and has a pv_entry pointing to an EPT
  pte

For anyone reviewing:

- I needed to change the PG_PVLIST bit so it would still be valid in an
  EPT pte, so I just swapped it for PG_AVAIL3, which looks available
  for use in EPT ptes.

- Validity checks on EPT ptes are not currently performed. For
  correctness, one cannot simply reuse the PG_V bit: EPT ptes are
  special and support true X-only permissions (in addition to breaking
  R out from W), so a pte can be valid with bit 0 (EPT_R) clear. That
  should be a different diff IMO. (A sketch of what such a check might
  look like follows below, before the diff.)

I'm not a big fan of throwing more junk directly into
pmap_page_remove(), but given how it's designed it feels like the most
sane place to put the logic for now.
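For illustration, the validity check I have in mind would look
something like the sketch below. It is not part of this diff;
pmap_ept_pte_valid is a placeholder name, while EPT_R/EPT_W/EPT_X are
the existing EPT permission bits in pmap.h:

	/*
	 * Sketch only: an EPT pte is present when any of the R/W/X
	 * permission bits are set. Bit 0 is EPT_R, so testing PG_V
	 * (also bit 0) would misread a valid X-only mapping as
	 * invalid.
	 */
	static inline int
	pmap_ept_pte_valid(pt_entry_t pte)
	{
		return ((pte & (EPT_R | EPT_W | EPT_X)) != 0);
	}

The two "XXX ... EPT valid bit" comments in the diff below mark where
the plain compares against zero would become such a check.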
diffstat refs/heads/master refs/heads/ept-pvlist-3
 M  sys/arch/amd64/amd64/pmap.c   |  90+  14-
 M  sys/arch/amd64/include/pmap.h |   2+   2-

2 files changed, 92 insertions(+), 16 deletions(-)

diff refs/heads/master refs/heads/ept-pvlist-3
commit - 3c9ccb50785e1c693c25fd18c467a46499c73493
commit + 1fd874b9a12aadcaf92afdc5c9caa94bb5970c27
blob - 35af9d10cdc6cb79f6288b964bc350a46bfce8d6
blob + 4e843b19983e2a3c277c8cc9aea8165746c80aea
--- sys/arch/amd64/amd64/pmap.c
+++ sys/arch/amd64/amd64/pmap.c
@@ -336,8 +336,8 @@ struct pv_entry *pmap_remove_pv(struct vm_page *, stru
 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
 #if NVMM > 0
 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
-void pmap_do_remove_ept(struct pmap *, vaddr_t);
-int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
+void pmap_do_remove_ept(struct pmap *, vaddr_t, struct pv_entry **);
+int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t, int);
 void pmap_shootept(struct pmap *, int);
 #endif /* NVMM > 0 */
 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
@@ -1970,6 +1970,39 @@ pmap_page_remove(struct vm_page *pg)
 		pm = pve->pv_pmap;
 		mtx_leave(&pg->mdpage.pv_mtx);
 
+#if NVMM > 0
+		if (pm->pm_type == PMAP_TYPE_EPT) {
+			mtx_enter(&pm->pm_mtx);
+
+			/*
+			 * See note below about dropping pvlist lock.
+			 */
+			mtx_enter(&pg->mdpage.pv_mtx);
+			if ((pve = pg->mdpage.pv_list) == NULL ||
+			    pve->pv_pmap != pm) {
+				mtx_leave(&pg->mdpage.pv_mtx);
+				mtx_leave(&pm->pm_mtx);
+				pmap_destroy(pm);
+				mtx_enter(&pg->mdpage.pv_mtx);
+				continue;
+			}
+
+			/* Detach the pv_entry from the pv_list. */
+			pg->mdpage.pv_list = pve->pv_next;
+			mtx_leave(&pg->mdpage.pv_mtx);
+
+			/* Zap it. */
+			pmap_do_remove_ept(pm, pve->pv_va, NULL);
+			pmap_shootept(pm, 1);
+
+			/* Clean up before next iteration. */
+			mtx_leave(&pm->pm_mtx);
+			pmap_destroy(pm);
+			pool_put(&pmap_pv_pool, pve);
+			mtx_enter(&pg->mdpage.pv_mtx);
+			continue;
+		}
+#endif /* NVMM > 0 */
+
 		/* XXX use direct map? */
 		scr3 = pmap_map_ptes(pm);	/* locks pmap */
 		shootself = (scr3 == 0);
@@ -2460,29 +2493,36 @@ void
 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
 {
 	vaddr_t v;
+	struct pv_entry *pve, *free_pvs = NULL;
 
 	mtx_enter(&pmap->pm_mtx);
 
 	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
 	    (uint64_t)egpa);
 	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
-		pmap_do_remove_ept(pmap, v);
+		pmap_do_remove_ept(pmap, v, &free_pvs);
 
 	pmap_shootept(pmap, 1);
 
 	mtx_leave(&pmap->pm_mtx);
-	pmap_tlb_shootwait();
+
+	while ((pve = free_pvs) != NULL) {
+		free_pvs = pve->pv_next;
+		pool_put(&pmap_pv_pool, pve);
+	}
 }
 
 void
-pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
+pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa, struct pv_entry **free_pvs)
 {
 	uint64_t l4idx, l3idx, l2idx, l1idx;
-	struct vm_page *pg3, *pg2, *pg1;
+	struct vm_page *pg, *pg3, *pg2, *pg1;
+	struct pv_entry *pve;
 	paddr_t npa3, npa2, npa1;
 	pd_entry_t *pd4, *pd3, *pd2, *pd1;
 	pd_entry_t *pptes;
+	pt_entry_t opte;
 
 	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
@@ -2518,10 +2558,23 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
 	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
 	pg1 = PHYS_TO_VM_PAGE(npa1);
 
-	if (pd1[l1idx] == 0)
+	opte = pmap_pte_set(&pd1[l1idx], 0);
+	if (opte == 0)		/* XXX check using an EPT valid bit. */
 		return;
 
-	pd1[l1idx] = 0;
+	if (opte & PG_PVLIST) {
+		pg = PHYS_TO_VM_PAGE(opte & pg_frame);
+		if (pg == NULL)
+			panic("%s: PG_PVLIST mapping with unmanaged page: gpa "
+			    "0x%lx, opte 0x%llx, hpa 0x%llx", __func__, gpa,
+			    opte, opte & pg_frame);
+		pve = pmap_remove_pv(pg, pmap, gpa);
+		if (pve != NULL && free_pvs != NULL) {
+			pve->pv_next = *free_pvs;
+			*free_pvs = pve;
+		}
+	}
+
 	pg1->wire_count--;
 	pmap->pm_stats.resident_count--;
@@ -2556,11 +2609,14 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
 }
 
 int
-pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
+pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot,
+    int flags)
 {
 	uint64_t l4idx, l3idx, l2idx, l1idx;
-	pd_entry_t *pd, npte;
-	struct vm_page *ptp, *pptp;
+	pd_entry_t *pd;
+	pt_entry_t opte, npte;
+	struct vm_page *pg, *ptp, *pptp;
+	struct pv_entry *pve;
 	paddr_t npa;
 	struct uvm_object *obj;
 	int ret = 0;
@@ -2573,6 +2629,13 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
 
+	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
+	if (pve == NULL) {
+		if (flags & PMAP_CANFAIL)
+			return ENOMEM;
+		panic("%s: no pv entries available", __func__);
+	}
+
 	mtx_enter(&pmap->pm_mtx);
 
 	/* Start at PML4 / top level */
@@ -2720,11 +2783,21 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 	if (prot & PROT_EXEC)
 		npte |= EPT_X;
 
-	if (pd[l1idx] == 0) {
+	opte = pd[l1idx];	/* XXX check an EPT valid bit */
+	if (opte == 0) {
 		ptp->wire_count++;
 		pmap->pm_stats.resident_count++;
+
+		pg = PHYS_TO_VM_PAGE(hpa);
+		if (pg != NULL) {
+			pmap_enter_pv(pg, pve, pmap, gpa, ptp);
+			npte |= PG_PVLIST;
+			pve = NULL;
+		} else
+			panic("%s: cannot find vm page", __func__);
 	} else {
-		/* XXX flush ept */
+		pmap_shootept(pmap, 1);
+		pmap_tlb_shootwait();
 	}
 
 	pd[l1idx] = npte;
@@ -2732,6 +2805,9 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 unlock:
 	mtx_leave(&pmap->pm_mtx);
 
+	if (pve != NULL)
+		pool_put(&pmap_pv_pool, pve);
+
 	return ret;
 }
 #endif /* NVMM > 0 */
@@ -2758,7 +2834,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa,
 
 #if NVMM > 0
 	if (pmap_is_ept(pmap))
-		return pmap_enter_ept(pmap, va, pa, prot);
+		return pmap_enter_ept(pmap, va, pa, prot, flags);
 #endif /* NVMM > 0 */
 
 	KASSERT(!(wc && nocache));
blob - cc358eab1e93376f4d0b511d0c0bcd06aa476b40
blob + 6eeab997123e6d10a9dddde3f55bf25950f43978
--- sys/arch/amd64/include/pmap.h
+++ sys/arch/amd64/include/pmap.h
@@ -245,8 +245,8 @@
  */
 #define PG_W		PG_AVAIL1	/* "wired" mapping */
-#define PG_PVLIST	PG_AVAIL2	/* mapping has entry on pvlist */
-/* PG_AVAIL3 not used */
+/* PG_AVAIL2 not used */
+#define PG_PVLIST	PG_AVAIL3	/* mapping has entry on pvlist */
 
 /*
  * PCID assignments.