From: Dave Voutila
Subject: Re: teach EPT pmaps how to use pv_lists
To: tech@openbsd.org
Cc: Mike Larkin, Martin Pieuchot, Mischa
Date: Sun, 08 Dec 2024 12:45:42 -0500

Dave Voutila writes:

Updated diff below with changes suggested by mlarkin@. Testers and oks
needed :D

> Looking for some testing (of vmm/vmd on Intel hosts) and review of the
> following diff.
>
> In short, the objective is to incrementally bring EPT pmap operations
> more in line with regular amd64 and AMD RVI pmaps. This diff:
>
> - updates pmap_enter_ept() to add pv_entry's when inserting a pte in
>   an EPT pmap
>
> - updates pmap_do_remove_ept() to support removing pv_entry's when
>   removing a pte from an EPT pmap
>
> - updates pmap_page_remove() to support zapping an EPT pte if a given
>   UVM vm_page is being removed and has a pv_entry pointing to an EPT
>   pte
>
> For anyone reviewing:
>
> - I needed to change the PG_PVLIST bit so it would still be valid in
>   an EPT pte, so I swapped it for PG_AVAIL3, which looks available for
>   use in EPT ptes.
>
> - Validity checks on EPT ptes are not currently performed. For
>   correctness, one cannot just use the same PG_V bit, as EPT ptes are
>   special and support true X-only permissions (in addition to breaking
>   out R from W). That should be a different diff IMO.
>
> I'm not a big fan of throwing more junk directly into
> pmap_page_remove(), but given how it's designed it feels like the most
> sane place to put the logic for now.
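One behavioral note for reviewers (the sketch below is illustrative
only, not part of the diff): since pmap_enter_ept() now preallocates
its pv_entry with PR_NOWAIT, pmap_enter() on an EPT pmap can fail with
ENOMEM when the caller passes PMAP_CANFAIL, instead of only ever
succeeding or panicking. Given some pmap/gpa/hpa, a caller honoring
that would look roughly like:

	int error;

	/*
	 * With PMAP_CANFAIL set, pv_entry exhaustion surfaces as
	 * ENOMEM rather than a panic, so the caller can back off,
	 * wait for memory, and retry.
	 */
	error = pmap_enter(pmap, gpa, hpa, PROT_READ | PROT_WRITE,
	    PROT_READ | PROT_WRITE | PMAP_CANFAIL);
	if (error == ENOMEM) {
		/* e.g. sleep via uvm_wait(9), then retry */
	}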
diffstat refs/heads/master refs/heads/ept-pvlist-3
 M  sys/arch/amd64/amd64/pmap.c    |  91+  14-
 M  sys/arch/amd64/include/pmap.h  |   2+   2-

2 files changed, 93 insertions(+), 16 deletions(-)

diff refs/heads/master refs/heads/ept-pvlist-3
commit - e2c061ecb9e807859d3525b9e868ae5d227b3878
commit + 4791f1f2f408c10c66cde3d074c062c2185f2a2a
blob - bfd667f99bf2dec1a468ca3d8d4f95bf34401026
blob + 01b8f6c2915c26427a9a1e8449fbaee452e5b17b
--- sys/arch/amd64/amd64/pmap.c
+++ sys/arch/amd64/amd64/pmap.c
@@ -336,8 +336,8 @@ struct pv_entry *pmap_remove_pv(struct vm_page *, stru
 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
 #if NVMM > 0
 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
-void pmap_do_remove_ept(struct pmap *, vaddr_t);
-int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
+void pmap_do_remove_ept(struct pmap *, vaddr_t, struct pv_entry **);
+int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t, int);
 void pmap_shootept(struct pmap *, int);
 #endif /* NVMM > 0 */
 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
@@ -1970,6 +1970,40 @@ pmap_page_remove(struct vm_page *pg)
 		pm = pve->pv_pmap;
 		mtx_leave(&pg->mdpage.pv_mtx);
 
+#if NVMM > 0
+		if (pm->pm_type == PMAP_TYPE_EPT) {
+			mtx_enter(&pm->pm_mtx);
+
+			/*
+			 * See note below about dropping pvlist lock.
+			 */
+			mtx_enter(&pg->mdpage.pv_mtx);
+			if ((pve = pg->mdpage.pv_list) == NULL ||
+			    pve->pv_pmap != pm) {
+				mtx_leave(&pg->mdpage.pv_mtx);
+				mtx_leave(&pm->pm_mtx);
+				pmap_destroy(pm);
+				mtx_enter(&pg->mdpage.pv_mtx);
+				continue;
+			}
+
+			/* Detach the pv_entry from the pv_list. */
+			pg->mdpage.pv_list = pve->pv_next;
+			mtx_leave(&pg->mdpage.pv_mtx);
+
+			/* Zap it. */
+			pmap_do_remove_ept(pm, pve->pv_va, NULL);
+			pmap_shootept(pm, 1);
+
+			/* Clean up before next iteration. */
+			mtx_leave(&pm->pm_mtx);
+			pmap_destroy(pm);
+			pool_put(&pmap_pv_pool, pve);
+			mtx_enter(&pg->mdpage.pv_mtx);
+			continue;
+		}
+#endif /* NVMM > 0 */
+
 		/* XXX use direct map? */
 		scr3 = pmap_map_ptes(pm);	/* locks pmap */
 		shootself = (scr3 == 0);
@@ -2439,29 +2473,36 @@ void
 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
 {
 	vaddr_t v;
+	struct pv_entry *pve, *free_pvs = NULL;
 
 	mtx_enter(&pmap->pm_mtx);
 
 	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
 	    (uint64_t)egpa);
 	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
-		pmap_do_remove_ept(pmap, v);
+		pmap_do_remove_ept(pmap, v, &free_pvs);
 
 	pmap_shootept(pmap, 1);
 
 	mtx_leave(&pmap->pm_mtx);
-
 	pmap_tlb_shootwait();
+
+	while ((pve = free_pvs) != NULL) {
+		free_pvs = pve->pv_next;
+		pool_put(&pmap_pv_pool, pve);
+	}
 }
 
 void
-pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
+pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa, struct pv_entry **free_pvs)
 {
 	uint64_t l4idx, l3idx, l2idx, l1idx;
-	struct vm_page *pg3, *pg2, *pg1;
+	struct vm_page *pg, *pg3, *pg2, *pg1;
+	struct pv_entry *pve;
 	paddr_t npa3, npa2, npa1;
 	pd_entry_t *pd4, *pd3, *pd2, *pd1;
 	pd_entry_t *pptes;
+	pt_entry_t opte;
 
 	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
 
@@ -2497,10 +2538,23 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
 	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
 	pg1 = PHYS_TO_VM_PAGE(npa1);
 
-	if (pd1[l1idx] == 0)
+	opte = pmap_pte_set(&pd1[l1idx], 0);
+	if (opte == 0)	/* XXX check using an EPT valid bit. */
 		return;
 
-	pd1[l1idx] = 0;
+	if (opte & PG_PVLIST) {
+		pg = PHYS_TO_VM_PAGE(opte & pg_frame);
+		if (pg == NULL)
+			panic("%s: PG_PVLIST mapping with unmanaged page: gpa "
+			    "0x%lx, opte 0x%llx, hpa 0x%llx", __func__, gpa,
+			    opte, opte & pg_frame);
+		pve = pmap_remove_pv(pg, pmap, gpa);
+		if (pve != NULL && free_pvs != NULL) {
+			pve->pv_next = *free_pvs;
+			*free_pvs = pve;
+		}
+	}
+
 	pg1->wire_count--;
 	pmap->pm_stats.resident_count--;
 
@@ -2535,11 +2589,14 @@
 }
 
 int
-pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
+pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot,
+    int flags)
 {
 	uint64_t l4idx, l3idx, l2idx, l1idx;
-	pd_entry_t *pd, npte;
-	struct vm_page *ptp, *pptp;
+	pd_entry_t *pd;
+	pt_entry_t opte, npte;
+	struct vm_page *pg, *ptp, *pptp;
+	struct pv_entry *pve;
 	paddr_t npa;
 	struct uvm_object *obj;
 	int ret = 0;
@@ -2552,6 +2609,13 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
 
+	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
+	if (pve == NULL) {
+		if (flags & PMAP_CANFAIL)
+			return ENOMEM;
+		panic("%s: no pv entries available", __func__);
+	}
+
 	mtx_enter(&pmap->pm_mtx);
 
 	/* Start at PML4 / top level */
@@ -2699,11 +2763,21 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 	if (prot & PROT_EXEC)
 		npte |= EPT_X;
 
-	if (pd[l1idx] == 0) {
+	opte = pd[l1idx];	/* XXX check an EPT valid bit */
+	if (opte == 0) {
 		ptp->wire_count++;
 		pmap->pm_stats.resident_count++;
+
+		pg = PHYS_TO_VM_PAGE(hpa);
+		if (pg != NULL) {
+			pmap_enter_pv(pg, pve, pmap, gpa, ptp);
+			npte |= PG_PVLIST;
+			pve = NULL;
+		} else
+			panic("%s: cannot find vm page", __func__);
 	} else {
-		/* XXX flush ept */
+		pmap_shootept(pmap, 1);
+		pmap_tlb_shootwait();
 	}
 
 	pd[l1idx] = npte;
@@ -2711,6 +2785,9 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 unlock:
 	mtx_leave(&pmap->pm_mtx);
 
+	if (pve != NULL)
+		pool_put(&pmap_pv_pool, pve);
+
 	return ret;
 }
 #endif /* NVMM > 0 */
@@ -2737,7 +2814,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa,
 
 #if NVMM > 0
 	if (pmap_is_ept(pmap))
-		return pmap_enter_ept(pmap, va, pa, prot);
+		return pmap_enter_ept(pmap, va, pa, prot, flags);
 #endif /* NVMM > 0 */
 
 	KASSERT(!(wc && nocache));
blob - 1f1bb8159498669fb74d00e8fc91f7a5691a90e4
blob + 22646ec8dbc70cd471fc6f0a46dfca39ef8ca418
--- sys/arch/amd64/include/pmap.h
+++ sys/arch/amd64/include/pmap.h
@@ -245,8 +245,8 @@
  */
 
 #define PG_W		PG_AVAIL1	/* "wired" mapping */
-#define PG_PVLIST	PG_AVAIL2	/* mapping has entry on pvlist */
-/* PG_AVAIL3 not used */
+/* PG_AVAIL2 not used */
+#define PG_PVLIST	PG_AVAIL3	/* mapping has entry on pvlist */
 
 /*
  * PCID assignments.
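A note on the free_pvs plumbing in pmap_remove_ept(), in case it looks
odd: pv_entry's unlinked while pm_mtx is held are not pool_put() right
away; they're chained on a local list and only returned to the pool
once the mutex has been dropped, presumably to avoid calling into the
pool allocator with the pmap mutex held. The pattern in isolation
(simplified sketch, not a verbatim excerpt of the diff):

	struct pv_entry *pve, *free_pvs = NULL;

	mtx_enter(&pmap->pm_mtx);
	/* The callee chains any unlinked pv_entry onto free_pvs. */
	pmap_do_remove_ept(pmap, gpa, &free_pvs);
	mtx_leave(&pmap->pm_mtx);

	/* Now it's safe to hand the entries back to the pool. */
	while ((pve = free_pvs) != NULL) {
		free_pvs = pve->pv_next;
		pool_put(&pmap_pv_pool, pve);
	}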