From: Dave Voutila
Subject: Re: teach EPT pmaps how to use pv_lists
To: tech@openbsd.org
Cc: Mike Larkin
Date: Sun, 13 Oct 2024 10:21:09 -0400

Dave Voutila writes:

> Looking for some testing (of vmm/vmd on Intel hosts) and review of the
> following diff.
>

Still looking for a test or two before I push for a review. (Though a
review would be nice as well :)

> In short, the objective is to incrementally bring EPT pmap operations
> more in line with regular amd64 and AMD RVI pmaps. This diff:
>
> - updates pmap_enter_ept() to add pv_entry's when inserting a pte in
>   an EPT pmap
>
> - updates pmap_do_remove_ept() to support removing pv_entry's when
>   removing a pte from an EPT pmap
>
> - updates pmap_page_remove() to support zapping an EPT pte if a given
>   UVM vm_page is being removed and has a pv_entry pointing to an EPT
>   pte
>
> For anyone reviewing:
>
> - I needed to change the PG_PVLIST bit so it would still be valid in
>   an EPT pte, so I swapped it for PG_AVAIL3, which looks available for
>   use in EPT ptes.
>
> - Validity checks on EPT ptes are not currently performed. For
>   correctness, one cannot just reuse the PG_V bit, as EPT ptes are
>   special and support true X-only permissions (in addition to breaking
>   out R from W). That should be a different diff IMO.
>
> I'm not a big fan of throwing more junk directly into
> pmap_page_remove(), but given how it's designed it feels like the
> sanest place to put the logic for now.
>
>
> diffstat refs/heads/master refs/heads/ept-pvlist-3
>  M  sys/arch/amd64/amd64/pmap.c    |  91+  14-
>  M  sys/arch/amd64/include/pmap.h  |   2+   2-
>
> 2 files changed, 93 insertions(+), 16 deletions(-)
>
> diff refs/heads/master refs/heads/ept-pvlist-3
> commit - 3c9ccb50785e1c693c25fd18c467a46499c73493
> commit + 1fd874b9a12aadcaf92afdc5c9caa94bb5970c27
> blob - 35af9d10cdc6cb79f6288b964bc350a46bfce8d6
> blob + 4e843b19983e2a3c277c8cc9aea8165746c80aea
> --- sys/arch/amd64/amd64/pmap.c
> +++ sys/arch/amd64/amd64/pmap.c
> @@ -336,8 +336,8 @@ struct pv_entry *pmap_remove_pv(struct vm_page *, stru
>  void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
>  #if NVMM > 0
>  void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
> -void pmap_do_remove_ept(struct pmap *, vaddr_t);
> -int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
> +void pmap_do_remove_ept(struct pmap *, vaddr_t, struct pv_entry **);
> +int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t, int);
>  void pmap_shootept(struct pmap *, int);
>  #endif /* NVMM > 0 */
>  int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
> @@ -1970,6 +1970,40 @@ pmap_page_remove(struct vm_page *pg)
>  		pm = pve->pv_pmap;
>  		mtx_leave(&pg->mdpage.pv_mtx);
>
> +#if NVMM > 0
> +		if (pm->pm_type == PMAP_TYPE_EPT) {
> +			mtx_enter(&pm->pm_mtx);
> +
> +			/*
> +			 * See note below about dropping pvlist lock.
> +			 */
> +			mtx_enter(&pg->mdpage.pv_mtx);
> +			if ((pve = pg->mdpage.pv_list) == NULL ||
> +			    pve->pv_pmap != pm) {
> +				mtx_leave(&pg->mdpage.pv_mtx);
> +				mtx_leave(&pm->pm_mtx);
> +				pmap_destroy(pm);
> +				mtx_enter(&pg->mdpage.pv_mtx);
> +				continue;
> +			}
> +
> +			/* Detach the pv_entry from the pv_list. */
> +			pg->mdpage.pv_list = pve->pv_next;
> +			mtx_leave(&pg->mdpage.pv_mtx);
> +
> +			/* Zap it. */
> +			pmap_do_remove_ept(pm, pve->pv_va, NULL);
> +			pmap_shootept(pm, 1);
> +
> +			/* Clean up before next iteration. */
> +			mtx_leave(&pm->pm_mtx);
> +			pmap_destroy(pm);
> +			pool_put(&pmap_pv_pool, pve);
> +			mtx_enter(&pg->mdpage.pv_mtx);
> +			continue;
> +		}
> +#endif /* NVMM > 0 */
> +
>  		/* XXX use direct map? */
>  		scr3 = pmap_map_ptes(pm);	/* locks pmap */
>  		shootself = (scr3 == 0);
>
> @@ -2460,29 +2494,36 @@ void
>  pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
>  {
>  	vaddr_t v;
> +	struct pv_entry *pve, *free_pvs = NULL;
>
>  	mtx_enter(&pmap->pm_mtx);
>
>  	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
>  	    (uint64_t)egpa);
>  	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
> -		pmap_do_remove_ept(pmap, v);
> +		pmap_do_remove_ept(pmap, v, &free_pvs);
>
>  	pmap_shootept(pmap, 1);
>
>  	mtx_leave(&pmap->pm_mtx);
> -
>  	pmap_tlb_shootwait();
> +
> +	while ((pve = free_pvs) != NULL) {
> +		free_pvs = pve->pv_next;
> +		pool_put(&pmap_pv_pool, pve);
> +	}
>  }
>
>  void
> -pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
> +pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa, struct pv_entry **free_pvs)
>  {
>  	uint64_t l4idx, l3idx, l2idx, l1idx;
> -	struct vm_page *pg3, *pg2, *pg1;
> +	struct vm_page *pg, *pg3, *pg2, *pg1;
> +	struct pv_entry *pve;
>  	paddr_t npa3, npa2, npa1;
>  	pd_entry_t *pd4, *pd3, *pd2, *pd1;
>  	pd_entry_t *pptes;
> +	pt_entry_t opte;
>
>  	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
>
> @@ -2518,10 +2559,23 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
>  	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
>  	pg1 = PHYS_TO_VM_PAGE(npa1);
>
> -	if (pd1[l1idx] == 0)
> +	opte = pmap_pte_set(&pd1[l1idx], 0);
> +	if (opte == 0)	/* XXX check using an EPT valid bit. */
>  		return;
>
> -	pd1[l1idx] = 0;
> +	if (opte & PG_PVLIST) {
> +		pg = PHYS_TO_VM_PAGE(opte & pg_frame);
> +		if (pg == NULL)
> +			panic("%s: PG_PVLIST mapping with unmanaged page: gpa "
> +			    "0x%lx, opte 0x%llx, hpa 0x%llx", __func__, gpa,
> +			    opte, opte & pg_frame);
> +		pve = pmap_remove_pv(pg, pmap, gpa);
> +		if (pve != NULL && free_pvs != NULL) {
> +			pve->pv_next = *free_pvs;
> +			*free_pvs = pve;
> +		}
> +	}
> +
>  	pg1->wire_count--;
>  	pmap->pm_stats.resident_count--;
>
> @@ -2556,11 +2610,14 @@
>  }
>
>  int
> -pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
> +pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot,
> +    int flags)
>  {
>  	uint64_t l4idx, l3idx, l2idx, l1idx;
> -	pd_entry_t *pd, npte;
> -	struct vm_page *ptp, *pptp;
> +	pd_entry_t *pd;
> +	pt_entry_t opte, npte;
> +	struct vm_page *pg, *ptp, *pptp;
> +	struct pv_entry *pve;
>  	paddr_t npa;
>  	struct uvm_object *obj;
>  	int ret = 0;
> @@ -2573,6 +2630,13 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
>  	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
>  	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
>
> +	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
> +	if (pve == NULL) {
> +		if (flags & PMAP_CANFAIL)
> +			return ENOMEM;
> +		panic("%s: no pv entries available", __func__);
> +	}
> +
>  	mtx_enter(&pmap->pm_mtx);
>
>  	/* Start at PML4 / top level */
> @@ -2720,11 +2784,21 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
>  	if (prot & PROT_EXEC)
>  		npte |= EPT_X;
>
> -	if (pd[l1idx] == 0) {
> +	opte = pd[l1idx];	/* XXX check an EPT valid bit */
> +	if (opte == 0) {
>  		ptp->wire_count++;
>  		pmap->pm_stats.resident_count++;
> +
> +		pg = PHYS_TO_VM_PAGE(hpa);
> +		if (pg != NULL) {
> +			pmap_enter_pv(pg, pve, pmap, gpa, ptp);
> +			npte |= PG_PVLIST;
> +			pve = NULL;
> +		} else
> +			panic("%s: cannot find vm page", __func__);
>  	} else {
> -		/* XXX flush ept */
> +		pmap_shootept(pmap, 1);
> +		pmap_tlb_shootwait();
>  	}
>
>  	pd[l1idx] = npte;
> @@ -2732,6 +2806,9 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
>  unlock:
>  	mtx_leave(&pmap->pm_mtx);
>
> +	if (pve != NULL)
> +		pool_put(&pmap_pv_pool, pve);
> +
>  	return ret;
>  }
>  #endif /* NVMM > 0 */
> @@ -2758,7 +2835,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa,
>
>  #if NVMM > 0
>  	if (pmap_is_ept(pmap))
> -		return pmap_enter_ept(pmap, va, pa, prot);
> +		return pmap_enter_ept(pmap, va, pa, prot, flags);
>  #endif /* NVMM > 0 */
>
>  	KASSERT(!(wc && nocache));
> blob - cc358eab1e93376f4d0b511d0c0bcd06aa476b40
> blob + 6eeab997123e6d10a9dddde3f55bf25950f43978
> --- sys/arch/amd64/include/pmap.h
> +++ sys/arch/amd64/include/pmap.h
> @@ -245,8 +245,8 @@
>   */
>
>  #define PG_W		PG_AVAIL1	/* "wired" mapping */
> -#define PG_PVLIST	PG_AVAIL2	/* mapping has entry on pvlist */
> -/* PG_AVAIL3 not used */
> +/* PG_AVAIL2 not used */
> +#define PG_PVLIST	PG_AVAIL3	/* mapping has entry on pvlist */
>
>  /*
>   * PCID assignments.
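
As an aside, for the "XXX check using an EPT valid bit" markers above:
what I have in mind for the follow-up diff is roughly the sketch below
(untested; the helper name is made up, and it assumes the existing
EPT_R/EPT_W/EPT_X bit definitions in pmap.h). EPT ptes have no
dedicated valid bit, so "present" has to mean "any of the R/W/X
permission bits set", which keeps true X-only mappings working in a
way that reusing PG_V would not:

	/*
	 * Sketch only: an EPT pte is present iff any of its R/W/X
	 * permission bits are set.
	 */
	static inline int
	pmap_ept_pte_present(pt_entry_t pte)
	{
		return (pte & (EPT_R | EPT_W | EPT_X)) != 0;
	}

The opte == 0 tests in pmap_do_remove_ept() and pmap_enter_ept() would
then become !pmap_ept_pte_present(opte).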