From: Dave Voutila <dv@sisu.io>
Subject: Re: teach EPT pmaps how to use pv_lists
To: tech@openbsd.org
Cc: Mike Larkin <mlarkin@openbsd.org>, Martin Pieuchot <mpi@openbsd.org>, Mischa <mischa@openbsd.amsterdam>
Date: Sun, 08 Dec 2024 12:45:42 -0500

Dave Voutila <dv@sisu.io> writes:

Updated diff below with changes suggested by mlarkin@. Testers and oks
needed :D

> Looking for some testing (of vmm/vmd on Intel hosts) and review of the
> following diff.
>
> In short, the objective is to incrementally bring EPT pmap operations
> more in line with regular amd64 and AMD RVI pmaps. This diff:
>
> - updates pmap_enter_ept() to add pv_entry's when inserting a pte in an
>   EPT pmap
>
> - updates pmap_do_remove_ept() to support removing pv_entry's when
>   removing a pte from an EPT pmap
>
> - updates pmap_page_remove() to support zapping an EPT pte if a given
>   UVM vm_page is being removed and has a pv_entry pointing to an EPT
>   pte (the pv_entry layout is sketched below)
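>
> For reference, here is the shape of the pv_entry involved, paraphrased
> from sys/arch/amd64/include/pmap.h (field comments are mine). Each
> managed vm_page keeps a singly-linked list of these, one per (pmap,
> va) pair mapping it, which is what lets pmap_page_remove() find and
> zap every mapping of a page:
>
> 	struct pv_entry {
> 		struct pv_entry *pv_next;	/* next mapping of this page */
> 		struct pmap *pv_pmap;		/* pmap owning this mapping */
> 		vaddr_t pv_va;			/* the va (for EPT: the gpa) */
> 		struct vm_page *pv_ptp;		/* vm_page of the PTP */
> 	};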
>
> For anyone reviewing:
>
> - I needed to change the PG_PVLIST bit so it would still be valid in
>   an EPT pte, so I swapped it to PG_AVAIL3, which looks available for
>   use in EPT ptes (see the bit sketch after this list)
>
> - Validity checks on EPT ptes are not currently performed. For
>   correctness, one cannot just reuse the PG_V bit: EPT ptes are special
>   and support true X-only permissions (in addition to breaking out R
>   from W). That should be a different diff IMO (a possible check is
>   sketched after this list).
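>
> For the bit swap, a sketch of the software-available pte bits in
> question (positions per pmap.h; the EPT meanings are my reading of the
> Intel SDM, so treat them as an assumption to double-check):
>
> 	/* bits 9-11, ignored by the MMU in regular amd64 ptes */
> 	#define PG_AVAIL1	0x0000000000000200UL	/* bit 9: PG_W */
> 	#define PG_AVAIL2	0x0000000000000400UL	/* bit 10: user-execute
> 							 * in an EPT pte when
> 							 * MBEC is enabled */
> 	#define PG_AVAIL3	0x0000000000000800UL	/* bit 11: ignored in
> 							 * EPT ptes too, so
> 							 * safe for PG_PVLIST */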
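>
> On the validity point: since EPT has no single PG_V bit, a later diff
> would presumably treat a pte as present iff it grants any access. A
> minimal sketch using the EPT permission bits (the helper name is
> hypothetical):
>
> 	#define EPT_R	(1ULL << 0)	/* read */
> 	#define EPT_W	(1ULL << 1)	/* write */
> 	#define EPT_X	(1ULL << 2)	/* execute */
>
> 	/* an EPT pte maps something iff at least one of R/W/X is set */
> 	#define ept_pte_valid(pte) (((pte) & (EPT_R | EPT_W | EPT_X)) != 0)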
>
> I'm not a big fan of throwing more junk directly into
> pmap_page_remove(), but given how it's designed it feels like the
> sanest place to put the logic for now (locking outline below).
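>
> The pmap_page_remove() hunk follows the lock dance already used there:
> take a reference on the pmap under pv_mtx, drop pv_mtx, take pm_mtx,
> then re-take pv_mtx and re-check the list head before detaching, since
> another thread may have raced us in the window. In outline (simplified
> from the diff below, not a verbatim excerpt):
>
> 	mtx_enter(&pg->mdpage.pv_mtx);
> 	while ((pve = pg->mdpage.pv_list) != NULL) {
> 		pmap_reference(pve->pv_pmap);	/* pin the pmap */
> 		pm = pve->pv_pmap;
> 		mtx_leave(&pg->mdpage.pv_mtx);
>
> 		mtx_enter(&pm->pm_mtx);		/* pmap lock first... */
> 		mtx_enter(&pg->mdpage.pv_mtx);	/* ...then pvlist lock */
> 		if ((pve = pg->mdpage.pv_list) == NULL ||
> 		    pve->pv_pmap != pm) {
> 			/* raced: drop locks, retry with the new head */
> 			mtx_leave(&pg->mdpage.pv_mtx);
> 			mtx_leave(&pm->pm_mtx);
> 			pmap_destroy(pm);
> 			mtx_enter(&pg->mdpage.pv_mtx);
> 			continue;
> 		}
> 		/* safe to detach pve and zap the pte */
> 	}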
>
>

diffstat refs/heads/master refs/heads/ept-pvlist-3
 M  sys/arch/amd64/amd64/pmap.c    |  91+  14-
 M  sys/arch/amd64/include/pmap.h  |   2+   2-

2 files changed, 93 insertions(+), 16 deletions(-)

diff refs/heads/master refs/heads/ept-pvlist-3
commit - e2c061ecb9e807859d3525b9e868ae5d227b3878
commit + 4791f1f2f408c10c66cde3d074c062c2185f2a2a
blob - bfd667f99bf2dec1a468ca3d8d4f95bf34401026
blob + 01b8f6c2915c26427a9a1e8449fbaee452e5b17b
--- sys/arch/amd64/amd64/pmap.c
+++ sys/arch/amd64/amd64/pmap.c
@@ -336,8 +336,8 @@ struct pv_entry *pmap_remove_pv(struct vm_page *, stru
 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
 #if NVMM > 0
 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
-void pmap_do_remove_ept(struct pmap *, vaddr_t);
-int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
+void pmap_do_remove_ept(struct pmap *, vaddr_t, struct pv_entry **);
+int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t, int);
 void pmap_shootept(struct pmap *, int);
 #endif /* NVMM > 0 */
 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
@@ -1970,6 +1970,40 @@ pmap_page_remove(struct vm_page *pg)
 		pm = pve->pv_pmap;
 		mtx_leave(&pg->mdpage.pv_mtx);
 
+#if NVMM > 0
+		if (pm->pm_type == PMAP_TYPE_EPT) {
+			mtx_enter(&pm->pm_mtx);
+
+			/*
+			 * See note below about dropping pvlist lock.
+			 */
+			mtx_enter(&pg->mdpage.pv_mtx);
+			if ((pve = pg->mdpage.pv_list) == NULL ||
+			    pve->pv_pmap != pm) {
+				mtx_leave(&pg->mdpage.pv_mtx);
+				mtx_leave(&pm->pm_mtx);
+				pmap_destroy(pm);
+				mtx_enter(&pg->mdpage.pv_mtx);
+				continue;
+			}
+
+			/* Detach the pv_entry from the pv_list. */
+			pg->mdpage.pv_list = pve->pv_next;
+			mtx_leave(&pg->mdpage.pv_mtx);
+
+			/* Zap it. */
+			pmap_do_remove_ept(pm, pve->pv_va, NULL);
+			pmap_shootept(pm, 1);
+
+			/* Clean up before next iteration. */
+			mtx_leave(&pm->pm_mtx);
+			pmap_destroy(pm);
+			pool_put(&pmap_pv_pool, pve);
+			mtx_enter(&pg->mdpage.pv_mtx);
+			continue;
+		}
+#endif /* NVMM > 0 */
+
 		/* XXX use direct map? */
 		scr3 = pmap_map_ptes(pm);	/* locks pmap */
 		shootself = (scr3 == 0);
@@ -2439,29 +2473,36 @@ void
 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
 {
 	vaddr_t v;
+	struct pv_entry *pve, *free_pvs = NULL;
 
 	mtx_enter(&pmap->pm_mtx);
 
 	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
 	    (uint64_t)egpa);
 	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
-		pmap_do_remove_ept(pmap, v);
+		pmap_do_remove_ept(pmap, v, &free_pvs);
 
 	pmap_shootept(pmap, 1);
 
 	mtx_leave(&pmap->pm_mtx);
-
 	pmap_tlb_shootwait();
+
+	while ((pve = free_pvs) != NULL) {
+		free_pvs = pve->pv_next;
+		pool_put(&pmap_pv_pool, pve);
+	}
 }
 
 void
-pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
+pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa, struct pv_entry **free_pvs)
 {
 	uint64_t l4idx, l3idx, l2idx, l1idx;
-	struct vm_page *pg3, *pg2, *pg1;
+	struct vm_page *pg, *pg3, *pg2, *pg1;
+	struct pv_entry *pve;
 	paddr_t npa3, npa2, npa1;
 	pd_entry_t *pd4, *pd3, *pd2, *pd1;
 	pd_entry_t *pptes;
+	pt_entry_t opte;
 
 	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
 
@@ -2497,10 +2538,23 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
 	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
 	pg1 = PHYS_TO_VM_PAGE(npa1);
 
-	if (pd1[l1idx] == 0)
+	opte = pmap_pte_set(&pd1[l1idx], 0);
+	if (opte == 0)	/* XXX check using an EPT valid bit. */
 		return;
 
-	pd1[l1idx] = 0;
+	if (opte & PG_PVLIST) {
+		pg = PHYS_TO_VM_PAGE(opte & pg_frame);
+		if (pg == NULL)
+			panic("%s: PG_PVLIST mapping with unmanaged page: gpa "
+			    "0x%lx, opte 0x%llx, hpa 0x%llx", __func__, gpa,
+			    opte, opte & pg_frame);
+		pve = pmap_remove_pv(pg, pmap, gpa);
+		if (pve != NULL && free_pvs != NULL) {
+			pve->pv_next = *free_pvs;
+			*free_pvs = pve;
+		}
+	}
+
 	pg1->wire_count--;
 	pmap->pm_stats.resident_count--;
 
@@ -2535,11 +2589,14 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
 }
 
 int
-pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
+pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot,
+    int flags)
 {
 	uint64_t l4idx, l3idx, l2idx, l1idx;
-	pd_entry_t *pd, npte;
-	struct vm_page *ptp, *pptp;
+	pd_entry_t *pd;
+	pt_entry_t opte, npte;
+	struct vm_page *pg, *ptp, *pptp;
+	struct pv_entry *pve;
 	paddr_t npa;
 	struct uvm_object *obj;
 	int ret = 0;
@@ -2552,6 +2609,13 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
 	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
 
+	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
+	if (pve == NULL) {
+		if (flags & PMAP_CANFAIL)
+			return ENOMEM;
+		panic("%s: no pv entries available", __func__);
+	}
+
 	mtx_enter(&pmap->pm_mtx);
 
 	/* Start at PML4 / top level */
@@ -2699,11 +2763,21 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 	if (prot & PROT_EXEC)
 		npte |= EPT_X;
 
-	if (pd[l1idx] == 0) {
+	opte = pd[l1idx];	/* XXX check an EPT valid bit */
+	if (opte == 0) {
 		ptp->wire_count++;
 		pmap->pm_stats.resident_count++;
+
+		pg = PHYS_TO_VM_PAGE(hpa);
+		if (pg != NULL) {
+			pmap_enter_pv(pg, pve, pmap, gpa, ptp);
+			npte |= PG_PVLIST;
+			pve = NULL;
+		} else
+			panic("%s: cannot find vm page", __func__);
 	} else {
-		/* XXX flush ept */
+		pmap_shootept(pmap, 1);
+		pmap_tlb_shootwait();
 	}
 
 	pd[l1idx] = npte;
@@ -2711,6 +2785,9 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
 unlock:
 	mtx_leave(&pmap->pm_mtx);
 
+	if (pve != NULL)
+		pool_put(&pmap_pv_pool, pve);
+
 	return ret;
 }
 #endif /* NVMM > 0 */
@@ -2737,7 +2814,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, 
 
 #if NVMM > 0
 	if (pmap_is_ept(pmap))
-		return pmap_enter_ept(pmap, va, pa, prot);
+		return pmap_enter_ept(pmap, va, pa, prot, flags);
 #endif /* NVMM > 0 */
 
 	KASSERT(!(wc && nocache));
blob - 1f1bb8159498669fb74d00e8fc91f7a5691a90e4
blob + 22646ec8dbc70cd471fc6f0a46dfca39ef8ca418
--- sys/arch/amd64/include/pmap.h
+++ sys/arch/amd64/include/pmap.h
@@ -245,8 +245,8 @@
  */
 
 #define PG_W		PG_AVAIL1	/* "wired" mapping */
-#define PG_PVLIST	PG_AVAIL2	/* mapping has entry on pvlist */
-/* PG_AVAIL3 not used */
+/* PG_AVAIL2 not used */
+#define PG_PVLIST	PG_AVAIL3	/* mapping has entry on pvlist */
 
 /*
  * PCID assignments.