From: Dave Voutila <dv@sisu.io>
Subject: Re: teach EPT pmaps how to use pv_lists
To: tech@openbsd.org
Cc: Mike Larkin <mlarkin@openbsd.org>
Date: Sun, 13 Oct 2024 10:21:09 -0400

Dave Voutila <dv@sisu.io> writes:

> Looking for some testing (of vmm/vmd on Intel hosts) and review of the
> following diff.
>

Still looking for a test or two before I push for a review. (Though a
review would be nice as well :)
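
If you want to poke at it: on an Intel host, simply booting a guest and
tearing it down again should exercise all three paths touched below,
e.g. something like

  $ doas vmctl start -c -m 1G -b /bsd test
  $ doas vmctl stop test

(That invocation is just an example; nothing about the particular flags
matters for this diff.)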

> In short, the objective is to incrementally bring EPT pmap operations
> more in line with regular amd64 and AMD RVI pmaps. This diff:
>
> - updates pmap_enter_ept() to add pv_entry's when inserting a pte in an
>   EPT pmap
>
> - updates pmap_do_remove_ept() to support removing pv_entry's when
>   removing a pte from an EPT pmap
>
> - updates pmap_page_remove() to support zapping an EPT pte if a given
>   UVM vm_page is being removed and has a pv_entry pointing to an EPT pte
>
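> As a refresher for reviewers, a pv_entry is the little record that
> lets us walk from a vm_page back to each (pmap, va) mapping it, which
> is exactly what the new EPT code in pmap_page_remove() needs. Roughly
> (field names as used in this diff; see the real definition in
> sys/arch/amd64/include/pmap.h):
>
>	struct pv_entry {
>		struct pv_entry *pv_next;	/* next entry on the page's list */
>		struct pmap *pv_pmap;		/* pmap this mapping belongs to */
>		vaddr_t pv_va;			/* va of the mapping (a gpa here) */
>		struct vm_page *pv_ptp;		/* vm_page of the page table page */
>	};
>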
> For anyone reviewing:
>
> - I needed to change the PG_PVLIST bit so it would still be valid in an
>   EPT pte, so I swapped it for PG_AVAIL3, which looks available for use
>   in EPT ptes (more on the bit choice below)
>
> - Validity checks on EPT ptes are not currently performed. For
>   correctness, one cannot just use the same PG_V bit as EPT ptes are
>   special and support true X-only permissions (in addition to breaking
>   out R from W). That should be a different diff IMO.
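>
> (More on the bit choice: PG_AVAIL1/2/3 are pte bits 9/10/11, which the
> regular MMU ignores, but EPT repurposes two of them. As I read the
> SDM, only bit 11 is ignored in every configuration:
>
>	PG_AVAIL1	0x200	/* bit 9: EPT dirty flag, w/ A/D enabled */
>	PG_AVAIL2	0x400	/* bit 10: EPT user-exec, w/ MBEC */
>	PG_AVAIL3	0x800	/* bit 11: ignored by EPT */
>
> The values are from pmap.h; the EPT annotations are my reading of the
> SDM, so double-checking is welcome.)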
>
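> To make that follow-up concrete, here's a sketch of what such a check
> could look like, reusing the EPT_R/EPT_W/EPT_X permission bits that
> pmap_enter_ept() already builds nptes from. Illustrative only, and it
> ignores mode-based execute control; pmap_ept_pte_valid() is not in the
> tree:
>
>	static inline int
>	pmap_ept_pte_valid(pt_entry_t pte)
>	{
>		/* An EPT pte is present iff any of R/W/X is set. */
>		return (pte & (EPT_R | EPT_W | EPT_X)) != 0;
>	}
>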
> I'm not a big fan of throwing more junk directly into
> pmap_page_remove(), but given how it's designed, it feels like the
> sanest place to put the logic for now.
>
>
> diffstat refs/heads/master refs/heads/ept-pvlist-3
>  M  sys/arch/amd64/amd64/pmap.c    |  91+  14-
>  M  sys/arch/amd64/include/pmap.h  |   2+   2-
>
> 2 files changed, 93 insertions(+), 16 deletions(-)
>
> diff refs/heads/master refs/heads/ept-pvlist-3
> commit - 3c9ccb50785e1c693c25fd18c467a46499c73493
> commit + 1fd874b9a12aadcaf92afdc5c9caa94bb5970c27
> blob - 35af9d10cdc6cb79f6288b964bc350a46bfce8d6
> blob + 4e843b19983e2a3c277c8cc9aea8165746c80aea
> --- sys/arch/amd64/amd64/pmap.c
> +++ sys/arch/amd64/amd64/pmap.c
> @@ -336,8 +336,8 @@ struct pv_entry *pmap_remove_pv(struct vm_page *, stru
>  void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
>  #if NVMM > 0
>  void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t);
> -void pmap_do_remove_ept(struct pmap *, vaddr_t);
> -int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t);
> +void pmap_do_remove_ept(struct pmap *, vaddr_t, struct pv_entry **);
> +int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t, int);
>  void pmap_shootept(struct pmap *, int);
>  #endif /* NVMM > 0 */
>  int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
> @@ -1970,6 +1970,40 @@ pmap_page_remove(struct vm_page *pg)
>  		pm = pve->pv_pmap;
>  		mtx_leave(&pg->mdpage.pv_mtx);
>
> +#if NVMM > 0
> +		if (pm->pm_type == PMAP_TYPE_EPT) {
> +			mtx_enter(&pm->pm_mtx);
> +
> +			/*
> +			 * See note below about dropping pvlist lock.
> +			 */
> +			mtx_enter(&pg->mdpage.pv_mtx);
> +			if ((pve = pg->mdpage.pv_list) == NULL ||
> +			    pve->pv_pmap != pm) {
> +				mtx_leave(&pg->mdpage.pv_mtx);
> +				mtx_leave(&pm->pm_mtx);
> +				pmap_destroy(pm);
> +				mtx_enter(&pg->mdpage.pv_mtx);
> +				continue;	/* raced; retry */
> +			}
> +
> +			/* Detach the pv_entry from the pv_list. */
> +			pg->mdpage.pv_list = pve->pv_next;
> +			mtx_leave(&pg->mdpage.pv_mtx);
> +
> +			/* Zap it. */
> +			pmap_do_remove_ept(pm, pve->pv_va, NULL);
> +			pmap_shootept(pm, 1);
> +
> +			/* Clean up before next iteration. */
> +			mtx_leave(&pm->pm_mtx);
> +			pmap_destroy(pm);
> +			pool_put(&pmap_pv_pool, pve);
> +			mtx_enter(&pg->mdpage.pv_mtx);
> +			continue;
> +		}
> +#endif /* NVMM > 0 */
> +
>  		/* XXX use direct map? */
>  		scr3 = pmap_map_ptes(pm);	/* locks pmap */
>  		shootself = (scr3 == 0);
> @@ -2460,29 +2493,36 @@ void
>  pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa)
>  {
>  	vaddr_t v;
> +	struct pv_entry *pve, *free_pvs = NULL;
>
>  	mtx_enter(&pmap->pm_mtx);
>
>  	DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa,
>  	    (uint64_t)egpa);
>  	for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE)
> -		pmap_do_remove_ept(pmap, v);
> +		pmap_do_remove_ept(pmap, v, &free_pvs);
>
>  	pmap_shootept(pmap, 1);
>
>  	mtx_leave(&pmap->pm_mtx);
> -
>  	pmap_tlb_shootwait();
> +
> +	while ((pve = free_pvs) != NULL) {
> +		free_pvs = pve->pv_next;
> +		pool_put(&pmap_pv_pool, pve);
> +	}
>  }
>
>  void
> -pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
> +pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa, struct pv_entry **free_pvs)
>  {
>  	uint64_t l4idx, l3idx, l2idx, l1idx;
> -	struct vm_page *pg3, *pg2, *pg1;
> +	struct vm_page *pg, *pg3, *pg2, *pg1;
> +	struct pv_entry *pve;
>  	paddr_t npa3, npa2, npa1;
>  	pd_entry_t *pd4, *pd3, *pd2, *pd1;
>  	pd_entry_t *pptes;
> +	pt_entry_t opte;
>
>  	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
>
> @@ -2518,10 +2558,23 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
>  	pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1);
>  	pg1 = PHYS_TO_VM_PAGE(npa1);
>
> -	if (pd1[l1idx] == 0)
> +	opte = pmap_pte_set(&pd1[l1idx], 0);
> +	if (opte == 0)	/* XXX check using an EPT valid bit. */
>  		return;
>
> -	pd1[l1idx] = 0;
> +	if (opte & PG_PVLIST) {
> +		pg = PHYS_TO_VM_PAGE(opte & pg_frame);
> +		if (pg == NULL)
> +			panic("%s: PG_PVLIST mapping with unmanaged page: gpa "
> +			    "0x%lx, opte 0x%llx, hpa 0x%llx", __func__, gpa,
> +			    opte, opte & pg_frame);
> +		pve = pmap_remove_pv(pg, pmap, gpa);
> +		if (pve != NULL && free_pvs != NULL) {
> +			pve->pv_next = *free_pvs;
> +			*free_pvs = pve;
> +		}
> +	}
> +
>  	pg1->wire_count--;
>  	pmap->pm_stats.resident_count--;
>
> @@ -2556,11 +2609,14 @@ pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa)
>  }
>
>  int
> -pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot)
> +pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot,
> +    int flags)
>  {
>  	uint64_t l4idx, l3idx, l2idx, l1idx;
> -	pd_entry_t *pd, npte;
> -	struct vm_page *ptp, *pptp;
> +	pd_entry_t *pd;
> +	pt_entry_t opte, npte;
> +	struct vm_page *pg, *ptp, *pptp;
> +	struct pv_entry *pve;
>  	paddr_t npa;
>  	struct uvm_object *obj;
>  	int ret = 0;
> @@ -2573,6 +2629,13 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
>  	l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */
>  	l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */
>
> +	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
> +	if (pve == NULL) {
> +		if (flags & PMAP_CANFAIL)
> +			return ENOMEM;
> +		panic("%s: no pv entries available", __func__);
> +	}
> +
>  	mtx_enter(&pmap->pm_mtx);
>
>  	/* Start at PML4 / top level */
> @@ -2720,11 +2783,21 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
>  	if (prot & PROT_EXEC)
>  		npte |= EPT_X;
>
> -	if (pd[l1idx] == 0) {
> +	opte = pd[l1idx];	/* XXX check an EPT valid bit */
> +	if (opte == 0) {
>  		ptp->wire_count++;
>  		pmap->pm_stats.resident_count++;
> +
> +		pg = PHYS_TO_VM_PAGE(hpa);
> +		if (pg != NULL) {
> +			pmap_enter_pv(pg, pve, pmap, gpa, ptp);
> +			npte |= PG_PVLIST;
> +			pve = NULL;
> +		} else
> +			panic("%s: cannot find vm page", __func__);
>  	} else {
> -		/* XXX flush ept */
> +		pmap_shootept(pmap, 1);
> +		pmap_tlb_shootwait();
>  	}
>
>  	pd[l1idx] = npte;
> @@ -2732,6 +2805,9 @@ pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t
>  unlock:
>  	mtx_leave(&pmap->pm_mtx);
>
> +	if (pve != NULL)
> +		pool_put(&pmap_pv_pool, pve);
> +
>  	return ret;
>  }
>  #endif /* NVMM > 0 */
> @@ -2758,7 +2834,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa,
>
>  #if NVMM > 0
>  	if (pmap_is_ept(pmap))
> -		return pmap_enter_ept(pmap, va, pa, prot);
> +		return pmap_enter_ept(pmap, va, pa, prot, flags);
>  #endif /* NVMM > 0 */
>
>  	KASSERT(!(wc && nocache));
> blob - cc358eab1e93376f4d0b511d0c0bcd06aa476b40
> blob + 6eeab997123e6d10a9dddde3f55bf25950f43978
> --- sys/arch/amd64/include/pmap.h
> +++ sys/arch/amd64/include/pmap.h
> @@ -245,8 +245,8 @@
>   */
>
>  #define PG_W		PG_AVAIL1	/* "wired" mapping */
> -#define PG_PVLIST	PG_AVAIL2	/* mapping has entry on pvlist */
> -/* PG_AVAIL3 not used */
> +/* PG_AVAIL2 not used */
> +#define PG_PVLIST	PG_AVAIL3	/* mapping has entry on pvlist */
>
>  /*
>   * PCID assignments.