From: Martin Pieuchot <mpi@openbsd.org>
Subject: Re: amd64 pmap & resident_count
To: Philip Guenther <guenther@gmail.com>
Cc: tech@openbsd.org
Date: Sat, 24 Aug 2024 15:23:15 +0200

On 23/08/24(Fri) 23:14, Philip Guenther wrote:
> On Fri, Aug 23, 2024 at 7:23 AM Martin Pieuchot <mpi@openbsd.org> wrote:
> >
> > On a 24-CPU amd64 machine, building a kernel with -j24 always results in
> > a corrupted "RES" `pmap_kernel()->pm_stats.resident_count' view in top(1).
> >
> > Modifications to this field are (mostly) serialized by the pmap's mutex.
> > Sadly pmap_enter() does not grab the mutex for the kernel's pmap.  The diff
> > below changes that and fixes the issue for me.  I believe it is easier and
> > cheaper than using atomics everywhere.
> >
> > ok?
> 
> I think I'm okay with this, but I think pmap_growkernel() needs to
> grab the kernel pmap lock around the pmap_alloc_level() call, as that
> calls pmap_get_physpage() which updates the resident_count.  Grabbing
> the (kernel) pmap lock should probably replace the "s = splhigh(); /*
> to be safe */" and corresponding splx(s) in pmap_growkernel(), which
> are presumably assuming kernel-lock currently.  Or maybe the lock
> should be held for the entire function?  Hmm

I agree.  Here's a more complete version.  It also properly initializes
the kernel pmap's mutex to IPL_VM, adds some asserts, and documents the
locking.

This has survived building clang with "make -j24" with a lot of forced
swapping.
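
For anyone skimming the thread, the corruption is a plain lost update:
resident_count++ is a read-modify-write, so two CPUs entering kernel
mappings at the same time can both load the same old value and one
increment disappears.  Below is a minimal userland sketch of the race
and of the two fixes weighed above (mutex vs. atomics); every name in
it is invented for the illustration, it is not the pmap code itself:

/* Toy model of the lost-update race on a shared counter. */
#include <pthread.h>

struct toy_pmap {
	pthread_mutex_t mtx;	/* stands in for pm_mtx */
	long resident_count;	/* stands in for pm_stats.resident_count */
};

/*
 * Racy: the ++ is a read-modify-write, so two threads can read the
 * same old value and one increment is lost, which is the corrupted
 * "RES" seen in top(1).
 */
void
enter_racy(struct toy_pmap *pm)
{
	pm->resident_count++;
}

/* What the diff below does: serialize the update with the pmap's mutex. */
void
enter_locked(struct toy_pmap *pm)
{
	pthread_mutex_lock(&pm->mtx);
	pm->resident_count++;
	pthread_mutex_unlock(&pm->mtx);
}

/* The alternative judged more expensive: an atomic add at every site. */
void
enter_atomic(struct toy_pmap *pm)
{
	__atomic_fetch_add(&pm->resident_count, 1, __ATOMIC_RELAXED);
}

The MUTEX_ASSERT_LOCKED() additions in the diff are the corresponding
checks that the mutex is indeed held where PTPs are added or freed.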

Index: arch/amd64/amd64/pmap.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
diff -u -p -r1.171 pmap.c
--- arch/amd64/amd64/pmap.c	8 Aug 2024 15:57:22 -0000	1.171
+++ arch/amd64/amd64/pmap.c	24 Aug 2024 11:35:36 -0000
@@ -416,10 +416,6 @@ pmap_map_ptes(struct pmap *pmap)
 
 	KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
 
-	/* the kernel's pmap is always accessible */
-	if (pmap == pmap_kernel())
-		return 0;
-
 	/*
 	 * Lock the target map before switching to its page tables to
 	 * guarantee other CPUs have finished changing the tables before
@@ -427,6 +423,10 @@ pmap_map_ptes(struct pmap *pmap)
 	 */
 	mtx_enter(&pmap->pm_mtx);
 
+	/* the kernel's pmap is always accessible */
+	if (pmap == pmap_kernel())
+		return 0;
+
 	cr3 = rcr3();
 	KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
 		(cr3 & CR3_PCID) == PCID_PROC);
@@ -443,8 +443,7 @@ pmap_map_ptes(struct pmap *pmap)
 void
 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
 {
-	if (pmap != pmap_kernel())
-		mtx_leave(&pmap->pm_mtx);
+	mtx_leave(&pmap->pm_mtx);
 
 	if (save_cr3 != 0)
 		lcr3(save_cr3);
@@ -706,8 +705,10 @@ pmap_bootstrap(paddr_t first_avail, padd
 	 * user pmaps the pm_obj contains the list of active PTPs.
 	 * the pm_obj currently does not have a pager.
 	 */
-
 	kpm = pmap_kernel();
+
+	mtx_init(&kpm->pm_mtx, IPL_VM);
+
 	for (i = 0; i < PTP_LEVELS - 1; i++) {
 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
 		kpm->pm_ptphint[i] = NULL;
@@ -1124,6 +1125,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
 	int lidx = level - 1;
 	struct vm_page *pg;
 
+	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
+
 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
 		return (pmap->pm_ptphint[lidx]);
@@ -1140,6 +1143,8 @@ pmap_freepage(struct pmap *pmap, struct 
 	int lidx;
 	struct uvm_object *obj;
 
+	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
+
 	lidx = level - 1;
 
 	obj = &pmap->pm_obj[lidx];
@@ -1203,6 +1208,8 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
 	ptp = NULL;
 	pa = (paddr_t)-1;
 
+	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
+
 	/*
 	 * Loop through all page table levels seeing if we need to
 	 * add a new page to that level.
@@ -3037,12 +3044,15 @@ vaddr_t
 pmap_growkernel(vaddr_t maxkvaddr)
 {
 	struct pmap *kpm = pmap_kernel(), *pm;
-	int s, i;
 	unsigned newpdes;
 	long needed_kptp[PTP_LEVELS], target_nptp, old;
+	int i;
 
-	if (maxkvaddr <= pmap_maxkvaddr)
+	mtx_enter(&kpm->pm_mtx);
+	if (maxkvaddr <= pmap_maxkvaddr) {
+		mtx_leave(&kpm->pm_mtx);
 		return pmap_maxkvaddr;
+	}
 
 	maxkvaddr = x86_round_pdr(maxkvaddr);
 	old = nkptp[PTP_LEVELS - 1];
@@ -3062,7 +3072,6 @@ pmap_growkernel(vaddr_t maxkvaddr)
 	}
 
 
-	s = splhigh();	/* to be safe */
 	pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
 
 	/*
@@ -3080,7 +3089,7 @@ pmap_growkernel(vaddr_t maxkvaddr)
 		mtx_leave(&pmaps_lock);
 	}
 	pmap_maxkvaddr = maxkvaddr;
-	splx(s);
+	mtx_leave(&kpm->pm_mtx);
 
 	return maxkvaddr;
 }
Index: arch/amd64/include/pmap.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
diff -u -p -r1.89 pmap.h
--- arch/amd64/include/pmap.h	9 Jul 2024 19:11:06 -0000	1.89
+++ arch/amd64/include/pmap.h	24 Aug 2024 11:37:22 -0000
@@ -287,6 +287,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
  *
  * note that the pm_obj contains the reference count,
  * page list, and number of PTPs within the pmap.
+ *
+ * Locks used to protect struct members in this file:
+ *	P	pmaps_lock
+ *	p	this pmap's `pm_mtx'
+ *
  */
 
 #define PMAP_TYPE_NORMAL	1
@@ -297,7 +302,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
 struct pmap {
 	struct mutex pm_mtx;
 	struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
-	LIST_ENTRY(pmap) pm_list;	/* list (lck by pm_list lock) */
+	LIST_ENTRY(pmap) pm_list;	/* [P] used by non-pmap_kernel() */
 	/*
 	 * pm_pdir         : VA of page table to be used when executing in
 	 *                   privileged mode
@@ -312,11 +317,11 @@ struct pmap {
 	paddr_t pm_pdirpa, pm_pdirpa_intel;
 
 	struct vm_page *pm_ptphint[PTP_LEVELS-1];
-					/* pointer to a PTP in our pmap */
-	struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
+					/* [p] pointer to a PTP in our pmap */
+	struct pmap_statistics pm_stats;/* [p] pmap stats */
 
-	int pm_type;			/* Type of pmap this is (PMAP_TYPE_x) */
-	uint64_t eptp;			/* cached EPTP (used by vmm) */
+	int pm_type;			/* [p] Type of pmap (PMAP_TYPE_x) */
+	uint64_t eptp;			/* [p] cached EPTP (used by vmm) */
 };
 
 #define PMAP_EFI	PMAP_MD0