amd64 pmap & resident_count
On 23/08/24(Fri) 23:14, Philip Guenther wrote:
> On Fri, Aug 23, 2024 at 7:23 AM Martin Pieuchot <mpi@openbsd.org> wrote:
> >
> > On a 24-CPU amd64 machine, building a kernel with -j24 always results in
> > a corrupted "RES" (`pmap_kernel()->pm_stats.resident_count') view in top(1).
> >
> > Modifications to this field are (mostly) serialized by the pmap's mutex.
> > Sadly, pmap_enter() does not grab the mutex for the kernel's pmap.  The
> > diff below changes that and fixes the issue for me.  I believe it is
> > easier and cheaper than using atomics everywhere.
> >
> > ok?
>
> I think I'm okay with this, but I think pmap_growkernel() needs to
> grab the kernel pmap lock around the pmap_alloc_level() call, as that
> calls pmap_get_physpage() which updates the resident_count. Grabbing
> the (kernel) pmap lock should probably replace the "s = splhigh(); /*
> to be safe */" and corresponding splx(s) in pmap_growkernel(), which
> presumably assume the kernel lock currently.  Or maybe the lock
> should be held for the entire function? Hmm
I agree.  Here's a more complete version.  It also properly initializes
the kernel pmap's mutex to IPL_VM, adds some asserts, and documents the
locking.
This has survived building clang with "make -j24" with a lot of forced
swapping.
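
To make the failure mode concrete, here is a minimal user-space sketch
(illustrative only, not OpenBSD kernel code; the thread count, iteration
count, and variable names are invented for the example).  Two threads
bumping a plain counter, the way several CPUs bump
`pm_stats.resident_count' without the pmap mutex, lose increments;
taking a mutex around the increment serializes them, which is what the
diff below does for the kernel pmap.

/*
 * Illustrative only: a plain counter incremented from two threads,
 * standing in for pm_stats.resident_count updated from several CPUs.
 * The unserialized run typically prints less than the expected total
 * because the read-modify-write increments race; the serialized run
 * does not.  Build with: cc -pthread race.c
 */
#include <pthread.h>
#include <stdio.h>

#define NITERS	1000000

static long resident_count;	/* stand-in for pm_stats.resident_count */
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static int use_mutex;

static void *
enter(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < NITERS; i++) {
		if (use_mutex)
			pthread_mutex_lock(&mtx);
		resident_count++;	/* not atomic: load, add, store */
		if (use_mutex)
			pthread_mutex_unlock(&mtx);
	}
	return NULL;
}

int
main(void)
{
	pthread_t t1, t2;

	for (use_mutex = 0; use_mutex <= 1; use_mutex++) {
		resident_count = 0;
		pthread_create(&t1, NULL, enter, NULL);
		pthread_create(&t2, NULL, enter, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		printf("%s: %ld (expected %d)\n",
		    use_mutex ? "serialized" : "unserialized",
		    resident_count, 2 * NITERS);
	}
	return 0;
}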
Index: arch/amd64/amd64/pmap.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
diff -u -p -r1.171 pmap.c
--- arch/amd64/amd64/pmap.c 8 Aug 2024 15:57:22 -0000 1.171
+++ arch/amd64/amd64/pmap.c 24 Aug 2024 11:35:36 -0000
@@ -416,10 +416,6 @@ pmap_map_ptes(struct pmap *pmap)
KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
- /* the kernel's pmap is always accessible */
- if (pmap == pmap_kernel())
- return 0;
-
/*
* Lock the target map before switching to its page tables to
* guarantee other CPUs have finished changing the tables before
@@ -427,6 +423,10 @@ pmap_map_ptes(struct pmap *pmap)
*/
mtx_enter(&pmap->pm_mtx);
+ /* the kernel's pmap is always accessible */
+ if (pmap == pmap_kernel())
+ return 0;
+
cr3 = rcr3();
KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
(cr3 & CR3_PCID) == PCID_PROC);
@@ -443,8 +443,7 @@ pmap_map_ptes(struct pmap *pmap)
void
pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
{
- if (pmap != pmap_kernel())
- mtx_leave(&pmap->pm_mtx);
+ mtx_leave(&pmap->pm_mtx);
if (save_cr3 != 0)
lcr3(save_cr3);
@@ -706,8 +705,10 @@ pmap_bootstrap(paddr_t first_avail, padd
* user pmaps the pm_obj contains the list of active PTPs.
* the pm_obj currently does not have a pager.
*/
-
kpm = pmap_kernel();
+
+ mtx_init(&kpm->pm_mtx, IPL_VM);
+
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
kpm->pm_ptphint[i] = NULL;
@@ -1124,6 +1125,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
int lidx = level - 1;
struct vm_page *pg;
+ MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
+
if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
return (pmap->pm_ptphint[lidx]);
@@ -1140,6 +1143,8 @@ pmap_freepage(struct pmap *pmap, struct
int lidx;
struct uvm_object *obj;
+ MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
+
lidx = level - 1;
obj = &pmap->pm_obj[lidx];
@@ -1203,6 +1208,8 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
ptp = NULL;
pa = (paddr_t)-1;
+ MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
+
/*
* Loop through all page table levels seeing if we need to
* add a new page to that level.
@@ -3037,12 +3044,15 @@ vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
struct pmap *kpm = pmap_kernel(), *pm;
- int s, i;
unsigned newpdes;
long needed_kptp[PTP_LEVELS], target_nptp, old;
+ int i;
- if (maxkvaddr <= pmap_maxkvaddr)
+ mtx_enter(&kpm->pm_mtx);
+ if (maxkvaddr <= pmap_maxkvaddr) {
+ mtx_leave(&kpm->pm_mtx);
return pmap_maxkvaddr;
+ }
maxkvaddr = x86_round_pdr(maxkvaddr);
old = nkptp[PTP_LEVELS - 1];
@@ -3062,7 +3072,6 @@ pmap_growkernel(vaddr_t maxkvaddr)
}
- s = splhigh(); /* to be safe */
pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
/*
@@ -3080,7 +3089,7 @@ pmap_growkernel(vaddr_t maxkvaddr)
mtx_leave(&pmaps_lock);
}
pmap_maxkvaddr = maxkvaddr;
- splx(s);
+ mtx_leave(&kpm->pm_mtx);
return maxkvaddr;
}
Index: arch/amd64/include/pmap.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
diff -u -p -r1.89 pmap.h
--- arch/amd64/include/pmap.h 9 Jul 2024 19:11:06 -0000 1.89
+++ arch/amd64/include/pmap.h 24 Aug 2024 11:37:22 -0000
@@ -287,6 +287,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
*
* note that the pm_obj contains the reference count,
* page list, and number of PTPs within the pmap.
+ *
+ * Locks used to protect struct members in this file:
+ * P pmaps_lock
+ * p this pmap's `pm_mtx'
+ *
*/
#define PMAP_TYPE_NORMAL 1
@@ -297,7 +302,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
struct pmap {
struct mutex pm_mtx;
struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
- LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */
+ LIST_ENTRY(pmap) pm_list; /* [P] used by non-pmap_kernel() */
/*
* pm_pdir : VA of page table to be used when executing in
* privileged mode
@@ -312,11 +317,11 @@ struct pmap {
paddr_t pm_pdirpa, pm_pdirpa_intel;
struct vm_page *pm_ptphint[PTP_LEVELS-1];
- /* pointer to a PTP in our pmap */
- struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */
+ /* [p] pointer to a PTP in our pmap */
+ struct pmap_statistics pm_stats;/* [p] pmap stats */
- int pm_type; /* Type of pmap this is (PMAP_TYPE_x) */
- uint64_t eptp; /* cached EPTP (used by vmm) */
+ int pm_type; /* [p] Type of pmap (PMAP_TYPE_x) */
+ uint64_t eptp; /* [p] cached EPTP (used by vmm) */
};
#define PMAP_EFI PMAP_MD0
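
For readers skimming the patch, here is a condensed view of what
pmap_map_ptes()/pmap_unmap_ptes() look like with the diff applied.  It
is a reading aid reconstructed from the hunks above, not a verbatim
copy: the bodies are abbreviated and the elisions are marked.

paddr_t
pmap_map_ptes(struct pmap *pmap)
{
	paddr_t cr3;

	/*
	 * Lock the target map before touching its page tables, so other
	 * CPUs have finished changing them -- now also for the kernel
	 * pmap, which serializes its resident_count updates.
	 */
	mtx_enter(&pmap->pm_mtx);

	/* The kernel's pmap is always accessible: no CR3 switch. */
	if (pmap == pmap_kernel())
		return 0;

	cr3 = rcr3();
	/*
	 * ... switch to the target pmap's page tables if they are not
	 * the current ones; return the CR3 value to restore (or 0) ...
	 */
	return cr3;
}

void
pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
{
	/* Unconditional: pmap_map_ptes() always took the mutex. */
	mtx_leave(&pmap->pm_mtx);

	if (save_cr3 != 0)
		lcr3(save_cr3);
}

pmap_growkernel() follows the same pattern: it now holds the kernel
pmap's pm_mtx for its whole body, which both serializes the
resident_count updates done under pmap_alloc_level() via
pmap_get_physpage() and replaces the old splhigh()/splx() pair,
addressing the review comment above.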