From: Philip Guenther
Subject: Re: amd64 pmap & resident_count
To: tech@openbsd.org
Date: Sat, 24 Aug 2024 17:19:43 -0700

ok guenther@

On Sat, Aug 24, 2024 at 6:28 AM Martin Pieuchot wrote:
>
> On 23/08/24(Fri) 23:14, Philip Guenther wrote:
> > On Fri, Aug 23, 2024 at 7:23 AM Martin Pieuchot wrote:
> > >
> > > On a 24-CPU amd64 machine, building a kernel with -j24 always results in
> > > a corrupted "RES" `pmap_kernel()->pm_stats.resident_count' view in top(1).
> > >
> > > Modifications to this field are (mostly) serialized by the pmap's mutex.
> > > Sadly, pmap_enter() does not grab the mutex for the kernel's pmap.  The
> > > diff below changes that and fixes the issue for me.  I believe it is
> > > easier and cheaper than using atomics everywhere.
> > >
> > > ok?
> >
> > I think I'm okay with this, but I think pmap_growkernel() needs to
> > grab the kernel pmap lock around the pmap_alloc_level() call, as that
> > calls pmap_get_physpage(), which updates the resident_count.  Grabbing
> > the (kernel) pmap lock should probably replace the "s = splhigh(); /*
> > to be safe */" and corresponding splx(s) in pmap_growkernel(), which
> > presumably assume the kernel lock currently.  Or maybe the lock
> > should be held for the entire function?  Hmm.
>
> I agree.  Here's a more complete version.  It also properly initializes
> the kernel pmap's mutex to IPL_VM, adds some asserts, and documents the
> locking.
>
> This has survived building clang with "make -j24" with a lot of forced
> swapping.
>
> Index: arch/amd64/amd64/pmap.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
> diff -u -p -r1.171 pmap.c
> --- arch/amd64/amd64/pmap.c	8 Aug 2024 15:57:22 -0000	1.171
> +++ arch/amd64/amd64/pmap.c	24 Aug 2024 11:35:36 -0000
> @@ -416,10 +416,6 @@ pmap_map_ptes(struct pmap *pmap)
>
>  	KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
>
> -	/* the kernel's pmap is always accessible */
> -	if (pmap == pmap_kernel())
> -		return 0;
> -
>  	/*
>  	 * Lock the target map before switching to its page tables to
>  	 * guarantee other CPUs have finished changing the tables before
> @@ -427,6 +423,10 @@ pmap_map_ptes(struct pmap *pmap)
>  	 */
>  	mtx_enter(&pmap->pm_mtx);
>
> +	/* the kernel's pmap is always accessible */
> +	if (pmap == pmap_kernel())
> +		return 0;
> +
>  	cr3 = rcr3();
>  	KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
>  	    (cr3 & CR3_PCID) == PCID_PROC);
> @@ -443,8 +443,7 @@
>  void
>  pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
>  {
> -	if (pmap != pmap_kernel())
> -		mtx_leave(&pmap->pm_mtx);
> +	mtx_leave(&pmap->pm_mtx);
>
>  	if (save_cr3 != 0)
>  		lcr3(save_cr3);
> @@ -706,8 +705,10 @@ pmap_bootstrap(paddr_t first_avail, padd
>  	 * user pmaps the pm_obj contains the list of active PTPs.
>  	 * the pm_obj currently does not have a pager.
>  	 */
> -
>  	kpm = pmap_kernel();
> +
> +	mtx_init(&kpm->pm_mtx, IPL_VM);
> +
>  	for (i = 0; i < PTP_LEVELS - 1; i++) {
>  		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
>  		kpm->pm_ptphint[i] = NULL;
> @@ -1124,6 +1125,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
>  	int lidx = level - 1;
>  	struct vm_page *pg;
>
> +	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
>  	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
>  	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
>  		return (pmap->pm_ptphint[lidx]);
> @@ -1140,6 +1143,8 @@ pmap_freepage(struct pmap *pmap, struct
>  	int lidx;
>  	struct uvm_object *obj;
>
> +	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
>  	lidx = level - 1;
>
>  	obj = &pmap->pm_obj[lidx];
> @@ -1203,6 +1208,8 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
>  	ptp = NULL;
>  	pa = (paddr_t)-1;
>
> +	MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
>  	/*
>  	 * Loop through all page table levels seeing if we need to
>  	 * add a new page to that level.
> @@ -3037,12 +3044,15 @@ vaddr_t
>  pmap_growkernel(vaddr_t maxkvaddr)
>  {
>  	struct pmap *kpm = pmap_kernel(), *pm;
> -	int s, i;
>  	unsigned newpdes;
>  	long needed_kptp[PTP_LEVELS], target_nptp, old;
> +	int i;
>
> -	if (maxkvaddr <= pmap_maxkvaddr)
> +	mtx_enter(&kpm->pm_mtx);
> +	if (maxkvaddr <= pmap_maxkvaddr) {
> +		mtx_leave(&kpm->pm_mtx);
>  		return pmap_maxkvaddr;
> +	}
>
>  	maxkvaddr = x86_round_pdr(maxkvaddr);
>  	old = nkptp[PTP_LEVELS - 1];
> @@ -3062,7 +3072,6 @@ pmap_growkernel(vaddr_t maxkvaddr)
>  	}
>
>
> -	s = splhigh();	/* to be safe */
>  	pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
>
>  	/*
> @@ -3080,7 +3089,7 @@ pmap_growkernel(vaddr_t maxkvaddr)
>  		mtx_leave(&pmaps_lock);
>  	}
>  	pmap_maxkvaddr = maxkvaddr;
> -	splx(s);
> +	mtx_leave(&kpm->pm_mtx);
>
>  	return maxkvaddr;
>  }
> Index: arch/amd64/include/pmap.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
> diff -u -p -r1.89 pmap.h
> --- arch/amd64/include/pmap.h	9 Jul 2024 19:11:06 -0000	1.89
> +++ arch/amd64/include/pmap.h	24 Aug 2024 11:37:22 -0000
> @@ -287,6 +287,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
>   *
>   * note that the pm_obj contains the reference count,
>   * page list, and number of PTPs within the pmap.
> + *
> + * Locks used to protect struct members in this file:
> + *	P	pmaps_lock
> + *	p	this pmap's `pm_mtx'
> + *
>   */
>
>  #define PMAP_TYPE_NORMAL	1
> @@ -297,7 +302,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
>  struct pmap {
>  	struct mutex pm_mtx;
>  	struct uvm_object pm_obj[PTP_LEVELS-1];	/* objects for lvl >= 1) */
> -	LIST_ENTRY(pmap) pm_list;	/* list (lck by pm_list lock) */
> +	LIST_ENTRY(pmap) pm_list;	/* [P] used by non-pmap_kernel() */
>  	/*
>  	 * pm_pdir : VA of page table to be used when executing in
>  	 * privileged mode
> @@ -312,11 +317,11 @@ struct pmap {
>  	paddr_t pm_pdirpa, pm_pdirpa_intel;
>
>  	struct vm_page *pm_ptphint[PTP_LEVELS-1];
> -					/* pointer to a PTP in our pmap */
> -	struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
> +					/* [p] pointer to a PTP in our pmap */
> +	struct pmap_statistics pm_stats;/* [p] pmap stats */
>
> -	int pm_type;	/* Type of pmap this is (PMAP_TYPE_x) */
> -	uint64_t eptp;	/* cached EPTP (used by vmm) */
> +	int pm_type;	/* [p] Type of pmap (PMAP_TYPE_x) */
> +	uint64_t eptp;	/* [p] cached EPTP (used by vmm) */
>  };
>
>  #define PMAP_EFI	PMAP_MD0
>
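[Editor's note: the lost update that corrupts resident_count is easy to see
outside the kernel.  The sketch below is illustrative only, not OpenBSD
code: two threads do unserialized increments and decrements on a shared
counter, the same pattern pmap_enter() previously performed on
pm_stats.resident_count for the kernel pmap.  All names here (mapper,
count_mtx, NITER) are invented for the example.]

/*
 * Illustrative userland sketch of the race fixed above: a non-atomic
 * read-modify-write on a shared counter loses updates unless the
 * counter is serialized by a mutex.
 */
#include <pthread.h>
#include <stdio.h>

#define NITER	1000000

static long resident_count;
static pthread_mutex_t count_mtx = PTHREAD_MUTEX_INITIALIZER;
static int use_lock;

static void *
mapper(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < NITER; i++) {
		if (use_lock)
			pthread_mutex_lock(&count_mtx);
		resident_count++;	/* non-atomic read-modify-write */
		resident_count--;
		if (use_lock)
			pthread_mutex_unlock(&count_mtx);
	}
	return NULL;
}

int
main(void)
{
	pthread_t t[2];
	int i;

	for (use_lock = 0; use_lock <= 1; use_lock++) {
		resident_count = 0;
		for (i = 0; i < 2; i++)
			pthread_create(&t[i], NULL, mapper, NULL);
		for (i = 0; i < 2; i++)
			pthread_join(t[i], NULL);
		/* paired ++/-- should leave the count at exactly 0 */
		printf("%s: resident_count = %ld\n",
		    use_lock ? "locked" : "unlocked", resident_count);
	}
	return 0;
}

Compiled with "cc -pthread", the unlocked pass typically prints a nonzero
count on a multicore machine while the locked pass prints 0, which mirrors
why taking pm_mtx for the kernel pmap fixes the corrupted RES column in
top(1).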