From: Martin Pieuchot Subject: Re: amd64 pmap & resident_count To: Philip Guenther Cc: tech@openbsd.org Date: Sat, 24 Aug 2024 15:23:15 +0200 On 23/08/24(Fri) 23:14, Philip Guenther wrote: > On Fri, Aug 23, 2024 at 7:23 AM Martin Pieuchot wrote: > > > > On a 24 CPUs amd64 machine building a kernel with -j24 always result in > > a corrupted "RES" `pmap_kernel()->pm_stats.resident_count' view in top(1). > > > > Modifications to this field are (mostly) serialized by the pmap's mutex. > > Sadly pmap_enter() do not grab the mutex for the kernel's pmap. Diff > > below changes that and fix the issue for me. I believe it is easier and > > cheaper than using atomics everywhere. > > > > ok? > > I think I'm okay with this, but I think pmap_growkernel() needs to > grab the kernel pmap lock around the pmap_alloc_level() call, as that > calls pmap_get_physpage() which updates the resident_count. Grabbing > the (kernel) pmap lock should probably replace the "s = splhigh(); /* > to be safe */" and corresponding splx(s) in pmap_growkernel(), which > are presumably assuming kernel-lock currently. Or maybe the lock > should be held for the entire function? Hmm I agree. Here's a more complete version. It also properly initializes the kernel pmap's mutex to IPL_VM, adds some asserts, and documents the locking. This has survived building clang with "make -j24" with a lot of forced swapping. 
Index: arch/amd64/amd64/pmap.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v diff -u -p -r1.171 pmap.c --- arch/amd64/amd64/pmap.c 8 Aug 2024 15:57:22 -0000 1.171 +++ arch/amd64/amd64/pmap.c 24 Aug 2024 11:35:36 -0000 @@ -416,10 +416,6 @@ pmap_map_ptes(struct pmap *pmap) KASSERT(pmap->pm_type != PMAP_TYPE_EPT); - /* the kernel's pmap is always accessible */ - if (pmap == pmap_kernel()) - return 0; - /* * Lock the target map before switching to its page tables to * guarantee other CPUs have finished changing the tables before @@ -427,6 +423,10 @@ pmap_map_ptes(struct pmap *pmap) */ mtx_enter(&pmap->pm_mtx); + /* the kernel's pmap is always accessible */ + if (pmap == pmap_kernel()) + return 0; + cr3 = rcr3(); KASSERT((cr3 & CR3_PCID) == PCID_KERN || (cr3 & CR3_PCID) == PCID_PROC); @@ -443,8 +443,7 @@ pmap_map_ptes(struct pmap *pmap) void pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3) { - if (pmap != pmap_kernel()) - mtx_leave(&pmap->pm_mtx); + mtx_leave(&pmap->pm_mtx); if (save_cr3 != 0) lcr3(save_cr3); @@ -706,8 +705,10 @@ pmap_bootstrap(paddr_t first_avail, padd * user pmaps the pm_obj contains the list of active PTPs. * the pm_obj currently does not have a pager. 
*/ - kpm = pmap_kernel(); + + mtx_init(&kpm->pm_mtx, IPL_VM); + for (i = 0; i < PTP_LEVELS - 1; i++) { uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1); kpm->pm_ptphint[i] = NULL; @@ -1124,6 +1125,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t int lidx = level - 1; struct vm_page *pg; + MUTEX_ASSERT_LOCKED(&pmap->pm_mtx); + if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) return (pmap->pm_ptphint[lidx]); @@ -1140,6 +1143,8 @@ pmap_freepage(struct pmap *pmap, struct int lidx; struct uvm_object *obj; + MUTEX_ASSERT_LOCKED(&pmap->pm_mtx); + lidx = level - 1; obj = &pmap->pm_obj[lidx]; @@ -1203,6 +1208,8 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t ptp = NULL; pa = (paddr_t)-1; + MUTEX_ASSERT_LOCKED(&pmap->pm_mtx); + /* * Loop through all page table levels seeing if we need to * add a new page to that level. @@ -3037,12 +3044,15 @@ vaddr_t pmap_growkernel(vaddr_t maxkvaddr) { struct pmap *kpm = pmap_kernel(), *pm; - int s, i; unsigned newpdes; long needed_kptp[PTP_LEVELS], target_nptp, old; + int i; - if (maxkvaddr <= pmap_maxkvaddr) + mtx_enter(&kpm->pm_mtx); + if (maxkvaddr <= pmap_maxkvaddr) { + mtx_leave(&kpm->pm_mtx); return pmap_maxkvaddr; + } maxkvaddr = x86_round_pdr(maxkvaddr); old = nkptp[PTP_LEVELS - 1]; @@ -3062,7 +3072,6 @@ pmap_growkernel(vaddr_t maxkvaddr) } - s = splhigh(); /* to be safe */ pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp); /* @@ -3080,7 +3089,7 @@ pmap_growkernel(vaddr_t maxkvaddr) mtx_leave(&pmaps_lock); } pmap_maxkvaddr = maxkvaddr; - splx(s); + mtx_leave(&kpm->pm_mtx); return maxkvaddr; } Index: arch/amd64/include/pmap.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v diff -u -p -r1.89 pmap.h --- arch/amd64/include/pmap.h 9 Jul 2024 19:11:06 -0000 1.89 +++ arch/amd64/include/pmap.h 24 Aug 2024 11:37:22 -0000 @@ -287,6 +287,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm * * note that the pm_obj contains the 
reference count, * page list, and number of PTPs within the pmap. + * + * Locks used to protect struct members in this file: + * P pmaps_lock + * p this pmap's `pm_mtx' + * */ #define PMAP_TYPE_NORMAL 1 @@ -297,7 +302,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pm struct pmap { struct mutex pm_mtx; struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */ - LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ + LIST_ENTRY(pmap) pm_list; /* [P] used by non-pmap_kernel() */ /* * pm_pdir : VA of page table to be used when executing in * privileged mode @@ -312,11 +317,11 @@ struct pmap { paddr_t pm_pdirpa, pm_pdirpa_intel; struct vm_page *pm_ptphint[PTP_LEVELS-1]; - /* pointer to a PTP in our pmap */ - struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ + /* [p] pointer to a PTP in our pmap */ + struct pmap_statistics pm_stats;/* [p] pmap stats */ - int pm_type; /* Type of pmap this is (PMAP_TYPE_x) */ - uint64_t eptp; /* cached EPTP (used by vmm) */ + int pm_type; /* [p] Type of pmap (PMAP_TYPE_x) */ + uint64_t eptp; /* [p] cached EPTP (used by vmm) */ }; #define PMAP_EFI PMAP_MD0