Download raw body.
amd64 pmap & resident_count
ok guenther@
On Sat, Aug 24, 2024 at 6:28 AM Martin Pieuchot <mpi@openbsd.org> wrote:
>
> On 23/08/24(Fri) 23:14, Philip Guenther wrote:
> > On Fri, Aug 23, 2024 at 7:23 AM Martin Pieuchot <mpi@openbsd.org> wrote:
> > >
> > > On a 24-CPU amd64 machine, building a kernel with -j24 always results in
> > > a corrupted "RES" `pmap_kernel()->pm_stats.resident_count' view in top(1).
> > >
> > > Modifications to this field are (mostly) serialized by the pmap's mutex.
> > > Sadly pmap_enter() does not grab the mutex for the kernel's pmap. The diff
> > > below changes that and fixes the issue for me. I believe it is easier and
> > > cheaper than using atomics everywhere.
> > >
> > > ok?
> >
> > I think I'm okay with this, but I think pmap_growkernel() needs to
> > grab the kernel pmap lock around the pmap_alloc_level() call, as that
> > calls pmap_get_physpage() which updates the resident_count. Grabbing
> > the (kernel) pmap lock should probably replace the "s = splhigh(); /*
> > to be safe */" and corresponding splx(s) in pmap_growkernel(), which
> > are presumably assuming kernel-lock currently. Or maybe the lock
> > should be held for the entire function? Hmm
>
> I agree. Here's a more complete version. It also properly initializes
> the kernel pmap's mutex at IPL_VM, adds some asserts, and documents the
> locking.
>
> This has survived building clang with "make -j24" with a lot of forced
> swapping.
>
> Index: arch/amd64/amd64/pmap.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
> diff -u -p -r1.171 pmap.c
> --- arch/amd64/amd64/pmap.c 8 Aug 2024 15:57:22 -0000 1.171
> +++ arch/amd64/amd64/pmap.c 24 Aug 2024 11:35:36 -0000
> @@ -416,10 +416,6 @@ pmap_map_ptes(struct pmap *pmap)
>
> KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
>
> - /* the kernel's pmap is always accessible */
> - if (pmap == pmap_kernel())
> - return 0;
> -
> /*
> * Lock the target map before switching to its page tables to
> * guarantee other CPUs have finished changing the tables before
> @@ -427,6 +423,10 @@ pmap_map_ptes(struct pmap *pmap)
> */
> mtx_enter(&pmap->pm_mtx);
>
> + /* the kernel's pmap is always accessible */
> + if (pmap == pmap_kernel())
> + return 0;
> +
> cr3 = rcr3();
> KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
> (cr3 & CR3_PCID) == PCID_PROC);
> @@ -443,8 +443,7 @@ pmap_map_ptes(struct pmap *pmap)
> void
> pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
> {
> - if (pmap != pmap_kernel())
> - mtx_leave(&pmap->pm_mtx);
> + mtx_leave(&pmap->pm_mtx);
>
> if (save_cr3 != 0)
> lcr3(save_cr3);
> @@ -706,8 +705,10 @@ pmap_bootstrap(paddr_t first_avail, padd
> * user pmaps the pm_obj contains the list of active PTPs.
> * the pm_obj currently does not have a pager.
> */
> -
> kpm = pmap_kernel();
> +
> + mtx_init(&kpm->pm_mtx, IPL_VM);
> +
> for (i = 0; i < PTP_LEVELS - 1; i++) {
> uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
> kpm->pm_ptphint[i] = NULL;
> @@ -1124,6 +1125,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
> int lidx = level - 1;
> struct vm_page *pg;
>
> + MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
> if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
> pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
> return (pmap->pm_ptphint[lidx]);
> @@ -1140,6 +1143,8 @@ pmap_freepage(struct pmap *pmap, struct
> int lidx;
> struct uvm_object *obj;
>
> + MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
> lidx = level - 1;
>
> obj = &pmap->pm_obj[lidx];
> @@ -1203,6 +1208,8 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
> ptp = NULL;
> pa = (paddr_t)-1;
>
> + MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
> /*
> * Loop through all page table levels seeing if we need to
> * add a new page to that level.
> @@ -3037,12 +3044,15 @@ vaddr_t
> pmap_growkernel(vaddr_t maxkvaddr)
> {
> struct pmap *kpm = pmap_kernel(), *pm;
> - int s, i;
> unsigned newpdes;
> long needed_kptp[PTP_LEVELS], target_nptp, old;
> + int i;
>
> - if (maxkvaddr <= pmap_maxkvaddr)
> + mtx_enter(&kpm->pm_mtx);
> + if (maxkvaddr <= pmap_maxkvaddr) {
> + mtx_leave(&kpm->pm_mtx);
> return pmap_maxkvaddr;
> + }
>
> maxkvaddr = x86_round_pdr(maxkvaddr);
> old = nkptp[PTP_LEVELS - 1];
> @@ -3062,7 +3072,6 @@ pmap_growkernel(vaddr_t maxkvaddr)
> }
>
>
> - s = splhigh(); /* to be safe */
> pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
>
> /*
> @@ -3080,7 +3089,7 @@ pmap_growkernel(vaddr_t maxkvaddr)
> mtx_leave(&pmaps_lock);
> }
> pmap_maxkvaddr = maxkvaddr;
> - splx(s);
> + mtx_leave(&kpm->pm_mtx);
>
> return maxkvaddr;
> }
> Index: arch/amd64/include/pmap.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
> diff -u -p -r1.89 pmap.h
> --- arch/amd64/include/pmap.h 9 Jul 2024 19:11:06 -0000 1.89
> +++ arch/amd64/include/pmap.h 24 Aug 2024 11:37:22 -0000
> @@ -287,6 +287,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
> *
> * note that the pm_obj contains the reference count,
> * page list, and number of PTPs within the pmap.
> + *
> + * Locks used to protect struct members in this file:
> + * P pmaps_lock
> + * p this pmap's `pm_mtx'
> + *
> */
>
> #define PMAP_TYPE_NORMAL 1
> @@ -297,7 +302,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
> struct pmap {
> struct mutex pm_mtx;
> struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
> - LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */
> + LIST_ENTRY(pmap) pm_list; /* [P] used by non-pmap_kernel() */
> /*
> * pm_pdir : VA of page table to be used when executing in
> * privileged mode
> @@ -312,11 +317,11 @@ struct pmap {
> paddr_t pm_pdirpa, pm_pdirpa_intel;
>
> struct vm_page *pm_ptphint[PTP_LEVELS-1];
> - /* pointer to a PTP in our pmap */
> - struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */
> + /* [p] pointer to a PTP in our pmap */
> + struct pmap_statistics pm_stats;/* [p] pmap stats */
>
> - int pm_type; /* Type of pmap this is (PMAP_TYPE_x) */
> - uint64_t eptp; /* cached EPTP (used by vmm) */
> + int pm_type; /* [p] Type of pmap (PMAP_TYPE_x) */
> + uint64_t eptp; /* [p] cached EPTP (used by vmm) */
> };
>
> #define PMAP_EFI PMAP_MD0
>
amd64 pmap & resident_count