From: Philip Guenther <guenther@gmail.com>
Subject: Re: amd64 pmap & resident_count
To: tech@openbsd.org
Date: Sat, 24 Aug 2024 17:19:43 -0700

ok guenther@

On Sat, Aug 24, 2024 at 6:28 AM Martin Pieuchot <mpi@openbsd.org> wrote:
>
> On 23/08/24(Fri) 23:14, Philip Guenther wrote:
> > On Fri, Aug 23, 2024 at 7:23 AM Martin Pieuchot <mpi@openbsd.org> wrote:
> > >
> > > On a 24-CPU amd64 machine, building a kernel with -j24 always results in
> > > a corrupted "RES" `pmap_kernel()->pm_stats.resident_count' view in top(1).
> > >
> > > Modifications to this field are (mostly) serialized by the pmap's mutex.
> > > Sadly, pmap_enter() does not grab the mutex for the kernel's pmap.  The
> > > diff below changes that and fixes the issue for me.  I believe it is
> > > easier and cheaper than using atomics everywhere.
> > >
> > > ok?
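
(For context, the lost update here is the classic unlocked
read-modify-write; a minimal sketch, not the actual pmap_enter() code:

	/* two CPUs concurrently, pmap == pmap_kernel(), pm_mtx not held */
	pmap->pm_stats.resident_count++;	/* load, add, store */

if both CPUs load the same old value, one increment is lost and the
"RES" column in top(1) drifts from the real count.)
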
> >
> > I think I'm okay with this, but pmap_growkernel() needs to
> > grab the kernel pmap lock around the pmap_alloc_level() call, as that
> > calls pmap_get_physpage(), which updates the resident_count.  Grabbing
> > the (kernel) pmap lock should probably replace the "s = splhigh(); /*
> > to be safe */" and corresponding splx(s) in pmap_growkernel(), which
> > presumably rely on the kernel lock currently.  Or maybe the lock
> > should be held for the entire function?  Hmm.
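
(The call chain in question, sketched from the function names above;
indentation shows who calls whom, not verbatim source:

	pmap_growkernel()
	 -> pmap_alloc_level()
	     -> pmap_get_physpage()
	         -> kpm->pm_stats.resident_count++;	/* wants pm_mtx held */
)
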
>
> I agree.  Here's a more complete version.  It also properly initializes
> the kernel pmap's mutex to IPL_VM, adds some asserts, and documents the
> locking.
>
> This has survived building clang with "make -j24" with a lot of forced
> swapping.
>
> Index: arch/amd64/amd64/pmap.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/pmap.c,v
> diff -u -p -r1.171 pmap.c
> --- arch/amd64/amd64/pmap.c     8 Aug 2024 15:57:22 -0000       1.171
> +++ arch/amd64/amd64/pmap.c     24 Aug 2024 11:35:36 -0000
> @@ -416,10 +416,6 @@ pmap_map_ptes(struct pmap *pmap)
>
>         KASSERT(pmap->pm_type != PMAP_TYPE_EPT);
>
> -       /* the kernel's pmap is always accessible */
> -       if (pmap == pmap_kernel())
> -               return 0;
> -
>         /*
>          * Lock the target map before switching to its page tables to
>          * guarantee other CPUs have finished changing the tables before
> @@ -427,6 +423,10 @@ pmap_map_ptes(struct pmap *pmap)
>          */
>         mtx_enter(&pmap->pm_mtx);
>
> +       /* the kernel's pmap is always accessible */
> +       if (pmap == pmap_kernel())
> +               return 0;
> +
>         cr3 = rcr3();
>         KASSERT((cr3 & CR3_PCID) == PCID_KERN ||
>                 (cr3 & CR3_PCID) == PCID_PROC);
> @@ -443,8 +443,7 @@ pmap_map_ptes(struct pmap *pmap)
>  void
>  pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
>  {
> -       if (pmap != pmap_kernel())
> -               mtx_leave(&pmap->pm_mtx);
> +       mtx_leave(&pmap->pm_mtx);
>
>         if (save_cr3 != 0)
>                 lcr3(save_cr3);
> @@ -706,8 +705,10 @@ pmap_bootstrap(paddr_t first_avail, padd
>          * user pmaps the pm_obj contains the list of active PTPs.
>          * the pm_obj currently does not have a pager.
>          */
> -
>         kpm = pmap_kernel();
> +
> +       mtx_init(&kpm->pm_mtx, IPL_VM);
> +
>         for (i = 0; i < PTP_LEVELS - 1; i++) {
>                 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1);
>                 kpm->pm_ptphint[i] = NULL;
> @@ -1124,6 +1125,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
>         int lidx = level - 1;
>         struct vm_page *pg;
>
> +       MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
>         if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
>             pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx]))
>                 return (pmap->pm_ptphint[lidx]);
> @@ -1140,6 +1143,8 @@ pmap_freepage(struct pmap *pmap, struct
>         int lidx;
>         struct uvm_object *obj;
>
> +       MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
>         lidx = level - 1;
>
>         obj = &pmap->pm_obj[lidx];
> @@ -1203,6 +1208,8 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t
>         ptp = NULL;
>         pa = (paddr_t)-1;
>
> +       MUTEX_ASSERT_LOCKED(&pmap->pm_mtx);
> +
>         /*
>          * Loop through all page table levels seeing if we need to
>          * add a new page to that level.
> @@ -3037,12 +3044,15 @@ vaddr_t
>  pmap_growkernel(vaddr_t maxkvaddr)
>  {
>         struct pmap *kpm = pmap_kernel(), *pm;
> -       int s, i;
>         unsigned newpdes;
>         long needed_kptp[PTP_LEVELS], target_nptp, old;
> +       int i;
>
> -       if (maxkvaddr <= pmap_maxkvaddr)
> +       mtx_enter(&kpm->pm_mtx);
> +       if (maxkvaddr <= pmap_maxkvaddr) {
> +               mtx_leave(&kpm->pm_mtx);
>                 return pmap_maxkvaddr;
> +       }
>
>         maxkvaddr = x86_round_pdr(maxkvaddr);
>         old = nkptp[PTP_LEVELS - 1];
> @@ -3062,7 +3072,6 @@ pmap_growkernel(vaddr_t maxkvaddr)
>         }
>
>
> -       s = splhigh();  /* to be safe */
>         pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
>
>         /*
> @@ -3080,7 +3089,7 @@ pmap_growkernel(vaddr_t maxkvaddr)
>                 mtx_leave(&pmaps_lock);
>         }
>         pmap_maxkvaddr = maxkvaddr;
> -       splx(s);
> +       mtx_leave(&kpm->pm_mtx);
>
>         return maxkvaddr;
>  }
> Index: arch/amd64/include/pmap.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/pmap.h,v
> diff -u -p -r1.89 pmap.h
> --- arch/amd64/include/pmap.h   9 Jul 2024 19:11:06 -0000       1.89
> +++ arch/amd64/include/pmap.h   24 Aug 2024 11:37:22 -0000
> @@ -287,6 +287,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
>   *
>   * note that the pm_obj contains the reference count,
>   * page list, and number of PTPs within the pmap.
> + *
> + * Locks used to protect struct members in this file:
> + *     P       pmaps_lock
> + *     p       this pmap's `pm_mtx'
> + *
>   */
>
>  #define PMAP_TYPE_NORMAL       1
> @@ -297,7 +302,7 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
>  struct pmap {
>         struct mutex pm_mtx;
>         struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
> -       LIST_ENTRY(pmap) pm_list;       /* list (lck by pm_list lock) */
> +       LIST_ENTRY(pmap) pm_list;       /* [P] used by non-pmap_kernel() */
>         /*
>          * pm_pdir         : VA of page table to be used when executing in
>          *                   privileged mode
> @@ -312,11 +317,11 @@ struct pmap {
>         paddr_t pm_pdirpa, pm_pdirpa_intel;
>
>         struct vm_page *pm_ptphint[PTP_LEVELS-1];
> -                                       /* pointer to a PTP in our pmap */
> -       struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
> +                                       /* [p] pointer to a PTP in our pmap */
> +       struct pmap_statistics pm_stats;/* [p] pmap stats */
>
> -       int pm_type;                    /* Type of pmap this is (PMAP_TYPE_x) */
> -       uint64_t eptp;                  /* cached EPTP (used by vmm) */
> +       int pm_type;                    /* [p] Type of pmap (PMAP_TYPE_x) */
> +       uint64_t eptp;                  /* [p] cached EPTP (used by vmm) */
>  };
>
>  #define PMAP_EFI       PMAP_MD0
>
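
(A usage sketch of the new scheme, with a hypothetical caller; not taken
from the tree:

	paddr_t scr3;

	scr3 = pmap_map_ptes(pmap);	/* now always enters pm_mtx,
					   even for pmap_kernel() */
	/* ... update PTEs and pm_stats.resident_count ... */
	pmap_unmap_ptes(pmap, scr3);	/* now always leaves pm_mtx */

so kernel-pmap resident_count updates are serialized as well.)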