Push UVM's pageqlock inside uvm_pagefree()
> Date: Thu, 20 Feb 2025 16:21:03 +0100
> From: Martin Pieuchot <mpi@grenadille.net>
>
> Hey Mark,
>
> Thanks for the review.
>
> On 19/02/25(Wed) 15:08, Mark Kettenis wrote:
> > > Date: Wed, 19 Feb 2025 13:11:09 +0100
> > > From: Martin Pieuchot <mpi@grenadille.net>
> > >
> > > Here's the first step to reduce the contention on the global pageqlock
> > > mutex. The lock is necessary to guarantee the integrity of the global
> > > LRUs. So it is only required around uvm_pagedequeue().
> > >
> > > This diff already makes a noticeable difference. ok?
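
The core of the diff, for context: uvm_pageclean() now takes the lock
itself, only around the LRU removal.  A minimal sketch of the uvm_page.c
hunk quoted further down:

	if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) {
		uvm_lock_pageq();
		uvm_pagedequeue(pg);	/* LRU integrity needs the lock */
		uvm_unlock_pageq();
	}

With that, callers of uvm_pagefree() and uvm_pageclean() no longer take
the pageqlock themselves.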
> >
> > Possibly not ok.
> >
> > Is it possible that uvm_pageclean() gets called simultaneously by two
> > different CPUs for the same page? I'm not 100% sure that isn't
> > possible.
>
> If the page is associated with an `anon' or `uobject' we already assert
> that the so-called owner lock is held. So it is not possible.
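
The assertions in question are roughly of this shape (a sketch, not an
exact quote of the tree):

	KASSERT(pg->uobject == NULL ||
	    rw_write_held(pg->uobject->vmobjlock));
	KASSERT(pg->uanon == NULL ||
	    rw_write_held(pg->uanon->an_lock));

so two CPUs cleaning the same managed page would trip an owner-lock
assertion first.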
>
> > But if you can convince me that that isn't possible, there
> > are still a few issues:
> >
> > 1. Probably should remove the
> >
> > => caller must lock page queues if `pg' is managed
> >
> > bit from the comment.
>
> Indeed.
>
> > 2. I think this means that uvmexp.wired-- now happens without holding
> > the uvmexp.pageqlock. So that counter may become inaccurate and that in turn
> > may cause problems in the pagedaemon. Maybe we need to use atomic
> > ops for it?
>
> I agree, we do.
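
That is, the plain counter updates become atomic ops, as in the
uvm_page.c hunk below:

	pg->wire_count = 0;
	atomic_dec_int(&uvmexp.wired);	/* was: uvmexp.wired-- */

with the matching atomic_inc_int() in uvm_pagewire().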
>
> > 3. I think you're removing an optimization in uvn_flush(). That
> > function would accumulate cleaned pages and return them to
> > pmemrange in one go. That potentially avoids lots of unlock/relock
> > cases.
>
> This is now done via per-CPU magazines. I committed a similar change to
> uvm_anfree(). In my measurements it decreases contention on the fpageq
> lock, because we no longer replenish a magazine and give pages back to
> the pmemrange allocator on every uvn_flush() call.
Ah, yes, that probably makes the optimization mostly pointless.
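
For the archives, a rough illustration of the magazine idea -- not
OpenBSD's actual per-CPU cache code, just the shape of it: frees fill a
small per-CPU batch, and the global fpageq lock is only taken when the
batch overflows.

	#define MAGAZINE_SIZE	16

	struct magazine {
		struct vm_page	*pages[MAGAZINE_SIZE];
		int		 count;
	};

	void
	magazine_free(struct magazine *mag, struct vm_page *pg)
	{
		if (mag->count < MAGAZINE_SIZE) {
			/* common case: no global lock taken */
			mag->pages[mag->count++] = pg;
			return;
		}
		/* full: return the whole batch under the global lock */
		uvm_lock_fpageq();
		/* give mag->pages[0..count-1] back to pmemrange here */
		uvm_unlock_fpageq();
		mag->count = 0;
		mag->pages[mag->count++] = pg;
	}

So most frees coming out of uvn_flush() never touch the fpageq lock.
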
> > 4. The current code has some cases that do:
> >
> > uvm_lock_pageq();
> > pmap_page_protect(pg, PROT_NONE);
> > uvm_pagefree(pg);
> > uvm_unlock_pageq();
> >
> > I'm wondering if that is trying to stop the case where we remove
> > the page from the pmap, but another CPU tries to access the page
> > and fault it back in. So what prevents that from happening once
> > you no longer grab the lock?
>
> I like your question.
>
> Note that currently the pageq lock doesn't prevent another CPU from
> faulting between the PROT_NONE and uvm_pagefree(). When a fault occurs,
> the VA is translated into an `anon' which might or might not be
> associated with a physical page.
> In the fault handler, accessing `anon->an_page' is done while holding
> the corresponding anon lock, which is the lock of the amap it belongs to.
>
> In other words, what prevents `pg' from being faulted back in is the
> anon lock, which is held during uvm_pageclean().
>
> The same logic applies to managed pages linked to a uobj, except that
> the lock is the `vmobjlock'.
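
A sketch of the two sides of the race for the anon case -- simplified,
names as in the existing code; the point is the common lock, not the
exact call sites:

	/* free side: the anon lock is held around the free */
	rw_enter(anon->an_lock, RW_WRITE);
	pmap_page_protect(pg, PROT_NONE);
	uvm_pagefree(pg);		/* clears anon->an_page */
	rw_exit(anon->an_lock);

	/* fault side: same lock before looking at an_page */
	rw_enter(anon->an_lock, RW_WRITE);
	pg = anon->an_page;		/* NULL once the free went through */
	rw_exit(anon->an_lock);

Either the fault sees the page before the free starts, or it sees
anon->an_page == NULL afterwards; it cannot fault the page back in
while uvm_pageclean() is running.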
>
> Here's an updated diff with the comment removed and using atomic
> operations for uvmexp.wired.
>
> ok?
ok kettenis@
>
> Index: uvm/uvm_anon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> diff -u -p -r1.61 uvm_anon.c
> --- uvm/uvm_anon.c 27 Dec 2024 12:04:40 -0000 1.61
> +++ uvm/uvm_anon.c 19 Feb 2025 11:12:50 -0000
> @@ -106,14 +106,10 @@ uvm_anfree_list(struct vm_anon *anon, st
> * clean page, and put it on pglist
> * for later freeing.
> */
> - uvm_lock_pageq();
> uvm_pageclean(pg);
> - uvm_unlock_pageq();
> TAILQ_INSERT_HEAD(pgl, pg, pageq);
> } else {
> - uvm_lock_pageq(); /* lock out pagedaemon */
> uvm_pagefree(pg); /* bye bye */
> - uvm_unlock_pageq(); /* free the daemon */
> }
> } else {
> if (anon->an_swslot != 0 && anon->an_swslot != SWSLOT_BAD) {
> @@ -249,10 +245,8 @@ uvm_anon_release(struct vm_anon *anon)
> KASSERT(pg->uanon == anon);
> KASSERT(anon->an_ref == 0);
>
> - uvm_lock_pageq();
> pmap_page_protect(pg, PROT_NONE);
> uvm_pagefree(pg);
> - uvm_unlock_pageq();
> KASSERT(anon->an_page == NULL);
> lock = anon->an_lock;
> uvm_anon_dropswap(anon);
> Index: uvm/uvm_aobj.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
> diff -u -p -r1.115 uvm_aobj.c
> --- uvm/uvm_aobj.c 27 Dec 2024 12:04:40 -0000 1.115
> +++ uvm/uvm_aobj.c 19 Feb 2025 11:12:50 -0000
> @@ -839,9 +839,7 @@ uao_detach(struct uvm_object *uobj)
> continue;
> }
> uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
> - uvm_lock_pageq();
> uvm_pagefree(pg);
> - uvm_unlock_pageq();
> }
>
> /*
> @@ -957,10 +955,7 @@ uao_flush(struct uvm_object *uobj, voff_
> * because we need to update swap accounting anyway.
> */
> uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
> - uvm_lock_pageq();
> uvm_pagefree(pg);
> - uvm_unlock_pageq();
> -
> continue;
> default:
> panic("uao_flush: weird flags");
> @@ -1179,9 +1174,7 @@ uao_get(struct uvm_object *uobj, voff_t
> atomic_clearbits_int(&ptmp->pg_flags,
> PG_WANTED|PG_BUSY);
> UVM_PAGE_OWN(ptmp, NULL);
> - uvm_lock_pageq();
> uvm_pagefree(ptmp);
> - uvm_unlock_pageq();
> rw_exit(uobj->vmobjlock);
>
> return rv;
> Index: uvm/uvm_fault.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> diff -u -p -r1.163 uvm_fault.c
> --- uvm/uvm_fault.c 29 Jan 2025 15:22:33 -0000 1.163
> +++ uvm/uvm_fault.c 19 Feb 2025 11:12:50 -0000
> @@ -417,9 +417,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> * cannot be mapped and thus no need to
> * pmap_page_protect() it.
> */
> - uvm_lock_pageq();
> uvm_pagefree(pg);
> - uvm_unlock_pageq();
>
> if (locked) {
> uvmfault_unlockall(ufi, NULL, NULL);
> Index: uvm/uvm_km.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_km.c,v
> diff -u -p -r1.155 uvm_km.c
> --- uvm/uvm_km.c 1 Nov 2024 20:26:18 -0000 1.155
> +++ uvm/uvm_km.c 19 Feb 2025 11:12:50 -0000
> @@ -270,9 +270,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
> slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT);
>
> if (pp != NULL) {
> - uvm_lock_pageq();
> uvm_pagefree(pp);
> - uvm_unlock_pageq();
> } else if (slot != 0) {
> swpgonlydelta++;
> }
> Index: uvm/uvm_object.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_object.c,v
> diff -u -p -r1.26 uvm_object.c
> --- uvm/uvm_object.c 19 Feb 2025 11:10:54 -0000 1.26
> +++ uvm/uvm_object.c 19 Feb 2025 11:13:44 -0000
> @@ -238,9 +238,7 @@ uvm_obj_free(struct uvm_object *uobj)
> */
> atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
> pg->uobject = NULL;
> - uvm_lock_pageq();
> uvm_pageclean(pg);
> - uvm_unlock_pageq();
> TAILQ_INSERT_TAIL(&pgl, pg, pageq);
> }
> uvm_pglistfree(&pgl);
> Index: uvm/uvm_page.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_page.c,v
> diff -u -p -r1.181 uvm_page.c
> --- uvm/uvm_page.c 19 Feb 2025 11:10:54 -0000 1.181
> +++ uvm/uvm_page.c 20 Feb 2025 15:17:34 -0000
> @@ -947,7 +947,6 @@ uvm_pagerealloc(struct vm_page *pg, stru
> * uvm_pageclean: clean page
> *
> * => erase page's identity (i.e. remove from object)
> - * => caller must lock page queues if `pg' is managed
> * => assumes all valid mappings of pg are gone
> */
> void
> @@ -955,10 +954,6 @@ uvm_pageclean(struct vm_page *pg)
> {
> u_int flags_to_clear = 0;
>
> - if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
> - (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
> - MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
> -
> #ifdef DEBUG
> if (pg->uobject == (void *)0xdeadbeef &&
> pg->uanon == (void *)0xdeadbeef) {
> @@ -982,14 +977,18 @@ uvm_pageclean(struct vm_page *pg)
> /*
> * now remove the page from the queues
> */
> - uvm_pagedequeue(pg);
> + if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) {
> + uvm_lock_pageq();
> + uvm_pagedequeue(pg);
> + uvm_unlock_pageq();
> + }
>
> /*
> * if the page was wired, unwire it now.
> */
> if (pg->wire_count) {
> pg->wire_count = 0;
> - uvmexp.wired--;
> + atomic_dec_int(&uvmexp.wired);
> }
> if (pg->uanon) {
> pg->uanon->an_page = NULL;
> @@ -1235,7 +1234,7 @@ uvm_pagewire(struct vm_page *pg)
>
> if (pg->wire_count == 0) {
> uvm_pagedequeue(pg);
> - uvmexp.wired++;
> + atomic_inc_int(&uvmexp.wired);
> }
> pg->wire_count++;
> }
> @@ -1255,7 +1254,7 @@ uvm_pageunwire(struct vm_page *pg)
> pg->wire_count--;
> if (pg->wire_count == 0) {
> uvm_pageactivate(pg);
> - uvmexp.wired--;
> + atomic_dec_int(&uvmexp.wired);
> }
> }
>
> Index: uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> diff -u -p -r1.134 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c 25 Jan 2025 08:55:52 -0000 1.134
> +++ uvm/uvm_pdaemon.c 19 Feb 2025 11:13:00 -0000
> @@ -596,6 +596,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>
> /* zap all mappings with pmap_page_protect... */
> pmap_page_protect(p, PROT_NONE);
> + /* dequeue first to prevent lock recursion */
> + uvm_pagedequeue(p);
> uvm_pagefree(p);
> freed++;
>
> @@ -853,6 +855,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> anon = NULL;
> uvm_lock_pageq();
> nextpg = TAILQ_NEXT(p, pageq);
> + /* dequeue first to prevent lock recursion */
> + uvm_pagedequeue(p);
> /* free released page */
> uvm_pagefree(p);
> } else { /* page was not released during I/O */
> @@ -1055,7 +1059,6 @@ uvmpd_drop(struct pglist *pglst)
> struct uvm_object * uobj = p->uobject;
>
> rw_enter(uobj->vmobjlock, RW_WRITE);
> - uvm_lock_pageq();
> /*
> * we now have the page queues locked.
> * the page is not busy. if the page is clean we
> @@ -1071,7 +1074,6 @@ uvmpd_drop(struct pglist *pglst)
> pmap_page_protect(p, PROT_NONE);
> uvm_pagefree(p);
> }
> - uvm_unlock_pageq();
> rw_exit(uobj->vmobjlock);
> }
> }
> Index: uvm/uvm_swap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
> diff -u -p -r1.173 uvm_swap.c
> --- uvm/uvm_swap.c 7 Nov 2024 09:04:55 -0000 1.173
> +++ uvm/uvm_swap.c 19 Feb 2025 11:13:00 -0000
> @@ -395,10 +395,8 @@ uvm_swap_freepages(struct vm_page **pps,
> return;
> }
>
> - uvm_lock_pageq();
> for (i = 0; i < npages; i++)
> uvm_pagefree(pps[i]);
> - uvm_unlock_pageq();
>
> }
>
> Index: uvm/uvm_vnode.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
> diff -u -p -r1.138 uvm_vnode.c
> --- uvm/uvm_vnode.c 27 Dec 2024 12:04:40 -0000 1.138
> +++ uvm/uvm_vnode.c 19 Feb 2025 11:13:00 -0000
> @@ -602,13 +602,11 @@ uvn_flush(struct uvm_object *uobj, voff_
> struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
> struct vm_page *pp, *ptmp;
> struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
> - struct pglist dead;
> int npages, result, lcv;
> boolean_t retval, need_iosync, needs_clean;
> voff_t curoff;
>
> KASSERT(rw_write_held(uobj->vmobjlock));
> - TAILQ_INIT(&dead);
>
> /* get init vals and determine how we are going to traverse object */
> need_iosync = FALSE;
> @@ -696,9 +694,9 @@ uvn_flush(struct uvm_object *uobj, voff_
> continue;
> } else {
> pmap_page_protect(pp, PROT_NONE);
> - /* removed page from object */
> - uvm_pageclean(pp);
> - TAILQ_INSERT_HEAD(&dead, pp, pageq);
> + /* dequeue to prevent lock recursion */
> + uvm_pagedequeue(pp);
> + uvm_pagefree(pp);
> }
> }
> continue;
> @@ -830,8 +828,9 @@ ReTry:
> retval = FALSE;
> }
> pmap_page_protect(ptmp, PROT_NONE);
> - uvm_pageclean(ptmp);
> - TAILQ_INSERT_TAIL(&dead, ptmp, pageq);
> + /* dequeue first to prevent lock recursion */
> + uvm_pagedequeue(ptmp);
> + uvm_pagefree(ptmp);
> }
>
> } /* end of "lcv" for loop */
> @@ -852,8 +851,6 @@ ReTry:
> wakeup(&uvn->u_flags);
> uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED);
> }
> -
> - uvm_pglistfree(&dead);
>
> return retval;
> }
> Index: uvm/uvmexp.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> diff -u -p -r1.16 uvmexp.h
> --- uvm/uvmexp.h 25 Nov 2024 13:06:25 -0000 1.16
> +++ uvm/uvmexp.h 20 Feb 2025 15:18:17 -0000
> @@ -61,7 +61,7 @@ struct uvmexp {
> int active; /* [L] # of active pages */
> int inactive; /* [L] # of pages that we free'd but may want back */
> int paging; /* [a] # of pages in the process of being paged out */
> - int wired; /* number of wired pages */
> + int wired; /* [a] number of wired pages */
>
> int zeropages; /* [F] number of zero'd pages */
> int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */
>
>
>