From: Martin Pieuchot
Subject: Re: Push UVM's pageqlock inside uvm_pagefree()
To: Peter Hessler
Cc: tech@openbsd.org
Date: Wed, 26 Feb 2025 15:11:01 +0100

On 25/02/25(Tue) 12:38, Peter Hessler wrote:
> On 2025 Feb 21 (Fri) at 10:55:42 +0100 (+0100), Martin Pieuchot wrote:
> :On 20/02/25(Thu) 16:21, Martin Pieuchot wrote:
> :> Hey Mark,
> :>
> :> Thanks for the review.
> :>
> :> On 19/02/25(Wed) 15:08, Mark Kettenis wrote:
> :> > > Date: Wed, 19 Feb 2025 13:11:09 +0100
> :> > > From: Martin Pieuchot
> :> > >
> :> > > Here's the first step to reduce the contention on the global pageqlock
> :> > > mutex.  The lock is necessary to guarantee the integrity of the global
> :> > > LRUs.  So it is only required around uvm_pagedequeue().
> :> > >
> :> > > This diff already makes a noticeable difference.  ok?
> :> >
> :> > Possibly not ok.
> :> >
> :> > Is it possible that uvm_pageclean() gets called simultaneously by two
> :> > different CPUs for the same page?  I'm not 100% sure that isn't
> :> > possible.
> :>
> :> If the page is associated with an `anon' or `uobject' we already assert
> :> that the so-called owner lock is held.  So it is not possible.
> :>
> :> > But if you can convince me that that isn't possible, there
> :> > are still a few issues:
> :> >
> :> > 1. Probably should remove the
> :> >
> :> >      => caller must lock page queues if `pg' is managed
> :> >
> :> >    bit from the comment.
> :>
> :> Indeed.
> :>
> :> > 2. I think this means that uvmexp.wired-- now happens without holding
> :> >    the uvmexp.pageqlock.  So that counter may go off and that in turn
> :> >    may cause problems in the pagedaemon.  Maybe we need to use atomic
> :> >    ops for it?
> :>
> :> I agree, we do.
> :>
> :> > 3. I think you're removing an optimization in uvn_flush().  That
> :> >    function would accumulate cleaned pages and return them to
> :> >    pmemrange in one go.  That potentially avoids lots of unlock/relock
> :> >    cases.
> :>
> :> This is now done via per-CPU magazines.  I committed a similar change
> :> to uvm_anfree().  In my measurements it decreases contention on the
> :> fpageq lock because we no longer replenish a magazine, and give pages
> :> back to the pmemrange allocator, every time uvn_flush() is called.
> :>
> :> > 4. The current code has some cases that do:
> :> >
> :> >      uvm_lock_pageq();
> :> >      pmap_page_protect(pg, PROT_NONE);
> :> >      uvm_pagefree(pg);
> :> >      uvm_unlock_pageq();
> :> >
> :> >    I'm wondering if that is trying to stop the case where we remove
> :> >    the page from the pmap, but another CPU tries to access the page
> :> >    and fault it back in.  So what prevents that from happening once
> :> >    you no longer grab the lock?
> :>
> :> I like your question.
> :>
> :> Note that currently the pageq lock doesn't prevent another CPU from
> :> faulting between the PROT_NONE and uvm_pagefree().  When a fault
> :> occurs, the VA is translated into an `anon' which might or might not
> :> be associated with a physical page.
> :> In the fault handler accessing `anon->an_page' is done while holding
> :> the corresponding anon lock, which is the lock of the amap it belongs
> :> to.
> :>
> :> In other words, what prevents `pg' from being faulted back in is the
> :> anon lock, which is held during uvm_pageclean().
> :>
> :> The same logic applies to managed pages linked to an uobj, except that
> :> the lock is `vmobjlock'.
> :>
> :> Here's an updated diff with the comment removed and using atomic
> :> operations for uvmexp.wired.
> :
> :I missed a chunk in the previous diff when rebasing.  Complete diff
> :below.
> :
> :ok?
> :
>
> Tested on the arm64.p bulk builders, no issues seen.

Thanks Peter.  I've also received a couple of positive test reports
off-list.

Any ok?
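To recap the locking discipline after this diff, here is a minimal
sketch of the calling pattern before and after; it mirrors the
uvm_anon_release() hunk below, with the caller holding the owner lock
(the anon lock or `vmobjlock') in both cases:

	/* before: callers took the global pageqlock around the free */
	uvm_lock_pageq();
	pmap_page_protect(pg, PROT_NONE);
	uvm_pagefree(pg);
	uvm_unlock_pageq();

	/*
	 * after: the owner lock is what keeps the page from being
	 * faulted back in; uvm_pageclean() takes the pageqlock itself,
	 * and only while the page is still on the active/inactive queue.
	 */
	pmap_page_protect(pg, PROT_NONE);
	uvm_pagefree(pg);

Callers that already hold the pageqlock while walking the queues, like
uvmpd_scan_inactive() and uvn_flush(), now call uvm_pagedequeue()
explicitly before uvm_pagefree(), so uvm_pageclean() finds the page off
the queues and does not take the mutex again.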
Index: uvm/uvm_anon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
diff -u -p -r1.61 uvm_anon.c
--- uvm/uvm_anon.c	27 Dec 2024 12:04:40 -0000	1.61
+++ uvm/uvm_anon.c	26 Feb 2025 13:57:00 -0000
@@ -106,14 +106,10 @@ uvm_anfree_list(struct vm_anon *anon, st
 			 * clean page, and put it on pglist
 			 * for later freeing.
 			 */
-			uvm_lock_pageq();
 			uvm_pageclean(pg);
-			uvm_unlock_pageq();
 			TAILQ_INSERT_HEAD(pgl, pg, pageq);
 		} else {
-			uvm_lock_pageq();	/* lock out pagedaemon */
 			uvm_pagefree(pg);	/* bye bye */
-			uvm_unlock_pageq();	/* free the daemon */
 		}
 	} else {
 		if (anon->an_swslot != 0 && anon->an_swslot != SWSLOT_BAD) {
@@ -249,10 +245,8 @@ uvm_anon_release(struct vm_anon *anon)
 	KASSERT(pg->uanon == anon);
 	KASSERT(anon->an_ref == 0);
 
-	uvm_lock_pageq();
 	pmap_page_protect(pg, PROT_NONE);
 	uvm_pagefree(pg);
-	uvm_unlock_pageq();
 	KASSERT(anon->an_page == NULL);
 	lock = anon->an_lock;
 	uvm_anon_dropswap(anon);
Index: uvm/uvm_aobj.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
diff -u -p -r1.115 uvm_aobj.c
--- uvm/uvm_aobj.c	27 Dec 2024 12:04:40 -0000	1.115
+++ uvm/uvm_aobj.c	26 Feb 2025 13:57:00 -0000
@@ -839,9 +839,7 @@ uao_detach(struct uvm_object *uobj)
 			continue;
 		}
 		uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
-		uvm_lock_pageq();
 		uvm_pagefree(pg);
-		uvm_unlock_pageq();
 	}
 
 	/*
@@ -957,10 +955,7 @@ uao_flush(struct uvm_object *uobj, voff_
 			 * because we need to update swap accounting anyway.
 			 */
 			uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
-			uvm_lock_pageq();
 			uvm_pagefree(pg);
-			uvm_unlock_pageq();
-
 			continue;
 		default:
 			panic("uao_flush: weird flags");
@@ -1179,9 +1174,7 @@ uao_get(struct uvm_object *uobj, voff_t 
 			atomic_clearbits_int(&ptmp->pg_flags,
 			    PG_WANTED|PG_BUSY);
 			UVM_PAGE_OWN(ptmp, NULL);
-			uvm_lock_pageq();
 			uvm_pagefree(ptmp);
-			uvm_unlock_pageq();
 			rw_exit(uobj->vmobjlock);
 			return rv;
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
diff -u -p -r1.164 uvm_fault.c
--- uvm/uvm_fault.c	25 Feb 2025 11:29:17 -0000	1.164
+++ uvm/uvm_fault.c	26 Feb 2025 13:57:00 -0000
@@ -417,9 +417,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
 			 * cannot be mapped and thus no need to
 			 * pmap_page_protect() it.
 			 */
-			uvm_lock_pageq();
 			uvm_pagefree(pg);
-			uvm_unlock_pageq();
 
 			if (locked) {
 				uvmfault_unlockall(ufi, NULL, NULL);
Index: uvm/uvm_km.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
diff -u -p -r1.155 uvm_km.c
--- uvm/uvm_km.c	1 Nov 2024 20:26:18 -0000	1.155
+++ uvm/uvm_km.c	26 Feb 2025 13:57:00 -0000
@@ -270,9 +270,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
 		slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT);
 
 		if (pp != NULL) {
-			uvm_lock_pageq();
 			uvm_pagefree(pp);
-			uvm_unlock_pageq();
 		} else if (slot != 0) {
 			swpgonlydelta++;
 		}
Index: uvm/uvm_object.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_object.c,v
diff -u -p -r1.26 uvm_object.c
--- uvm/uvm_object.c	19 Feb 2025 11:10:54 -0000	1.26
+++ uvm/uvm_object.c	26 Feb 2025 13:57:00 -0000
@@ -238,9 +238,7 @@ uvm_obj_free(struct uvm_object *uobj)
 		 */
 		atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
 		pg->uobject = NULL;
-		uvm_lock_pageq();
 		uvm_pageclean(pg);
-		uvm_unlock_pageq();
 		TAILQ_INSERT_TAIL(&pgl, pg, pageq);
 	}
 	uvm_pglistfree(&pgl);
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
diff -u -p -r1.181 uvm_page.c
--- uvm/uvm_page.c	19 Feb 2025 11:10:54 -0000	1.181
+++ uvm/uvm_page.c	26 Feb 2025 13:57:00 -0000
@@ -863,9 +863,7 @@ uvm_pagerealloc_multi(struct uvm_object 
 			uvm_pagecopy(tpg, pg);
 			KASSERT(tpg->wire_count == 1);
 			tpg->wire_count = 0;
-			uvm_lock_pageq();
 			uvm_pagefree(tpg);
-			uvm_unlock_pageq();
 			uvm_pagealloc_pg(pg, obj, offset, NULL);
 		}
 	}
@@ -947,7 +945,6 @@ uvm_pagerealloc(struct vm_page *pg, stru
  * uvm_pageclean: clean page
  *
  * => erase page's identity (i.e. remove from object)
- * => caller must lock page queues if `pg' is managed
  * => assumes all valid mappings of pg are gone
  */
 void
@@ -955,10 +952,6 @@ uvm_pageclean(struct vm_page *pg)
 {
 	u_int flags_to_clear = 0;
 
-	if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
-	    (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
-		MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
-
 #ifdef DEBUG
 	if (pg->uobject == (void *)0xdeadbeef &&
 	    pg->uanon == (void *)0xdeadbeef) {
@@ -982,14 +975,18 @@
 	/*
 	 * now remove the page from the queues
 	 */
-	uvm_pagedequeue(pg);
+	if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) {
+		uvm_lock_pageq();
+		uvm_pagedequeue(pg);
+		uvm_unlock_pageq();
+	}
 
 	/*
 	 * if the page was wired, unwire it now.
 	 */
 	if (pg->wire_count) {
 		pg->wire_count = 0;
-		uvmexp.wired--;
+		atomic_dec_int(&uvmexp.wired);
 	}
 	if (pg->uanon) {
 		pg->uanon->an_page = NULL;
@@ -1235,7 +1232,7 @@ uvm_pagewire(struct vm_page *pg)
 
 	if (pg->wire_count == 0) {
 		uvm_pagedequeue(pg);
-		uvmexp.wired++;
+		atomic_inc_int(&uvmexp.wired);
 	}
 	pg->wire_count++;
 }
@@ -1255,7 +1252,7 @@
 	pg->wire_count--;
 	if (pg->wire_count == 0) {
 		uvm_pageactivate(pg);
-		uvmexp.wired--;
+		atomic_dec_int(&uvmexp.wired);
 	}
 }
 
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
diff -u -p -r1.93 uvm_pager.c
--- uvm/uvm_pager.c	25 Nov 2024 12:51:00 -0000	1.93
+++ uvm/uvm_pager.c	26 Feb 2025 13:57:00 -0000
@@ -761,7 +761,6 @@ uvm_aio_aiodone_pages(struct vm_page **p
 		anon_disposed = (pg->pg_flags & PG_RELEASED) != 0;
 		KASSERT(!anon_disposed || pg->uobject != NULL ||
 		    pg->uanon->an_ref == 0);
-		uvm_lock_pageq();
 
 		/*
 		 * if this was a successful write,
@@ -777,11 +776,9 @@ uvm_aio_aiodone_pages(struct vm_page **p
 		 * unlock everything for this page now.
 		 */
 		if (pg->uobject == NULL && anon_disposed) {
-			uvm_unlock_pageq();
 			uvm_anon_release(pg->uanon);
 		} else {
 			uvm_page_unbusy(&pg, 1);
-			uvm_unlock_pageq();
 			rw_exit(slock);
 		}
 	}
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.134 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	25 Jan 2025 08:55:52 -0000	1.134
+++ uvm/uvm_pdaemon.c	26 Feb 2025 13:57:00 -0000
@@ -596,6 +596,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 			/* zap all mappings with pmap_page_protect... */
 			pmap_page_protect(p, PROT_NONE);
+			/* dequeue first to prevent lock recursion */
+			uvm_pagedequeue(p);
 			uvm_pagefree(p);
 			freed++;
 
@@ -853,6 +855,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 				anon = NULL;
 				uvm_lock_pageq();
 				nextpg = TAILQ_NEXT(p, pageq);
+				/* dequeue first to prevent lock recursion */
+				uvm_pagedequeue(p);
 				/* free released page */
 				uvm_pagefree(p);
 			} else {	/* page was not released during I/O */
@@ -1055,7 +1059,6 @@ uvmpd_drop(struct pglist *pglst)
 			struct uvm_object * uobj = p->uobject;
 
 			rw_enter(uobj->vmobjlock, RW_WRITE);
-			uvm_lock_pageq();
 			/*
 			 * we now have the page queues locked.
 			 * the page is not busy.  if the page is clean we
@@ -1071,7 +1074,6 @@ uvmpd_drop(struct pglist *pglst)
 				pmap_page_protect(p, PROT_NONE);
 				uvm_pagefree(p);
 			}
-			uvm_unlock_pageq();
 			rw_exit(uobj->vmobjlock);
 		}
 	}
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
diff -u -p -r1.173 uvm_swap.c
--- uvm/uvm_swap.c	7 Nov 2024 09:04:55 -0000	1.173
+++ uvm/uvm_swap.c	26 Feb 2025 13:57:00 -0000
@@ -395,10 +395,8 @@ uvm_swap_freepages(struct vm_page **pps,
 		return;
 	}
 
-	uvm_lock_pageq();
 	for (i = 0; i < npages; i++)
 		uvm_pagefree(pps[i]);
-	uvm_unlock_pageq();
 }
 
Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
diff -u -p -r1.138 uvm_vnode.c
--- uvm/uvm_vnode.c	27 Dec 2024 12:04:40 -0000	1.138
+++ uvm/uvm_vnode.c	26 Feb 2025 13:57:00 -0000
@@ -602,13 +602,11 @@ uvn_flush(struct uvm_object *uobj, voff_
 	struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
 	struct vm_page *pp, *ptmp;
 	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
-	struct pglist dead;
 	int npages, result, lcv;
 	boolean_t retval, need_iosync, needs_clean;
 	voff_t curoff;
 
 	KASSERT(rw_write_held(uobj->vmobjlock));
-	TAILQ_INIT(&dead);
 
 	/* get init vals and determine how we are going to traverse object */
 	need_iosync = FALSE;
@@ -696,9 +694,9 @@ uvn_flush(struct uvm_object *uobj, voff_
 				continue;
 			} else {
 				pmap_page_protect(pp, PROT_NONE);
-				/* removed page from object */
-				uvm_pageclean(pp);
-				TAILQ_INSERT_HEAD(&dead, pp, pageq);
+				/* dequeue to prevent lock recursion */
+				uvm_pagedequeue(pp);
+				uvm_pagefree(pp);
 			}
 		}
 		continue;
@@ -830,8 +828,9 @@ ReTry:
 				retval = FALSE;
 			}
 			pmap_page_protect(ptmp, PROT_NONE);
-			uvm_pageclean(ptmp);
-			TAILQ_INSERT_TAIL(&dead, ptmp, pageq);
+			/* dequeue first to prevent lock recursion */
+			uvm_pagedequeue(ptmp);
+			uvm_pagefree(ptmp);
 		}
 
 	}		/* end of "lcv" for loop */
@@ -852,8 +851,6 @@ ReTry:
 		wakeup(&uvn->u_flags);
 		uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED);
 	}
-
-	uvm_pglistfree(&dead);
 
 	return retval;
 }
Index: uvm/uvmexp.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
diff -u -p -r1.17 uvmexp.h
--- uvm/uvmexp.h	25 Feb 2025 11:29:17 -0000	1.17
+++ uvm/uvmexp.h	26 Feb 2025 13:57:00 -0000
@@ -61,7 +61,7 @@ struct uvmexp {
 	int active;		/* [L] # of active pages */
 	int inactive;		/* [L] # of pages that we free'd but may want back */
 	int paging;		/* [a] # of pages in the process of being paged out */
-	int wired;		/* number of wired pages */
+	int wired;		/* [a] number of wired pages */
 	int zeropages;		/* [F] number of zero'd pages */
 	int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */