Push UVM's pageqlock inside uvm_pagefree()
On 25/02/25(Tue) 12:38, Peter Hessler wrote:
> On 2025 Feb 21 (Fri) at 10:55:42 +0100 (+0100), Martin Pieuchot wrote:
> :On 20/02/25(Thu) 16:21, Martin Pieuchot wrote:
> :> Hey Mark,
> :>
> :> Thanks for the review.
> :>
> :> On 19/02/25(Wed) 15:08, Mark Kettenis wrote:
> :> > > Date: Wed, 19 Feb 2025 13:11:09 +0100
> :> > > From: Martin Pieuchot <mpi@grenadille.net>
> :> > >
> :> > > Here's the first step to reduce the contention on the global pageqlock
> :> > > mutex. The lock is necessary to guarantee the integrity of the global
> :> > > LRUs. So it is only required around uvm_pagedequeue().
> :> > >
> :> > > This diff already makes a noticeable difference. ok?
> :> >
> :> > Possibly not ok.
> :> >
> :> > Is it possible that uvm_pageclean() gets called simultaneously by two
> :> > different CPUs for the same page? I'm not 100% sure that isn't
> :> > possible.
> :>
> :> If the page is associated with an `anon' or `uobject', we already
> :> assert that the so-called owner lock is held. So it is not possible.
> :>
> :> > But if you can convince me that that isn't possible, there
> :> > are still a few issues:
> :> >
> :> > 1. Probably should remove the
> :> >
> :> > => caller must lock page queues if `pg' is managed
> :> >
> :> > bit from the comment.
> :>
> :> Indeed.
> :>
> :> > 2. I think this means that uvmexp.wired-- now happens without holding
> :> > the uvmexp.pageqlock. So that counter may drift and that in turn
> :> > may cause problems in the pagedaemon. Maybe we need to use atomic
> :> > ops for it?
> :>
> :> I agree, we do.
> :>
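(For context: in the diff below, the updates of the wired counter in
uvm_pagewire()/uvm_pageunwire() simply become atomic operations, roughly:)

	/* uvm_pagewire(), after the diff: */
	if (pg->wire_count == 0) {
		uvm_pagedequeue(pg);
		atomic_inc_int(&uvmexp.wired);
	}
	pg->wire_count++;

	/* uvm_pageunwire(), after the diff: */
	pg->wire_count--;
	if (pg->wire_count == 0) {
		uvm_pageactivate(pg);
		atomic_dec_int(&uvmexp.wired);
	}
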
> :> > 3. I think you're removing an optimization in uvn_flush(). That
> :> > function would accumulate cleaned pages and return them to
> :> > pmemrange in one go. That potentially avoids lots of unlock/relock
> :> > cases.
> :>
> :> This is now done via per-CPU magazines. I committed a similar change to
> :> uvm_anfree(). In my measurements it decreases contention on the fpageq
> :> lock because we no longer replenish a magazine or give pages back to
> :> the pmemrange allocator every time uvn_flush() is called.
> :>
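(Conceptually, a per-CPU magazine batches freed pages so the global
free-page lock is only taken once per batch. A minimal sketch of the idea,
with hypothetical names; this is not the actual OpenBSD interface:)

	#define MAGAZINE_SIZE	16	/* hypothetical batch size */

	struct pcpu_magazine {
		int		pm_count;
		struct pglist	pm_pgl;		/* pages cached on this CPU */
	};

	void
	magazine_put(struct pcpu_magazine *pm, struct vm_page *pg)
	{
		uvm_pageclean(pg);		/* erase the page's identity */
		TAILQ_INSERT_HEAD(&pm->pm_pgl, pg, pageq);
		if (++pm->pm_count == MAGAZINE_SIZE) {
			/* one trip to the pmemrange allocator per batch */
			uvm_pglistfree(&pm->pm_pgl);
			TAILQ_INIT(&pm->pm_pgl);
			pm->pm_count = 0;
		}
	}
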
> :> > 4. The current code has some cases that do:
> :> >
> :> > uvm_lock_pageq();
> :> > pmap_page_protect(pg, PROT_NONE);
> :> > uvm_pagefree(pg);
> :> > uvm_unlock_pageq();
> :> >
> :> > I'm wondering if that is trying to stop the case where we remove
> :> > the page from the pmap, but another CPU tries to access the page
> :> > and fault it back in. So what prevents that from happening once
> :> > you no longer grab the lock?
> :>
> :> I like your question.
> :>
> :> Note that currently the pageq lock doesn't prevent another CPU from
> :> faulting between the PROT_NONE and uvm_pagefree(). When a fault occurs,
> :> the VA is translated into an `anon' which may or may not be associated
> :> with a physical page.
> :> In the fault handler, accessing `anon->an_page' is done while holding
> :> the corresponding anon lock, which is the lock of the amap it belongs to.
> :>
> :> In other words, what prevents `pg' from being faulted back in is the
> :> anon lock, which is held during uvm_pageclean().
> :>
> :> The same logic applies to managed pages linked to a uobj, except that
> :> the lock is `vmobjlock'.
> :>
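(A simplified sketch of that ordering for the anon case; this is
illustrative, not the literal fault-handler code:)

	/* freeing side, e.g. uvm_anon_release(): */
	rw_enter(anon->an_lock, RW_WRITE);
	pmap_page_protect(pg, PROT_NONE);
	uvm_pagefree(pg);		/* clears anon->an_page */
	rw_exit(anon->an_lock);

	/* faulting side, e.g. uvmfault_anonget(): */
	rw_enter(anon->an_lock, RW_WRITE);
	pg = anon->an_page;		/* NULL once the page has been freed */
	if (pg == NULL) {
		/* no resident page: allocate one or page in from swap */
	}
	rw_exit(anon->an_lock);

Since both sides serialize on anon->an_lock, the fault path can never see
`pg' half-way between pmap_page_protect() and uvm_pagefree().
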
> :> Here's an updated diff with the comment removed and using atomic
> :> operations for uvmexp.wired.
> :
> :I missed a chunk in the previous diff when rebasing. Complete diff
> :below.
> :
> :ok?
> :
>
> Tested on the arm64.p bulk builders, no issues seen.
Thanks Peter. I've also received a couple of positive test reports
off-list.
Any ok?
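For anyone skimming the diff: the core of the change is in uvm_pageclean(),
which now takes the pageqlock itself, and only when the page actually sits
on an LRU queue. Callers that still hold the pageqlock at that point (the
pagedaemon scan loop, uvn_flush()) dequeue the page first so that
uvm_pagefree() does not try to re-acquire the mutex:

	/* uvm_pageclean(), after the diff: */
	if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) {
		uvm_lock_pageq();
		uvm_pagedequeue(pg);
		uvm_unlock_pageq();
	}

	/* a caller already holding the pageqlock, e.g. the pagedaemon: */
	/* dequeue first to prevent lock recursion */
	uvm_pagedequeue(p);
	uvm_pagefree(p);
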
Index: uvm/uvm_anon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
diff -u -p -r1.61 uvm_anon.c
--- uvm/uvm_anon.c 27 Dec 2024 12:04:40 -0000 1.61
+++ uvm/uvm_anon.c 26 Feb 2025 13:57:00 -0000
@@ -106,14 +106,10 @@ uvm_anfree_list(struct vm_anon *anon, st
* clean page, and put it on pglist
* for later freeing.
*/
- uvm_lock_pageq();
uvm_pageclean(pg);
- uvm_unlock_pageq();
TAILQ_INSERT_HEAD(pgl, pg, pageq);
} else {
- uvm_lock_pageq(); /* lock out pagedaemon */
uvm_pagefree(pg); /* bye bye */
- uvm_unlock_pageq(); /* free the daemon */
}
} else {
if (anon->an_swslot != 0 && anon->an_swslot != SWSLOT_BAD) {
@@ -249,10 +245,8 @@ uvm_anon_release(struct vm_anon *anon)
KASSERT(pg->uanon == anon);
KASSERT(anon->an_ref == 0);
- uvm_lock_pageq();
pmap_page_protect(pg, PROT_NONE);
uvm_pagefree(pg);
- uvm_unlock_pageq();
KASSERT(anon->an_page == NULL);
lock = anon->an_lock;
uvm_anon_dropswap(anon);
Index: uvm/uvm_aobj.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
diff -u -p -r1.115 uvm_aobj.c
--- uvm/uvm_aobj.c 27 Dec 2024 12:04:40 -0000 1.115
+++ uvm/uvm_aobj.c 26 Feb 2025 13:57:00 -0000
@@ -839,9 +839,7 @@ uao_detach(struct uvm_object *uobj)
continue;
}
uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
- uvm_lock_pageq();
uvm_pagefree(pg);
- uvm_unlock_pageq();
}
/*
@@ -957,10 +955,7 @@ uao_flush(struct uvm_object *uobj, voff_
* because we need to update swap accounting anyway.
*/
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
- uvm_lock_pageq();
uvm_pagefree(pg);
- uvm_unlock_pageq();
-
continue;
default:
panic("uao_flush: weird flags");
@@ -1179,9 +1174,7 @@ uao_get(struct uvm_object *uobj, voff_t
atomic_clearbits_int(&ptmp->pg_flags,
PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(ptmp, NULL);
- uvm_lock_pageq();
uvm_pagefree(ptmp);
- uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
return rv;
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
diff -u -p -r1.164 uvm_fault.c
--- uvm/uvm_fault.c 25 Feb 2025 11:29:17 -0000 1.164
+++ uvm/uvm_fault.c 26 Feb 2025 13:57:00 -0000
@@ -417,9 +417,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
* cannot be mapped and thus no need to
* pmap_page_protect() it.
*/
- uvm_lock_pageq();
uvm_pagefree(pg);
- uvm_unlock_pageq();
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
Index: uvm/uvm_km.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
diff -u -p -r1.155 uvm_km.c
--- uvm/uvm_km.c 1 Nov 2024 20:26:18 -0000 1.155
+++ uvm/uvm_km.c 26 Feb 2025 13:57:00 -0000
@@ -270,9 +270,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT);
if (pp != NULL) {
- uvm_lock_pageq();
uvm_pagefree(pp);
- uvm_unlock_pageq();
} else if (slot != 0) {
swpgonlydelta++;
}
Index: uvm/uvm_object.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_object.c,v
diff -u -p -r1.26 uvm_object.c
--- uvm/uvm_object.c 19 Feb 2025 11:10:54 -0000 1.26
+++ uvm/uvm_object.c 26 Feb 2025 13:57:00 -0000
@@ -238,9 +238,7 @@ uvm_obj_free(struct uvm_object *uobj)
*/
atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
pg->uobject = NULL;
- uvm_lock_pageq();
uvm_pageclean(pg);
- uvm_unlock_pageq();
TAILQ_INSERT_TAIL(&pgl, pg, pageq);
}
uvm_pglistfree(&pgl);
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
diff -u -p -r1.181 uvm_page.c
--- uvm/uvm_page.c 19 Feb 2025 11:10:54 -0000 1.181
+++ uvm/uvm_page.c 26 Feb 2025 13:57:00 -0000
@@ -863,9 +863,7 @@ uvm_pagerealloc_multi(struct uvm_object
uvm_pagecopy(tpg, pg);
KASSERT(tpg->wire_count == 1);
tpg->wire_count = 0;
- uvm_lock_pageq();
uvm_pagefree(tpg);
- uvm_unlock_pageq();
uvm_pagealloc_pg(pg, obj, offset, NULL);
}
}
@@ -947,7 +945,6 @@ uvm_pagerealloc(struct vm_page *pg, stru
* uvm_pageclean: clean page
*
* => erase page's identity (i.e. remove from object)
- * => caller must lock page queues if `pg' is managed
* => assumes all valid mappings of pg are gone
*/
void
@@ -955,10 +952,6 @@ uvm_pageclean(struct vm_page *pg)
{
u_int flags_to_clear = 0;
- if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
- (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
- MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
-
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
@@ -982,14 +975,18 @@ uvm_pageclean(struct vm_page *pg)
/*
* now remove the page from the queues
*/
- uvm_pagedequeue(pg);
+ if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) {
+ uvm_lock_pageq();
+ uvm_pagedequeue(pg);
+ uvm_unlock_pageq();
+ }
/*
* if the page was wired, unwire it now.
*/
if (pg->wire_count) {
pg->wire_count = 0;
- uvmexp.wired--;
+ atomic_dec_int(&uvmexp.wired);
}
if (pg->uanon) {
pg->uanon->an_page = NULL;
@@ -1235,7 +1232,7 @@ uvm_pagewire(struct vm_page *pg)
if (pg->wire_count == 0) {
uvm_pagedequeue(pg);
- uvmexp.wired++;
+ atomic_inc_int(&uvmexp.wired);
}
pg->wire_count++;
}
@@ -1255,7 +1252,7 @@ uvm_pageunwire(struct vm_page *pg)
pg->wire_count--;
if (pg->wire_count == 0) {
uvm_pageactivate(pg);
- uvmexp.wired--;
+ atomic_dec_int(&uvmexp.wired);
}
}
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
diff -u -p -r1.93 uvm_pager.c
--- uvm/uvm_pager.c 25 Nov 2024 12:51:00 -0000 1.93
+++ uvm/uvm_pager.c 26 Feb 2025 13:57:00 -0000
@@ -761,7 +761,6 @@ uvm_aio_aiodone_pages(struct vm_page **p
anon_disposed = (pg->pg_flags & PG_RELEASED) != 0;
KASSERT(!anon_disposed || pg->uobject != NULL ||
pg->uanon->an_ref == 0);
- uvm_lock_pageq();
/*
* if this was a successful write,
@@ -777,11 +776,9 @@ uvm_aio_aiodone_pages(struct vm_page **p
* unlock everything for this page now.
*/
if (pg->uobject == NULL && anon_disposed) {
- uvm_unlock_pageq();
uvm_anon_release(pg->uanon);
} else {
uvm_page_unbusy(&pg, 1);
- uvm_unlock_pageq();
rw_exit(slock);
}
}
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.134 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c 25 Jan 2025 08:55:52 -0000 1.134
+++ uvm/uvm_pdaemon.c 26 Feb 2025 13:57:00 -0000
@@ -596,6 +596,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
/* zap all mappings with pmap_page_protect... */
pmap_page_protect(p, PROT_NONE);
+ /* dequeue first to prevent lock recursion */
+ uvm_pagedequeue(p);
uvm_pagefree(p);
freed++;
@@ -853,6 +855,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
anon = NULL;
uvm_lock_pageq();
nextpg = TAILQ_NEXT(p, pageq);
+ /* dequeue first to prevent lock recursion */
+ uvm_pagedequeue(p);
/* free released page */
uvm_pagefree(p);
} else { /* page was not released during I/O */
@@ -1055,7 +1059,6 @@ uvmpd_drop(struct pglist *pglst)
struct uvm_object * uobj = p->uobject;
rw_enter(uobj->vmobjlock, RW_WRITE);
- uvm_lock_pageq();
/*
* we now have the page queues locked.
* the page is not busy. if the page is clean we
@@ -1071,7 +1074,6 @@ uvmpd_drop(struct pglist *pglst)
pmap_page_protect(p, PROT_NONE);
uvm_pagefree(p);
}
- uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
}
}
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
diff -u -p -r1.173 uvm_swap.c
--- uvm/uvm_swap.c 7 Nov 2024 09:04:55 -0000 1.173
+++ uvm/uvm_swap.c 26 Feb 2025 13:57:00 -0000
@@ -395,10 +395,8 @@ uvm_swap_freepages(struct vm_page **pps,
return;
}
- uvm_lock_pageq();
for (i = 0; i < npages; i++)
uvm_pagefree(pps[i]);
- uvm_unlock_pageq();
}
Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
diff -u -p -r1.138 uvm_vnode.c
--- uvm/uvm_vnode.c 27 Dec 2024 12:04:40 -0000 1.138
+++ uvm/uvm_vnode.c 26 Feb 2025 13:57:00 -0000
@@ -602,13 +602,11 @@ uvn_flush(struct uvm_object *uobj, voff_
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
struct vm_page *pp, *ptmp;
struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
- struct pglist dead;
int npages, result, lcv;
boolean_t retval, need_iosync, needs_clean;
voff_t curoff;
KASSERT(rw_write_held(uobj->vmobjlock));
- TAILQ_INIT(&dead);
/* get init vals and determine how we are going to traverse object */
need_iosync = FALSE;
@@ -696,9 +694,9 @@ uvn_flush(struct uvm_object *uobj, voff_
continue;
} else {
pmap_page_protect(pp, PROT_NONE);
- /* removed page from object */
- uvm_pageclean(pp);
- TAILQ_INSERT_HEAD(&dead, pp, pageq);
+ /* dequeue to prevent lock recursion */
+ uvm_pagedequeue(pp);
+ uvm_pagefree(pp);
}
}
continue;
@@ -830,8 +828,9 @@ ReTry:
retval = FALSE;
}
pmap_page_protect(ptmp, PROT_NONE);
- uvm_pageclean(ptmp);
- TAILQ_INSERT_TAIL(&dead, ptmp, pageq);
+ /* dequeue first to prevent lock recursion */
+ uvm_pagedequeue(ptmp);
+ uvm_pagefree(ptmp);
}
} /* end of "lcv" for loop */
@@ -852,8 +851,6 @@ ReTry:
wakeup(&uvn->u_flags);
uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED);
}
-
- uvm_pglistfree(&dead);
return retval;
}
Index: uvm/uvmexp.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
diff -u -p -r1.17 uvmexp.h
--- uvm/uvmexp.h 25 Feb 2025 11:29:17 -0000 1.17
+++ uvm/uvmexp.h 26 Feb 2025 13:57:00 -0000
@@ -61,7 +61,7 @@ struct uvmexp {
int active; /* [L] # of active pages */
int inactive; /* [L] # of pages that we free'd but may want back */
int paging; /* [a] # of pages in the process of being paged out */
- int wired; /* number of wired pages */
+ int wired; /* [a] number of wired pages */
int zeropages; /* [F] number of zero'd pages */
int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */