UVM performance improvements
The diff below includes all the changes I made over the last months to
get rid of lock contention and enable parallel fault handling.

With it I measured a 5% performance improvement and a 25% reduction in
%sys time on a 24-CPU amd64 machine.  On my 80-CPU arm64 machine,
performance also improves by 5% while %sys time increases by ~5%.
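
For reference, the idea behind parallel fault handling is to take the
amap/object locks shared and only upgrade to an exclusive lock when the
fault actually needs to modify state (page allocation, promotion, COW).
Below is a minimal, standalone sketch of that try-upgrade-or-restart
pattern; it is not part of the diff, the function and variable names are
made up, and the real code is uvm_fault_upper_upgrade() and
uvm_fault_lower_upgrade() in the patch:

#include <sys/param.h>
#include <sys/rwlock.h>
#include <sys/errno.h>

int
try_upgrade_or_restart(struct rwlock *lock, int *lock_type)
{
	/* Fast path: we already hold the lock exclusively. */
	if (*lock_type == RW_WRITE)
		return 0;

	/*
	 * Remember that any retry must use an exclusive lock, then
	 * attempt a non-sleeping upgrade.  If another reader holds
	 * the lock the upgrade fails and the caller is expected to
	 * unlock everything and restart the fault.
	 */
	*lock_type = RW_WRITE;
	if (rw_enter(lock, RW_UPGRADE | RW_NOSLEEP))
		return ERESTART;

	return 0;
}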
I'd be happy to hear about more tests, especially on bulk builds and on
other architectures.
Thanks for your help,
Martin
Index: uvm/uvm_amap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_amap.c,v
diff -u -p -r1.96 uvm_amap.c
--- uvm/uvm_amap.c 4 Dec 2024 09:19:11 -0000 1.96
+++ uvm/uvm_amap.c 4 Mar 2025 13:02:35 -0000
@@ -823,9 +823,7 @@ ReStart:
*/
atomic_clearbits_int(&npg->pg_flags, PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(npg, NULL);
- uvm_lock_pageq();
uvm_pageactivate(npg);
- uvm_unlock_pageq();
}
}
amap_unlock(amap);
Index: uvm/uvm_anon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
diff -u -p -r1.61 uvm_anon.c
--- uvm/uvm_anon.c 27 Dec 2024 12:04:40 -0000 1.61
+++ uvm/uvm_anon.c 4 Mar 2025 13:02:35 -0000
@@ -106,14 +106,10 @@ uvm_anfree_list(struct vm_anon *anon, st
* clean page, and put it on pglist
* for later freeing.
*/
- uvm_lock_pageq();
uvm_pageclean(pg);
- uvm_unlock_pageq();
TAILQ_INSERT_HEAD(pgl, pg, pageq);
} else {
- uvm_lock_pageq(); /* lock out pagedaemon */
uvm_pagefree(pg); /* bye bye */
- uvm_unlock_pageq(); /* free the daemon */
}
} else {
if (anon->an_swslot != 0 && anon->an_swslot != SWSLOT_BAD) {
@@ -181,6 +177,8 @@ uvm_anon_pagein(struct vm_amap *amap, st
* anon was freed.
*/
return FALSE;
+ case ENOLCK:
+ /* Should not be possible. */
default:
#ifdef DIAGNOSTIC
panic("anon_pagein: uvmfault_anonget -> %d", rv);
@@ -202,9 +200,7 @@ uvm_anon_pagein(struct vm_amap *amap, st
/*
* Deactivate the page (to put it on a page queue).
*/
- uvm_lock_pageq();
uvm_pagedeactivate(pg);
- uvm_unlock_pageq();
rw_exit(anon->an_lock);
return FALSE;
@@ -249,10 +245,8 @@ uvm_anon_release(struct vm_anon *anon)
KASSERT(pg->uanon == anon);
KASSERT(anon->an_ref == 0);
- uvm_lock_pageq();
pmap_page_protect(pg, PROT_NONE);
uvm_pagefree(pg);
- uvm_unlock_pageq();
KASSERT(anon->an_page == NULL);
lock = anon->an_lock;
uvm_anon_dropswap(anon);
Index: uvm/uvm_aobj.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
diff -u -p -r1.115 uvm_aobj.c
--- uvm/uvm_aobj.c 27 Dec 2024 12:04:40 -0000 1.115
+++ uvm/uvm_aobj.c 4 Mar 2025 13:02:35 -0000
@@ -839,9 +839,7 @@ uao_detach(struct uvm_object *uobj)
continue;
}
uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
- uvm_lock_pageq();
uvm_pagefree(pg);
- uvm_unlock_pageq();
}
/*
@@ -921,18 +919,10 @@ uao_flush(struct uvm_object *uobj, voff_
* XXX in the future.
*/
case PGO_CLEANIT|PGO_FREE:
- /* FALLTHROUGH */
case PGO_CLEANIT|PGO_DEACTIVATE:
- /* FALLTHROUGH */
case PGO_DEACTIVATE:
deactivate_it:
- if (pg->wire_count != 0)
- continue;
-
- uvm_lock_pageq();
uvm_pagedeactivate(pg);
- uvm_unlock_pageq();
-
continue;
case PGO_FREE:
/*
@@ -957,10 +947,7 @@ uao_flush(struct uvm_object *uobj, voff_
* because we need to update swap accounting anyway.
*/
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
- uvm_lock_pageq();
uvm_pagefree(pg);
- uvm_unlock_pageq();
-
continue;
default:
panic("uao_flush: weird flags");
@@ -1179,9 +1166,7 @@ uao_get(struct uvm_object *uobj, voff_t
atomic_clearbits_int(&ptmp->pg_flags,
PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(ptmp, NULL);
- uvm_lock_pageq();
uvm_pagefree(ptmp);
- uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
return rv;
@@ -1410,9 +1395,7 @@ uao_pagein_page(struct uvm_aobj *aobj, i
/*
* deactivate the page (to put it on a page queue).
*/
- uvm_lock_pageq();
uvm_pagedeactivate(pg);
- uvm_unlock_pageq();
return FALSE;
}
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
diff -u -p -r1.164 uvm_fault.c
--- uvm/uvm_fault.c 25 Feb 2025 11:29:17 -0000 1.164
+++ uvm/uvm_fault.c 4 Mar 2025 13:03:16 -0000
@@ -43,6 +43,8 @@
#include <uvm/uvm.h>
+int pfault = 0; /* resolve fault in parallel */
+
/*
*
* a word on page faults:
@@ -183,11 +185,7 @@ uvmfault_anonflush(struct vm_anon **anon
KASSERT(rw_lock_held(anons[lcv]->an_lock));
pg = anons[lcv]->an_page;
if (pg && (pg->pg_flags & PG_BUSY) == 0) {
- uvm_lock_pageq();
- if (pg->wire_count == 0) {
- uvm_pagedeactivate(pg);
- }
- uvm_unlock_pageq();
+ uvm_pagedeactivate(pg);
}
}
}
@@ -277,6 +275,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
struct vm_anon *anon)
{
struct vm_page *pg;
+ int lock_type;
int error;
KASSERT(rw_lock_held(anon->an_lock));
@@ -305,6 +304,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
/*
* Is page resident? Make sure it is not busy/released.
*/
+ lock_type = rw_status(anon->an_lock);
if (pg) {
KASSERT(pg->pg_flags & PQ_ANON);
KASSERT(pg->uanon == anon);
@@ -326,8 +326,13 @@ uvmfault_anonget(struct uvm_faultinfo *u
uvm_pagewait(pg, anon->an_lock, "anonget");
} else {
/*
- * No page, therefore allocate one.
+ * No page, therefore allocate one. A write lock is
+ * required for this. If the caller didn't supply
+ * one, fail now and have them retry.
*/
+ if (lock_type == RW_READ) {
+ return ENOLCK;
+ }
pg = uvm_pagealloc(NULL, 0, anon, 0);
if (pg == NULL) {
/* Out of memory. Wait a little. */
@@ -417,9 +422,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
* cannot be mapped and thus no need to
* pmap_page_protect() it.
*/
- uvm_lock_pageq();
uvm_pagefree(pg);
- uvm_unlock_pageq();
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
@@ -437,9 +440,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
* We have successfully read the page, activate it.
*/
pmap_clear_modify(pg);
- uvm_lock_pageq();
uvm_pageactivate(pg);
- uvm_unlock_pageq();
atomic_clearbits_int(&pg->pg_flags,
PG_WANTED|PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
@@ -500,6 +501,7 @@ uvmfault_promote(struct uvm_faultinfo *u
if (uobjpage != PGO_DONTCARE)
uobj = uobjpage->uobject;
+ KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
anon = uvm_analloc();
@@ -611,6 +613,7 @@ struct uvm_faultctx {
boolean_t wired;
paddr_t pa_flags;
boolean_t promote;
+ int upper_lock_type;
int lower_lock_type;
};
@@ -655,11 +658,14 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
flt.access_type = access_type;
flt.narrow = FALSE; /* assume normal fault for now */
flt.wired = FALSE; /* assume non-wired fault for now */
-#if notyet
- flt.lower_lock_type = RW_READ; /* shared lock for now */
-#else
- flt.lower_lock_type = RW_WRITE; /* exclusive lock for now */
-#endif
+ if (pfault) {
+ /* shared lock for now */
+ flt.upper_lock_type = RW_READ;
+ flt.lower_lock_type = RW_READ;
+ } else {
+ flt.upper_lock_type = RW_WRITE;
+ flt.lower_lock_type = RW_WRITE;
+ }
error = ERESTART;
while (error == ERESTART) { /* ReFault: */
@@ -842,7 +848,13 @@ uvm_fault_check(struct uvm_faultinfo *uf
* if we've got an amap then lock it and extract current anons.
*/
if (amap) {
- amap_lock(amap, RW_WRITE);
+ if ((flt->access_type & PROT_WRITE) != 0) {
+ /*
+ * assume we're about to COW.
+ */
+ flt->upper_lock_type = RW_WRITE;
+ }
+ amap_lock(amap, flt->upper_lock_type);
amap_lookups(&ufi->entry->aref,
flt->startva - ufi->entry->start, *ranons, flt->npages);
} else {
@@ -894,6 +906,36 @@ uvm_fault_check(struct uvm_faultinfo *uf
}
/*
+ * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
+ */
+static inline int
+uvm_fault_upper_upgrade(struct uvm_faultctx *flt, struct vm_amap *amap)
+{
+ KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
+
+ /*
+ * fast path.
+ */
+ if (flt->upper_lock_type == RW_WRITE) {
+ return 0;
+ }
+
+ /*
+ * otherwise try for the upgrade. if we don't get it, unlock
+ * everything, restart the fault and next time around get a writer
+ * lock.
+ */
+ flt->upper_lock_type = RW_WRITE;
+ if (rw_enter(amap->am_lock, RW_UPGRADE|RW_NOSLEEP)) {
+ counters_inc(uvmexp_counters, flt_noup);
+ return ERESTART;
+ }
+ counters_inc(uvmexp_counters, flt_up);
+ KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
+ return 0;
+}
+
+/*
* uvm_fault_upper_lookup: look up existing h/w mapping and amap.
*
* iterate range of interest:
@@ -916,9 +958,8 @@ uvm_fault_upper_lookup(struct uvm_faulti
paddr_t pa;
int lcv, entered = 0;
- /* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
- rw_write_held(amap->am_lock));
+ rw_status(amap->am_lock) == flt->upper_lock_type);
/*
* map in the backpages and frontpages we found in the amap in hopes
@@ -956,9 +997,7 @@ uvm_fault_upper_lookup(struct uvm_faulti
*/
if (pg && (pg->pg_flags & (PG_RELEASED|PG_BUSY)) == 0 &&
!pmap_extract(ufi->orig_map->pmap, currva, &pa)) {
- uvm_lock_pageq();
uvm_pageactivate(pg); /* reactivate */
- uvm_unlock_pageq();
counters_inc(uvmexp_counters, flt_namap);
/* No fault-ahead when wired. */
@@ -1000,8 +1039,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
struct vm_page *pg = NULL;
int error, ret;
- /* locked: maps(read), amap, anon */
- KASSERT(rw_write_held(amap->am_lock));
+ KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
/*
@@ -1014,6 +1052,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
* if it succeeds, locks are still valid and locked.
* also, if it is OK, then the anon's page is on the queues.
*/
+retry:
error = uvmfault_anonget(ufi, amap, anon);
switch (error) {
case 0:
@@ -1022,11 +1061,21 @@ uvm_fault_upper(struct uvm_faultinfo *uf
case ERESTART:
return ERESTART;
+ case ENOLCK:
+ /* it needs a write lock: retry */
+ error = uvm_fault_upper_upgrade(flt, amap);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, NULL);
+ return error;
+ }
+ KASSERT(rw_write_held(amap->am_lock));
+ goto retry;
+
default:
return error;
}
- KASSERT(rw_write_held(amap->am_lock));
+ KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
/*
@@ -1041,9 +1090,13 @@ uvm_fault_upper(struct uvm_faultinfo *uf
*
* if we are out of anon VM we wait for RAM to become available.
*/
-
if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
/* promoting requires a write lock. */
+ error = uvm_fault_upper_upgrade(flt, amap);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, NULL);
+ return error;
+ }
KASSERT(rw_write_held(amap->am_lock));
counters_inc(uvmexp_counters, flt_acow);
@@ -1066,6 +1119,14 @@ uvm_fault_upper(struct uvm_faultinfo *uf
KASSERT(oanon->an_ref > 1);
oanon->an_ref--;
+ /*
+ * note: oanon is still locked, as is the new anon. we
+ * need to check for this later when we unlock oanon; if
+ * oanon != anon, we'll have to unlock anon, too.
+ */
+ KASSERT(anon->an_lock == amap->am_lock);
+ KASSERT(oanon->an_lock == amap->am_lock);
+
#if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
/*
* If there are multiple threads, either uvm or the
@@ -1080,12 +1141,6 @@ uvm_fault_upper(struct uvm_faultinfo *uf
flt->access_type &= ~PROT_WRITE;
}
#endif
-
- /*
- * note: anon is _not_ locked, but we have the sole references
- * to in from amap.
- * thus, no one can get at it until we are done with it.
- */
} else {
counters_inc(uvmexp_counters, flt_anon);
oanon = anon;
@@ -1124,13 +1179,11 @@ uvm_fault_upper(struct uvm_faultinfo *uf
/*
* ... update the page queues.
*/
- uvm_lock_pageq();
if (flt->wired) {
uvm_pagewire(pg);
} else {
uvm_pageactivate(pg);
}
- uvm_unlock_pageq();
if (flt->wired) {
/*
@@ -1215,11 +1268,7 @@ uvm_fault_lower_lookup(
* are neither busy nor released, so we don't need to check
* for this. we can just directly enter the pages.
*/
- if (pages[lcv]->wire_count == 0) {
- uvm_lock_pageq();
- uvm_pageactivate(pages[lcv]);
- uvm_unlock_pageq();
- }
+ uvm_pageactivate(pages[lcv]);
counters_inc(uvmexp_counters, flt_nomap);
/* No fault-ahead when wired. */
@@ -1248,10 +1297,8 @@ uvm_fault_lower_lookup(
* uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
*/
static inline int
-uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
- struct vm_amap *amap, struct uvm_object *uobj)
+uvm_fault_lower_upgrade(struct uvm_faultctx *flt, struct uvm_object *uobj)
{
- KASSERT(uobj != NULL);
KASSERT(flt->lower_lock_type == rw_status(uobj->vmobjlock));
/*
@@ -1267,7 +1314,6 @@ uvm_fault_lower_upgrade(struct uvm_fault
*/
flt->lower_lock_type = RW_WRITE;
if (rw_enter(uobj->vmobjlock, RW_UPGRADE|RW_NOSLEEP)) {
- uvmfault_unlockall(ufi, amap, uobj);
counters_inc(uvmexp_counters, flt_noup);
return ERESTART;
}
@@ -1321,11 +1367,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf
* made it BUSY.
*/
- /*
- * locked:
- */
KASSERT(amap == NULL ||
- rw_write_held(amap->am_lock));
+ rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_status(uobj->vmobjlock) == flt->lower_lock_type);
@@ -1356,9 +1399,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
/* update rusage counters */
curproc->p_ru.ru_minflt++;
if (uobjpage != PGO_DONTCARE) {
- uvm_lock_pageq();
uvm_pageactivate(uobjpage);
- uvm_unlock_pageq();
}
} else {
error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage);
@@ -1394,6 +1435,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf
KASSERT(amap != NULL);
/* promoting requires a write lock. */
+ error = uvm_fault_upper_upgrade(flt, amap);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, uobj);
+ return error;
+ }
KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL ||
rw_status(uobj->vmobjlock) == flt->lower_lock_type);
@@ -1470,7 +1516,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
* Note: pg is either the uobjpage or the new page in the new anon.
*/
KASSERT(amap == NULL ||
- rw_write_held(amap->am_lock));
+ rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_status(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
@@ -1508,7 +1554,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf
return ERESTART;
}
- uvm_lock_pageq();
if (flt->wired) {
uvm_pagewire(pg);
if (pg->pg_flags & PQ_AOBJ) {
@@ -1530,7 +1575,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf
} else {
uvm_pageactivate(pg);
}
- uvm_unlock_pageq();
if (dropswap)
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
@@ -1574,9 +1618,11 @@ uvm_fault_lower_io(
advice = ufi->entry->advice;
/* Upgrade to a write lock if needed. */
- error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj);
- if (error != 0)
+ error = uvm_fault_lower_upgrade(flt, uobj);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, uobj);
return error;
+ }
uvmfault_unlockall(ufi, amap, NULL);
/* update rusage counters */
@@ -1612,7 +1658,7 @@ uvm_fault_lower_io(
/* re-verify the state of the world. */
locked = uvmfault_relock(ufi);
if (locked && amap != NULL)
- amap_lock(amap, RW_WRITE);
+ amap_lock(amap, flt->upper_lock_type);
/* might be changed */
if (pg != PGO_DONTCARE) {
@@ -1635,9 +1681,7 @@ uvm_fault_lower_io(
/* release the page now, still holding object lock */
if (pg != PGO_DONTCARE) {
- uvm_lock_pageq();
uvm_pageactivate(pg);
- uvm_unlock_pageq();
if (pg->pg_flags & PG_WANTED)
wakeup(pg);
@@ -1739,16 +1783,12 @@ uvm_fault_unwire_locked(vm_map_t map, va
* find the map entry for the current address.
*/
KASSERT(va >= entry->start);
- while (entry && va >= entry->end) {
+ while (va >= entry->end) {
next = RBT_NEXT(uvm_map_addr, entry);
+ KASSERT(next != NULL && next->start <= entry->end);
entry = next;
}
- if (entry == NULL)
- return;
- if (va < entry->start)
- continue;
-
/*
* lock it.
*/
@@ -1771,14 +1811,12 @@ uvm_fault_unwire_locked(vm_map_t map, va
pg = PHYS_TO_VM_PAGE(pa);
if (pg) {
- uvm_lock_pageq();
uvm_pageunwire(pg);
- uvm_unlock_pageq();
}
}
if (oentry != NULL) {
- uvm_map_unlock_entry(oentry);
+ uvm_map_unlock_entry(entry);
}
}
Index: uvm/uvm_glue.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_glue.c,v
diff -u -p -r1.87 uvm_glue.c
--- uvm/uvm_glue.c 28 Oct 2024 08:25:32 -0000 1.87
+++ uvm/uvm_glue.c 4 Mar 2025 13:02:35 -0000
@@ -114,7 +114,7 @@ uvm_vslock(struct proc *p, caddr_t addr,
if (end <= start)
return (EINVAL);
- return uvm_fault_wire(map, start, end, access_type);
+ return uvm_map_pageable(map, start, end, FALSE, 0);
}
/*
@@ -125,13 +125,14 @@ uvm_vslock(struct proc *p, caddr_t addr,
void
uvm_vsunlock(struct proc *p, caddr_t addr, size_t len)
{
+ struct vm_map *map = &p->p_vmspace->vm_map;
vaddr_t start, end;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
KASSERT(end > start);
- uvm_fault_unwire(&p->p_vmspace->vm_map, start, end);
+ uvm_map_pageable(map, start, end, TRUE, 0);
}
/*
Index: uvm/uvm_km.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
diff -u -p -r1.155 uvm_km.c
--- uvm/uvm_km.c 1 Nov 2024 20:26:18 -0000 1.155
+++ uvm/uvm_km.c 4 Mar 2025 13:02:35 -0000
@@ -270,9 +270,7 @@ uvm_km_pgremove(struct uvm_object *uobj,
slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT);
if (pp != NULL) {
- uvm_lock_pageq();
uvm_pagefree(pp);
- uvm_unlock_pageq();
} else if (slot != 0) {
swpgonlydelta++;
}
Index: uvm/uvm_map.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
diff -u -p -r1.338 uvm_map.c
--- uvm/uvm_map.c 29 Jan 2025 15:25:31 -0000 1.338
+++ uvm/uvm_map.c 4 Mar 2025 13:02:35 -0000
@@ -4517,16 +4517,8 @@ uvm_map_clean(struct vm_map *map, vaddr_
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
- /* skip the page if it's wired */
- if (pg->wire_count != 0)
- break;
-
- uvm_lock_pageq();
-
KASSERT(pg->uanon == anon);
uvm_pagedeactivate(pg);
-
- uvm_unlock_pageq();
break;
case PGO_FREE:
/*
Index: uvm/uvm_object.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_object.c,v
diff -u -p -r1.26 uvm_object.c
--- uvm/uvm_object.c 19 Feb 2025 11:10:54 -0000 1.26
+++ uvm/uvm_object.c 4 Mar 2025 13:02:35 -0000
@@ -161,13 +161,11 @@ uvm_obj_wire(struct uvm_object *uobj, vo
}
/* Wire the pages */
- uvm_lock_pageq();
for (i = 0; i < npages; i++) {
uvm_pagewire(pgs[i]);
if (pageq != NULL)
TAILQ_INSERT_TAIL(pageq, pgs[i], pageq);
}
- uvm_unlock_pageq();
/* Unbusy the pages */
uvm_page_unbusy(pgs, npages);
@@ -198,7 +196,6 @@ uvm_obj_unwire(struct uvm_object *uobj,
off_t offset;
rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK);
- uvm_lock_pageq();
for (offset = start; offset < end; offset += PAGE_SIZE) {
pg = uvm_pagelookup(uobj, offset);
@@ -207,7 +204,6 @@ uvm_obj_unwire(struct uvm_object *uobj,
uvm_pageunwire(pg);
}
- uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
}
#endif /* !SMALL_KERNEL */
@@ -238,9 +234,7 @@ uvm_obj_free(struct uvm_object *uobj)
*/
atomic_clearbits_int(&pg->pg_flags, PG_TABLED);
pg->uobject = NULL;
- uvm_lock_pageq();
uvm_pageclean(pg);
- uvm_unlock_pageq();
TAILQ_INSERT_TAIL(&pgl, pg, pageq);
}
uvm_pglistfree(&pgl);
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
diff -u -p -r1.181 uvm_page.c
--- uvm/uvm_page.c 19 Feb 2025 11:10:54 -0000 1.181
+++ uvm/uvm_page.c 4 Mar 2025 13:02:35 -0000
@@ -863,9 +863,7 @@ uvm_pagerealloc_multi(struct uvm_object
uvm_pagecopy(tpg, pg);
KASSERT(tpg->wire_count == 1);
tpg->wire_count = 0;
- uvm_lock_pageq();
uvm_pagefree(tpg);
- uvm_unlock_pageq();
uvm_pagealloc_pg(pg, obj, offset, NULL);
}
}
@@ -947,7 +945,6 @@ uvm_pagerealloc(struct vm_page *pg, stru
* uvm_pageclean: clean page
*
* => erase page's identity (i.e. remove from object)
- * => caller must lock page queues if `pg' is managed
* => assumes all valid mappings of pg are gone
*/
void
@@ -955,10 +952,6 @@ uvm_pageclean(struct vm_page *pg)
{
u_int flags_to_clear = 0;
- if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) &&
- (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject)))
- MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
-
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
@@ -982,14 +975,18 @@ uvm_pageclean(struct vm_page *pg)
/*
* now remove the page from the queues
*/
- uvm_pagedequeue(pg);
+ if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) {
+ uvm_lock_pageq();
+ uvm_pagedequeue(pg);
+ uvm_unlock_pageq();
+ }
/*
* if the page was wired, unwire it now.
*/
if (pg->wire_count) {
pg->wire_count = 0;
- uvmexp.wired--;
+ atomic_dec_int(&uvmexp.wired);
}
if (pg->uanon) {
pg->uanon->an_page = NULL;
@@ -1231,11 +1228,12 @@ void
uvm_pagewire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, TRUE));
- MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
if (pg->wire_count == 0) {
+ uvm_lock_pageq();
uvm_pagedequeue(pg);
- uvmexp.wired++;
+ uvm_unlock_pageq();
+ atomic_inc_int(&uvmexp.wired);
}
pg->wire_count++;
}
@@ -1250,12 +1248,11 @@ void
uvm_pageunwire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, TRUE));
- MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
pg->wire_count--;
if (pg->wire_count == 0) {
uvm_pageactivate(pg);
- uvmexp.wired--;
+ atomic_dec_int(&uvmexp.wired);
}
}
@@ -1270,51 +1267,62 @@ void
uvm_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, FALSE));
- MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
+
+ if (pg->wire_count > 0) {
+ KASSERT((pg->pg_flags & (PQ_INACTIVE|PQ_ACTIVE)) == 0);
+ return;
+ }
+
+ if (pg->pg_flags & PQ_INACTIVE)
+ return;
pmap_page_protect(pg, PROT_NONE);
+ uvm_lock_pageq();
+ if (pg->pg_flags & PQ_INACTIVE) {
+ uvm_unlock_pageq();
+ return;
+ }
if (pg->pg_flags & PQ_ACTIVE) {
TAILQ_REMOVE(&uvm.page_active, pg, pageq);
atomic_clearbits_int(&pg->pg_flags, PQ_ACTIVE);
uvmexp.active--;
}
- if ((pg->pg_flags & PQ_INACTIVE) == 0) {
- KASSERT(pg->wire_count == 0);
- TAILQ_INSERT_TAIL(&uvm.page_inactive, pg, pageq);
- atomic_setbits_int(&pg->pg_flags, PQ_INACTIVE);
- uvmexp.inactive++;
- pmap_clear_reference(pg);
- /*
- * update the "clean" bit. this isn't 100%
- * accurate, and doesn't have to be. we'll
- * re-sync it after we zap all mappings when
- * scanning the inactive list.
- */
- if ((pg->pg_flags & PG_CLEAN) != 0 &&
- pmap_is_modified(pg))
- atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
- }
+ TAILQ_INSERT_TAIL(&uvm.page_inactive, pg, pageq);
+ atomic_setbits_int(&pg->pg_flags, PQ_INACTIVE);
+ uvmexp.inactive++;
+ uvm_unlock_pageq();
+
+ pmap_clear_reference(pg);
+ /*
+ * update the "clean" bit. this isn't 100%
+ * accurate, and doesn't have to be. we'll
+ * re-sync it after we zap all mappings when
+ * scanning the inactive list.
+ */
+ if ((pg->pg_flags & PG_CLEAN) != 0 && pmap_is_modified(pg))
+ atomic_clearbits_int(&pg->pg_flags, PG_CLEAN);
}
/*
* uvm_pageactivate: activate page
- *
- * => caller must lock page queues
*/
void
uvm_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, FALSE));
- MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
-
- uvm_pagedequeue(pg);
- if (pg->wire_count == 0) {
- TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq);
- atomic_setbits_int(&pg->pg_flags, PQ_ACTIVE);
- uvmexp.active++;
+ if (pg->wire_count > 0) {
+ KASSERT((pg->pg_flags & (PQ_INACTIVE|PQ_ACTIVE)) == 0);
+ return;
}
+
+ uvm_lock_pageq();
+ uvm_pagedequeue(pg);
+ TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq);
+ atomic_setbits_int(&pg->pg_flags, PQ_ACTIVE);
+ uvmexp.active++;
+ uvm_unlock_pageq();
}
/*
@@ -1369,7 +1377,9 @@ uvm_page_owner_locked_p(struct vm_page *
: rw_lock_held(pg->uobject->vmobjlock);
}
if (pg->uanon != NULL) {
- return rw_write_held(pg->uanon->an_lock);
+ return exclusive
+ ? rw_write_held(pg->uanon->an_lock)
+ : rw_lock_held(pg->uanon->an_lock);
}
return 1;
}
Index: uvm/uvm_page.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.h,v
diff -u -p -r1.72 uvm_page.h
--- uvm/uvm_page.h 19 Feb 2025 11:07:47 -0000 1.72
+++ uvm/uvm_page.h 4 Mar 2025 13:02:35 -0000
@@ -147,13 +147,12 @@ struct vm_page {
#define PG_RDONLY 0x00000080 /* page must be mapped read-only */
#define PG_ZERO 0x00000100 /* page is pre-zero'd */
#define PG_DEV 0x00000200 /* page is in device space, lay off */
-
-#define PG_PAGER1 0x00001000 /* pager-specific flag */
#define PG_MASK 0x0000ffff
#define PQ_FREE 0x00010000 /* page is on free list */
#define PQ_INACTIVE 0x00020000 /* page is in inactive list */
#define PQ_ACTIVE 0x00040000 /* page is in active list */
+#define PQ_ITER 0x00080000 /* page is an iterator marker */
#define PQ_ANON 0x00100000 /* page is part of an anon, rather
than an uvm_object */
#define PQ_AOBJ 0x00200000 /* page is part of an anonymous
Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
diff -u -p -r1.93 uvm_pager.c
--- uvm/uvm_pager.c 25 Nov 2024 12:51:00 -0000 1.93
+++ uvm/uvm_pager.c 4 Mar 2025 13:02:35 -0000
@@ -761,7 +761,6 @@ uvm_aio_aiodone_pages(struct vm_page **p
anon_disposed = (pg->pg_flags & PG_RELEASED) != 0;
KASSERT(!anon_disposed || pg->uobject != NULL ||
pg->uanon->an_ref == 0);
- uvm_lock_pageq();
/*
* if this was a successful write,
@@ -777,11 +776,9 @@ uvm_aio_aiodone_pages(struct vm_page **p
* unlock everything for this page now.
*/
if (pg->uobject == NULL && anon_disposed) {
- uvm_unlock_pageq();
uvm_anon_release(pg->uanon);
} else {
uvm_page_unbusy(&pg, 1);
- uvm_unlock_pageq();
rw_exit(slock);
}
}
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.134 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c 25 Jan 2025 08:55:52 -0000 1.134
+++ uvm/uvm_pdaemon.c 4 Mar 2025 13:02:35 -0000
@@ -453,6 +453,29 @@ uvmpd_match_constraint(struct vm_page *p
return 0;
}
+struct vm_page *
+uvmpd_iterator(struct pglist *pglst, struct vm_page *p, struct vm_page *iter)
+{
+ struct vm_page *nextpg = NULL;
+
+ MUTEX_ASSERT_LOCKED(&uvm.pageqlock);
+
+ /* p is null to signal final swap i/o. */
+ if (p == NULL)
+ return NULL;
+
+ do {
+ nextpg = TAILQ_NEXT(iter, pageq);
+ } while (nextpg && (nextpg->pg_flags & PQ_ITER));
+
+ if (nextpg) {
+ TAILQ_REMOVE(pglst, iter, pageq);
+ TAILQ_INSERT_AFTER(pglst, nextpg, iter, pageq);
+ }
+
+ return nextpg;
+}
+
/*
* uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
*
@@ -467,7 +490,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
{
struct pglist *pglst = &uvm.page_inactive;
int result, freed = 0;
- struct vm_page *p, *nextpg;
+ struct vm_page *p, iter = { .pg_flags = PQ_ITER };
struct uvm_object *uobj;
struct vm_page *pps[SWCLUSTPAGES], **ppsp;
int npages;
@@ -501,7 +524,12 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
break;
}
- for (; p != NULL || swslot != 0; p = nextpg) {
+ if (p == NULL)
+ return 0;
+
+ /* Insert iterator. */
+ TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
+ for (; p != NULL || swslot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
/*
* note that p can be NULL iff we have traversed the whole
* list and need to do one final swap-backed clustered pageout.
@@ -522,7 +550,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
/* set p to null to signal final swap i/o */
p = NULL;
- nextpg = NULL;
}
}
if (p) { /* if (we have a new page to consider) */
@@ -530,7 +557,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
* we are below target and have a new page to consider.
*/
uvmexp.pdscans++;
- nextpg = TAILQ_NEXT(p, pageq);
/*
* If we are not short on memory and only interested
@@ -563,8 +589,10 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
* and skip to next page.
*/
if (pmap_is_referenced(p)) {
+ uvm_unlock_pageq();
uvm_pageactivate(p);
rw_exit(slock);
+ uvm_lock_pageq();
uvmexp.pdreact++;
continue;
}
@@ -596,6 +624,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
/* zap all mappings with pmap_page_protect... */
pmap_page_protect(p, PROT_NONE);
+ /* dequeue first to prevent lock recursion */
+ uvm_pagedequeue(p);
uvm_pagefree(p);
freed++;
@@ -633,8 +663,10 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
*/
if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
dirtyreacts++;
+ uvm_unlock_pageq();
uvm_pageactivate(p);
rw_exit(slock);
+ uvm_lock_pageq();
continue;
}
@@ -772,26 +804,11 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
* async I/O is in progress and the async I/O done routine
* will clean up after us. in this case we move on to the
* next page.
- *
- * there is a very remote chance that the pending async i/o can
- * finish _before_ we get here. if that happens, our page "p"
- * may no longer be on the inactive queue. so we verify this
- * when determining the next page (starting over at the head if
- * we've lost our inactive page).
*/
-
if (result == VM_PAGER_PEND) {
atomic_add_int(&uvmexp.paging, npages);
uvm_lock_pageq();
uvmexp.pdpending++;
- if (p) {
- if (p->pg_flags & PQ_INACTIVE)
- nextpg = TAILQ_NEXT(p, pageq);
- else
- nextpg = TAILQ_FIRST(pglst);
- } else {
- nextpg = NULL;
- }
continue;
}
@@ -851,13 +868,9 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
uvm_anfree(anon); /* kills anon */
pmap_page_protect(p, PROT_NONE);
anon = NULL;
- uvm_lock_pageq();
- nextpg = TAILQ_NEXT(p, pageq);
/* free released page */
uvm_pagefree(p);
} else { /* page was not released during I/O */
- uvm_lock_pageq();
- nextpg = TAILQ_NEXT(p, pageq);
if (result != VM_PAGER_OK) {
/* pageout was a failure... */
if (result != VM_PAGER_AGAIN)
@@ -871,33 +884,15 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
PG_CLEAN);
}
}
-
- /*
- * drop object lock (if there is an object left). do
- * a safety check of nextpg to make sure it is on the
- * inactive queue (it should be since PG_BUSY pages on
- * the inactive queue can't be re-queued [note: not
- * true for active queue]).
- */
rw_exit(slock);
-
- if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) {
- nextpg = TAILQ_FIRST(pglst); /* reload! */
- }
- } else {
- /*
- * if p is null in this loop, make sure it stays null
- * in the next loop.
- */
- nextpg = NULL;
-
- /*
- * lock page queues here just so they're always locked
- * at the end of the loop.
- */
- uvm_lock_pageq();
}
+ /*
+ * lock page queues here just so they're always locked
+ * at the end of the loop.
+ */
+ uvm_lock_pageq();
}
+ TAILQ_REMOVE(pglst, &iter, pageq);
return freed;
}
@@ -1019,8 +1014,9 @@ uvmpd_scan_active(struct uvm_pmalloc *pm
* inactive pages.
*/
if (inactive_shortage > 0) {
- /* no need to check wire_count as pg is "active" */
+ uvm_unlock_pageq();
uvm_pagedeactivate(p);
+ uvm_lock_pageq();
uvmexp.pddeact++;
inactive_shortage--;
}
@@ -1055,7 +1051,6 @@ uvmpd_drop(struct pglist *pglst)
struct uvm_object * uobj = p->uobject;
rw_enter(uobj->vmobjlock, RW_WRITE);
- uvm_lock_pageq();
/*
* we now have the page queues locked.
* the page is not busy. if the page is clean we
@@ -1071,7 +1066,6 @@ uvmpd_drop(struct pglist *pglst)
pmap_page_protect(p, PROT_NONE);
uvm_pagefree(p);
}
- uvm_unlock_pageq();
rw_exit(uobj->vmobjlock);
}
}
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
diff -u -p -r1.173 uvm_swap.c
--- uvm/uvm_swap.c 7 Nov 2024 09:04:55 -0000 1.173
+++ uvm/uvm_swap.c 4 Mar 2025 13:02:35 -0000
@@ -395,10 +395,8 @@ uvm_swap_freepages(struct vm_page **pps,
return;
}
- uvm_lock_pageq();
for (i = 0; i < npages; i++)
uvm_pagefree(pps[i]);
- uvm_unlock_pageq();
}
Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
diff -u -p -r1.138 uvm_vnode.c
--- uvm/uvm_vnode.c 27 Dec 2024 12:04:40 -0000 1.138
+++ uvm/uvm_vnode.c 4 Mar 2025 13:02:35 -0000
@@ -602,13 +602,11 @@ uvn_flush(struct uvm_object *uobj, voff_
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
struct vm_page *pp, *ptmp;
struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
- struct pglist dead;
int npages, result, lcv;
boolean_t retval, need_iosync, needs_clean;
voff_t curoff;
KASSERT(rw_write_held(uobj->vmobjlock));
- TAILQ_INIT(&dead);
/* get init vals and determine how we are going to traverse object */
need_iosync = FALSE;
@@ -682,9 +680,9 @@ uvn_flush(struct uvm_object *uobj, voff_
/* if we don't need a clean, deactivate/free pages then cont. */
if (!needs_clean) {
if (flags & PGO_DEACTIVATE) {
- if (pp->wire_count == 0) {
- uvm_pagedeactivate(pp);
- }
+ uvm_unlock_pageq();
+ uvm_pagedeactivate(pp);
+ uvm_lock_pageq();
} else if (flags & PGO_FREE) {
if (pp->pg_flags & PG_BUSY) {
uvm_unlock_pageq();
@@ -696,9 +694,9 @@ uvn_flush(struct uvm_object *uobj, voff_
continue;
} else {
pmap_page_protect(pp, PROT_NONE);
- /* removed page from object */
- uvm_pageclean(pp);
- TAILQ_INSERT_HEAD(&dead, pp, pageq);
+ /* dequeue to prevent lock recursion */
+ uvm_pagedequeue(pp);
+ uvm_pagefree(pp);
}
}
continue;
@@ -807,9 +805,9 @@ ReTry:
/* dispose of page */
if (flags & PGO_DEACTIVATE) {
- if (ptmp->wire_count == 0) {
- uvm_pagedeactivate(ptmp);
- }
+ uvm_unlock_pageq();
+ uvm_pagedeactivate(ptmp);
+ uvm_lock_pageq();
} else if (flags & PGO_FREE &&
result != VM_PAGER_PEND) {
if (result != VM_PAGER_OK) {
@@ -830,8 +828,9 @@ ReTry:
retval = FALSE;
}
pmap_page_protect(ptmp, PROT_NONE);
- uvm_pageclean(ptmp);
- TAILQ_INSERT_TAIL(&dead, ptmp, pageq);
+ /* dequeue first to prevent lock recursion */
+ uvm_pagedequeue(ptmp);
+ uvm_pagefree(ptmp);
}
} /* end of "lcv" for loop */
@@ -852,8 +851,6 @@ ReTry:
wakeup(&uvn->u_flags);
uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED);
}
-
- uvm_pglistfree(&dead);
return retval;
}
Index: uvm/uvmexp.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
diff -u -p -r1.17 uvmexp.h
--- uvm/uvmexp.h 25 Feb 2025 11:29:17 -0000 1.17
+++ uvm/uvmexp.h 4 Mar 2025 13:02:35 -0000
@@ -61,7 +61,7 @@ struct uvmexp {
int active; /* [L] # of active pages */
int inactive; /* [L] # of pages that we free'd but may want back */
int paging; /* [a] # of pages in the process of being paged out */
- int wired; /* number of wired pages */
+ int wired; /* [a] number of wired pages */
int zeropages; /* [F] number of zero'd pages */
int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */
UVM performance improvements