From: Martin Pieuchot
Subject: UVM performance improvements
To: tech@openbsd.org
Cc: robert@openbsd.org
Date: Tue, 4 Mar 2025 14:25:44 +0100

The diff below includes all the changes I made over the past months to
get rid of lock contention and enable parallel fault handling.

With it I measured a 5% performance improvement and a 25% reduction in
%sys time on a 24-CPU amd64 machine.  On my 80-CPU arm64, performance
also improves by 5% and %sys time increases by ~5%.

I'd be happy to hear about more tests, especially on bulk builds and on
other architectures.

Thanks for your help,
Martin

Index: uvm/uvm_amap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_amap.c,v diff -u -p -r1.96 uvm_amap.c --- uvm/uvm_amap.c 4 Dec 2024 09:19:11 -0000 1.96 +++ uvm/uvm_amap.c 4 Mar 2025 13:02:35 -0000 @@ -823,9 +823,7 @@ ReStart: */ atomic_clearbits_int(&npg->pg_flags, PG_BUSY|PG_FAKE); UVM_PAGE_OWN(npg, NULL); - uvm_lock_pageq(); uvm_pageactivate(npg); - uvm_unlock_pageq(); } } amap_unlock(amap); Index: uvm/uvm_anon.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_anon.c,v diff -u -p -r1.61 uvm_anon.c --- uvm/uvm_anon.c 27 Dec 2024 12:04:40 -0000 1.61 +++ uvm/uvm_anon.c 4 Mar 2025 13:02:35 -0000 @@ -106,14 +106,10 @@ uvm_anfree_list(struct vm_anon *anon, st * clean page, and put it on pglist * for later freeing. */ - uvm_lock_pageq(); uvm_pageclean(pg); - uvm_unlock_pageq(); TAILQ_INSERT_HEAD(pgl, pg, pageq); } else { - uvm_lock_pageq(); /* lock out pagedaemon */ uvm_pagefree(pg); /* bye bye */ - uvm_unlock_pageq(); /* free the daemon */ } } else { if (anon->an_swslot != 0 && anon->an_swslot != SWSLOT_BAD) { @@ -181,6 +177,8 @@ uvm_anon_pagein(struct vm_amap *amap, st * anon was freed. */ return FALSE; + case ENOLCK: + /* Should not be possible. */ default: #ifdef DIAGNOSTIC panic("anon_pagein: uvmfault_anonget -> %d", rv); @@ -202,9 +200,7 @@ uvm_anon_pagein(struct vm_amap *amap, st /* * Deactivate the page (to put it on a page queue). */ - uvm_lock_pageq(); uvm_pagedeactivate(pg); - uvm_unlock_pageq(); rw_exit(anon->an_lock); return FALSE; @@ -249,10 +245,8 @@ uvm_anon_release(struct vm_anon *anon) KASSERT(pg->uanon == anon); KASSERT(anon->an_ref == 0); - uvm_lock_pageq(); pmap_page_protect(pg, PROT_NONE); uvm_pagefree(pg); - uvm_unlock_pageq(); KASSERT(anon->an_page == NULL); lock = anon->an_lock; uvm_anon_dropswap(anon); Index: uvm/uvm_aobj.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v diff -u -p -r1.115 uvm_aobj.c --- uvm/uvm_aobj.c 27 Dec 2024 12:04:40 -0000 1.115 +++ uvm/uvm_aobj.c 4 Mar 2025 13:02:35 -0000 @@ -839,9 +839,7 @@ uao_detach(struct uvm_object *uobj) continue; } uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT); - uvm_lock_pageq(); uvm_pagefree(pg); - uvm_unlock_pageq(); } /* @@ -921,18 +919,10 @@ uao_flush(struct uvm_object *uobj, voff_ * XXX in the future. */ case PGO_CLEANIT|PGO_FREE: - /* FALLTHROUGH */ case PGO_CLEANIT|PGO_DEACTIVATE: - /* FALLTHROUGH */ case PGO_DEACTIVATE: deactivate_it: - if (pg->wire_count != 0) - continue; - - uvm_lock_pageq(); uvm_pagedeactivate(pg); - uvm_unlock_pageq(); - continue; case PGO_FREE: /* @@ -957,10 +947,7 @@ uao_flush(struct uvm_object *uobj, voff_ * because we need to update swap accounting anyway.
*/ uao_dropswap(uobj, pg->offset >> PAGE_SHIFT); - uvm_lock_pageq(); uvm_pagefree(pg); - uvm_unlock_pageq(); - continue; default: panic("uao_flush: weird flags"); @@ -1179,9 +1166,7 @@ uao_get(struct uvm_object *uobj, voff_t atomic_clearbits_int(&ptmp->pg_flags, PG_WANTED|PG_BUSY); UVM_PAGE_OWN(ptmp, NULL); - uvm_lock_pageq(); uvm_pagefree(ptmp); - uvm_unlock_pageq(); rw_exit(uobj->vmobjlock); return rv; @@ -1410,9 +1395,7 @@ uao_pagein_page(struct uvm_aobj *aobj, i /* * deactivate the page (to put it on a page queue). */ - uvm_lock_pageq(); uvm_pagedeactivate(pg); - uvm_unlock_pageq(); return FALSE; } Index: uvm/uvm_fault.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_fault.c,v diff -u -p -r1.164 uvm_fault.c --- uvm/uvm_fault.c 25 Feb 2025 11:29:17 -0000 1.164 +++ uvm/uvm_fault.c 4 Mar 2025 13:03:16 -0000 @@ -43,6 +43,8 @@ #include +int pfault = 0; /* resolve fault in parallel */ + /* * * a word on page faults: @@ -183,11 +185,7 @@ uvmfault_anonflush(struct vm_anon **anon KASSERT(rw_lock_held(anons[lcv]->an_lock)); pg = anons[lcv]->an_page; if (pg && (pg->pg_flags & PG_BUSY) == 0) { - uvm_lock_pageq(); - if (pg->wire_count == 0) { - uvm_pagedeactivate(pg); - } - uvm_unlock_pageq(); + uvm_pagedeactivate(pg); } } } @@ -277,6 +275,7 @@ uvmfault_anonget(struct uvm_faultinfo *u struct vm_anon *anon) { struct vm_page *pg; + int lock_type; int error; KASSERT(rw_lock_held(anon->an_lock)); @@ -305,6 +304,7 @@ uvmfault_anonget(struct uvm_faultinfo *u /* * Is page resident? Make sure it is not busy/released. */ + lock_type = rw_status(anon->an_lock); if (pg) { KASSERT(pg->pg_flags & PQ_ANON); KASSERT(pg->uanon == anon); @@ -326,8 +326,13 @@ uvmfault_anonget(struct uvm_faultinfo *u uvm_pagewait(pg, anon->an_lock, "anonget"); } else { /* - * No page, therefore allocate one. + * No page, therefore allocate one. A write lock is + * required for this. If the caller didn't supply + * one, fail now and have them retry. */ + if (lock_type == RW_READ) { + return ENOLCK; + } pg = uvm_pagealloc(NULL, 0, anon, 0); if (pg == NULL) { /* Out of memory. Wait a little. */ @@ -417,9 +422,7 @@ uvmfault_anonget(struct uvm_faultinfo *u * cannot be mapped and thus no need to * pmap_page_protect() it. */ - uvm_lock_pageq(); uvm_pagefree(pg); - uvm_unlock_pageq(); if (locked) { uvmfault_unlockall(ufi, NULL, NULL); @@ -437,9 +440,7 @@ uvmfault_anonget(struct uvm_faultinfo *u * We have successfully read the page, activate it. 
*/ pmap_clear_modify(pg); - uvm_lock_pageq(); uvm_pageactivate(pg); - uvm_unlock_pageq(); atomic_clearbits_int(&pg->pg_flags, PG_WANTED|PG_BUSY|PG_FAKE); UVM_PAGE_OWN(pg, NULL); @@ -500,6 +501,7 @@ uvmfault_promote(struct uvm_faultinfo *u if (uobjpage != PGO_DONTCARE) uobj = uobjpage->uobject; + KASSERT(rw_write_held(amap->am_lock)); KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock)); anon = uvm_analloc(); @@ -611,6 +613,7 @@ struct uvm_faultctx { boolean_t wired; paddr_t pa_flags; boolean_t promote; + int upper_lock_type; int lower_lock_type; }; @@ -655,11 +658,14 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad flt.access_type = access_type; flt.narrow = FALSE; /* assume normal fault for now */ flt.wired = FALSE; /* assume non-wired fault for now */ -#if notyet - flt.lower_lock_type = RW_READ; /* shared lock for now */ -#else - flt.lower_lock_type = RW_WRITE; /* exclusive lock for now */ -#endif + if (pfault) { + /* shared lock for now */ + flt.upper_lock_type = RW_READ; + flt.lower_lock_type = RW_READ; + } else { + flt.upper_lock_type = RW_WRITE; + flt.lower_lock_type = RW_WRITE; + } error = ERESTART; while (error == ERESTART) { /* ReFault: */ @@ -842,7 +848,13 @@ uvm_fault_check(struct uvm_faultinfo *uf * if we've got an amap then lock it and extract current anons. */ if (amap) { - amap_lock(amap, RW_WRITE); + if ((flt->access_type & PROT_WRITE) != 0) { + /* + * assume we're about to COW. + */ + flt->upper_lock_type = RW_WRITE; + } + amap_lock(amap, flt->upper_lock_type); amap_lookups(&ufi->entry->aref, flt->startva - ufi->entry->start, *ranons, flt->npages); } else { @@ -894,6 +906,36 @@ uvm_fault_check(struct uvm_faultinfo *uf } /* + * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer + */ +static inline int +uvm_fault_upper_upgrade(struct uvm_faultctx *flt, struct vm_amap *amap) +{ + KASSERT(flt->upper_lock_type == rw_status(amap->am_lock)); + + /* + * fast path. + */ + if (flt->upper_lock_type == RW_WRITE) { + return 0; + } + + /* + * otherwise try for the upgrade. if we don't get it, unlock + * everything, restart the fault and next time around get a writer + * lock. + */ + flt->upper_lock_type = RW_WRITE; + if (rw_enter(amap->am_lock, RW_UPGRADE|RW_NOSLEEP)) { + counters_inc(uvmexp_counters, flt_noup); + return ERESTART; + } + counters_inc(uvmexp_counters, flt_up); + KASSERT(flt->upper_lock_type == rw_status(amap->am_lock)); + return 0; +} + +/* * uvm_fault_upper_lookup: look up existing h/w mapping and amap. * * iterate range of interest: @@ -916,9 +958,8 @@ uvm_fault_upper_lookup(struct uvm_faulti paddr_t pa; int lcv, entered = 0; - /* locked: maps(read), amap(if there) */ KASSERT(amap == NULL || - rw_write_held(amap->am_lock)); + rw_status(amap->am_lock) == flt->upper_lock_type); /* * map in the backpages and frontpages we found in the amap in hopes @@ -956,9 +997,7 @@ uvm_fault_upper_lookup(struct uvm_faulti */ if (pg && (pg->pg_flags & (PG_RELEASED|PG_BUSY)) == 0 && !pmap_extract(ufi->orig_map->pmap, currva, &pa)) { - uvm_lock_pageq(); uvm_pageactivate(pg); /* reactivate */ - uvm_unlock_pageq(); counters_inc(uvmexp_counters, flt_namap); /* No fault-ahead when wired. 
*/ @@ -1000,8 +1039,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf struct vm_page *pg = NULL; int error, ret; - /* locked: maps(read), amap, anon */ - KASSERT(rw_write_held(amap->am_lock)); + KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type); KASSERT(anon->an_lock == amap->am_lock); /* @@ -1014,6 +1052,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf * if it succeeds, locks are still valid and locked. * also, if it is OK, then the anon's page is on the queues. */ +retry: error = uvmfault_anonget(ufi, amap, anon); switch (error) { case 0: @@ -1022,11 +1061,21 @@ uvm_fault_upper(struct uvm_faultinfo *uf case ERESTART: return ERESTART; + case ENOLCK: + /* it needs a write lock: retry */ + error = uvm_fault_upper_upgrade(flt, amap); + if (error != 0) { + uvmfault_unlockall(ufi, amap, NULL); + return error; + } + KASSERT(rw_write_held(amap->am_lock)); + goto retry; + default: return error; } - KASSERT(rw_write_held(amap->am_lock)); + KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type); KASSERT(anon->an_lock == amap->am_lock); /* @@ -1041,9 +1090,13 @@ uvm_fault_upper(struct uvm_faultinfo *uf * * if we are out of anon VM we wait for RAM to become available. */ - if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) { /* promoting requires a write lock. */ + error = uvm_fault_upper_upgrade(flt, amap); + if (error != 0) { + uvmfault_unlockall(ufi, amap, NULL); + return error; + } KASSERT(rw_write_held(amap->am_lock)); counters_inc(uvmexp_counters, flt_acow); @@ -1066,6 +1119,14 @@ uvm_fault_upper(struct uvm_faultinfo *uf KASSERT(oanon->an_ref > 1); oanon->an_ref--; + /* + * note: oanon is still locked, as is the new anon. we + * need to check for this later when we unlock oanon; if + * oanon != anon, we'll have to unlock anon, too. + */ + KASSERT(anon->an_lock == amap->am_lock); + KASSERT(oanon->an_lock == amap->am_lock); + #if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW) /* * If there are multiple threads, either uvm or the @@ -1080,12 +1141,6 @@ uvm_fault_upper(struct uvm_faultinfo *uf flt->access_type &= ~PROT_WRITE; } #endif - - /* - * note: anon is _not_ locked, but we have the sole references - * to in from amap. - * thus, no one can get at it until we are done with it. - */ } else { counters_inc(uvmexp_counters, flt_anon); oanon = anon; @@ -1124,13 +1179,11 @@ uvm_fault_upper(struct uvm_faultinfo *uf /* * ... update the page queues. */ - uvm_lock_pageq(); if (flt->wired) { uvm_pagewire(pg); } else { uvm_pageactivate(pg); } - uvm_unlock_pageq(); if (flt->wired) { /* @@ -1215,11 +1268,7 @@ uvm_fault_lower_lookup( * are neither busy nor released, so we don't need to check * for this. we can just directly enter the pages. */ - if (pages[lcv]->wire_count == 0) { - uvm_lock_pageq(); - uvm_pageactivate(pages[lcv]); - uvm_unlock_pageq(); - } + uvm_pageactivate(pages[lcv]); counters_inc(uvmexp_counters, flt_nomap); /* No fault-ahead when wired. 
*/ @@ -1248,10 +1297,8 @@ uvm_fault_lower_lookup( * uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer */ static inline int -uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt, - struct vm_amap *amap, struct uvm_object *uobj) +uvm_fault_lower_upgrade(struct uvm_faultctx *flt, struct uvm_object *uobj) { - KASSERT(uobj != NULL); KASSERT(flt->lower_lock_type == rw_status(uobj->vmobjlock)); /* @@ -1267,7 +1314,6 @@ uvm_fault_lower_upgrade(struct uvm_fault */ flt->lower_lock_type = RW_WRITE; if (rw_enter(uobj->vmobjlock, RW_UPGRADE|RW_NOSLEEP)) { - uvmfault_unlockall(ufi, amap, uobj); counters_inc(uvmexp_counters, flt_noup); return ERESTART; } @@ -1321,11 +1367,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf * made it BUSY. */ - /* - * locked: - */ KASSERT(amap == NULL || - rw_write_held(amap->am_lock)); + rw_status(amap->am_lock) == flt->upper_lock_type); KASSERT(uobj == NULL || rw_status(uobj->vmobjlock) == flt->lower_lock_type); @@ -1356,9 +1399,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf /* update rusage counters */ curproc->p_ru.ru_minflt++; if (uobjpage != PGO_DONTCARE) { - uvm_lock_pageq(); uvm_pageactivate(uobjpage); - uvm_unlock_pageq(); } } else { error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage); @@ -1394,6 +1435,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf KASSERT(amap != NULL); /* promoting requires a write lock. */ + error = uvm_fault_upper_upgrade(flt, amap); + if (error != 0) { + uvmfault_unlockall(ufi, amap, uobj); + return error; + } KASSERT(rw_write_held(amap->am_lock)); KASSERT(uobj == NULL || rw_status(uobj->vmobjlock) == flt->lower_lock_type); @@ -1470,7 +1516,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf * Note: pg is either the uobjpage or the new page in the new anon. */ KASSERT(amap == NULL || - rw_write_held(amap->am_lock)); + rw_status(amap->am_lock) == flt->upper_lock_type); KASSERT(uobj == NULL || rw_status(uobj->vmobjlock) == flt->lower_lock_type); KASSERT(anon == NULL || anon->an_lock == amap->am_lock); @@ -1508,7 +1554,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf return ERESTART; } - uvm_lock_pageq(); if (flt->wired) { uvm_pagewire(pg); if (pg->pg_flags & PQ_AOBJ) { @@ -1530,7 +1575,6 @@ uvm_fault_lower(struct uvm_faultinfo *uf } else { uvm_pageactivate(pg); } - uvm_unlock_pageq(); if (dropswap) uao_dropswap(uobj, pg->offset >> PAGE_SHIFT); @@ -1574,9 +1618,11 @@ uvm_fault_lower_io( advice = ufi->entry->advice; /* Upgrade to a write lock if needed. */ - error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj); - if (error != 0) + error = uvm_fault_lower_upgrade(flt, uobj); + if (error != 0) { + uvmfault_unlockall(ufi, amap, uobj); return error; + } uvmfault_unlockall(ufi, amap, NULL); /* update rusage counters */ @@ -1612,7 +1658,7 @@ uvm_fault_lower_io( /* re-verify the state of the world. */ locked = uvmfault_relock(ufi); if (locked && amap != NULL) - amap_lock(amap, RW_WRITE); + amap_lock(amap, flt->upper_lock_type); /* might be changed */ if (pg != PGO_DONTCARE) { @@ -1635,9 +1681,7 @@ uvm_fault_lower_io( /* release the page now, still holding object lock */ if (pg != PGO_DONTCARE) { - uvm_lock_pageq(); uvm_pageactivate(pg); - uvm_unlock_pageq(); if (pg->pg_flags & PG_WANTED) wakeup(pg); @@ -1739,16 +1783,12 @@ uvm_fault_unwire_locked(vm_map_t map, va * find the map entry for the current address. 
*/ KASSERT(va >= entry->start); - while (entry && va >= entry->end) { + while (va >= entry->end) { next = RBT_NEXT(uvm_map_addr, entry); + KASSERT(next != NULL && next->start <= entry->end); entry = next; } - if (entry == NULL) - return; - if (va < entry->start) - continue; - /* * lock it. */ @@ -1771,14 +1811,12 @@ uvm_fault_unwire_locked(vm_map_t map, va pg = PHYS_TO_VM_PAGE(pa); if (pg) { - uvm_lock_pageq(); uvm_pageunwire(pg); - uvm_unlock_pageq(); } } if (oentry != NULL) { - uvm_map_unlock_entry(oentry); + uvm_map_unlock_entry(entry); } } Index: uvm/uvm_glue.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_glue.c,v diff -u -p -r1.87 uvm_glue.c --- uvm/uvm_glue.c 28 Oct 2024 08:25:32 -0000 1.87 +++ uvm/uvm_glue.c 4 Mar 2025 13:02:35 -0000 @@ -114,7 +114,7 @@ uvm_vslock(struct proc *p, caddr_t addr, if (end <= start) return (EINVAL); - return uvm_fault_wire(map, start, end, access_type); + return uvm_map_pageable(map, start, end, FALSE, 0); } /* @@ -125,13 +125,14 @@ uvm_vslock(struct proc *p, caddr_t addr, void uvm_vsunlock(struct proc *p, caddr_t addr, size_t len) { + struct vm_map *map = &p->p_vmspace->vm_map; vaddr_t start, end; start = trunc_page((vaddr_t)addr); end = round_page((vaddr_t)addr + len); KASSERT(end > start); - uvm_fault_unwire(&p->p_vmspace->vm_map, start, end); + uvm_map_pageable(map, start, end, TRUE, 0); } /* Index: uvm/uvm_km.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_km.c,v diff -u -p -r1.155 uvm_km.c --- uvm/uvm_km.c 1 Nov 2024 20:26:18 -0000 1.155 +++ uvm/uvm_km.c 4 Mar 2025 13:02:35 -0000 @@ -270,9 +270,7 @@ uvm_km_pgremove(struct uvm_object *uobj, slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT); if (pp != NULL) { - uvm_lock_pageq(); uvm_pagefree(pp); - uvm_unlock_pageq(); } else if (slot != 0) { swpgonlydelta++; } Index: uvm/uvm_map.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_map.c,v diff -u -p -r1.338 uvm_map.c --- uvm/uvm_map.c 29 Jan 2025 15:25:31 -0000 1.338 +++ uvm/uvm_map.c 4 Mar 2025 13:02:35 -0000 @@ -4517,16 +4517,8 @@ uvm_map_clean(struct vm_map *map, vaddr_ case PGO_CLEANIT|PGO_DEACTIVATE: case PGO_DEACTIVATE: deactivate_it: - /* skip the page if it's wired */ - if (pg->wire_count != 0) - break; - - uvm_lock_pageq(); - KASSERT(pg->uanon == anon); uvm_pagedeactivate(pg); - - uvm_unlock_pageq(); break; case PGO_FREE: /* Index: uvm/uvm_object.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_object.c,v diff -u -p -r1.26 uvm_object.c --- uvm/uvm_object.c 19 Feb 2025 11:10:54 -0000 1.26 +++ uvm/uvm_object.c 4 Mar 2025 13:02:35 -0000 @@ -161,13 +161,11 @@ uvm_obj_wire(struct uvm_object *uobj, vo } /* Wire the pages */ - uvm_lock_pageq(); for (i = 0; i < npages; i++) { uvm_pagewire(pgs[i]); if (pageq != NULL) TAILQ_INSERT_TAIL(pageq, pgs[i], pageq); } - uvm_unlock_pageq(); /* Unbusy the pages */ uvm_page_unbusy(pgs, npages); @@ -198,7 +196,6 @@ uvm_obj_unwire(struct uvm_object *uobj, off_t offset; rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK); - uvm_lock_pageq(); for (offset = start; offset < end; offset += PAGE_SIZE) { pg = uvm_pagelookup(uobj, offset); @@ -207,7 +204,6 @@ uvm_obj_unwire(struct uvm_object *uobj, uvm_pageunwire(pg); } - uvm_unlock_pageq(); rw_exit(uobj->vmobjlock); } #endif /* !SMALL_KERNEL */ @@ -238,9 +234,7 @@ uvm_obj_free(struct uvm_object *uobj) */ atomic_clearbits_int(&pg->pg_flags, PG_TABLED); pg->uobject = NULL; - 
uvm_lock_pageq(); uvm_pageclean(pg); - uvm_unlock_pageq(); TAILQ_INSERT_TAIL(&pgl, pg, pageq); } uvm_pglistfree(&pgl); Index: uvm/uvm_page.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_page.c,v diff -u -p -r1.181 uvm_page.c --- uvm/uvm_page.c 19 Feb 2025 11:10:54 -0000 1.181 +++ uvm/uvm_page.c 4 Mar 2025 13:02:35 -0000 @@ -863,9 +863,7 @@ uvm_pagerealloc_multi(struct uvm_object uvm_pagecopy(tpg, pg); KASSERT(tpg->wire_count == 1); tpg->wire_count = 0; - uvm_lock_pageq(); uvm_pagefree(tpg); - uvm_unlock_pageq(); uvm_pagealloc_pg(pg, obj, offset, NULL); } } @@ -947,7 +945,6 @@ uvm_pagerealloc(struct vm_page *pg, stru * uvm_pageclean: clean page * * => erase page's identity (i.e. remove from object) - * => caller must lock page queues if `pg' is managed * => assumes all valid mappings of pg are gone */ void @@ -955,10 +952,6 @@ uvm_pageclean(struct vm_page *pg) { u_int flags_to_clear = 0; - if ((pg->pg_flags & (PG_TABLED|PQ_ACTIVE|PQ_INACTIVE)) && - (pg->uobject == NULL || !UVM_OBJ_IS_PMAP(pg->uobject))) - MUTEX_ASSERT_LOCKED(&uvm.pageqlock); - #ifdef DEBUG if (pg->uobject == (void *)0xdeadbeef && pg->uanon == (void *)0xdeadbeef) { @@ -982,14 +975,18 @@ uvm_pageclean(struct vm_page *pg) /* * now remove the page from the queues */ - uvm_pagedequeue(pg); + if (pg->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) { + uvm_lock_pageq(); + uvm_pagedequeue(pg); + uvm_unlock_pageq(); + } /* * if the page was wired, unwire it now. */ if (pg->wire_count) { pg->wire_count = 0; - uvmexp.wired--; + atomic_dec_int(&uvmexp.wired); } if (pg->uanon) { pg->uanon->an_page = NULL; @@ -1231,11 +1228,12 @@ void uvm_pagewire(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, TRUE)); - MUTEX_ASSERT_LOCKED(&uvm.pageqlock); if (pg->wire_count == 0) { + uvm_lock_pageq(); uvm_pagedequeue(pg); - uvmexp.wired++; + uvm_unlock_pageq(); + atomic_inc_int(&uvmexp.wired); } pg->wire_count++; } @@ -1250,12 +1248,11 @@ void uvm_pageunwire(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, TRUE)); - MUTEX_ASSERT_LOCKED(&uvm.pageqlock); pg->wire_count--; if (pg->wire_count == 0) { uvm_pageactivate(pg); - uvmexp.wired--; + atomic_dec_int(&uvmexp.wired); } } @@ -1270,51 +1267,62 @@ void uvm_pagedeactivate(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, FALSE)); - MUTEX_ASSERT_LOCKED(&uvm.pageqlock); + + if (pg->wire_count > 0) { + KASSERT((pg->pg_flags & (PQ_INACTIVE|PQ_ACTIVE)) == 0); + return; + } + + if (pg->pg_flags & PQ_INACTIVE) + return; pmap_page_protect(pg, PROT_NONE); + uvm_lock_pageq(); + if (pg->pg_flags & PQ_INACTIVE) { + uvm_unlock_pageq(); + return; + } if (pg->pg_flags & PQ_ACTIVE) { TAILQ_REMOVE(&uvm.page_active, pg, pageq); atomic_clearbits_int(&pg->pg_flags, PQ_ACTIVE); uvmexp.active--; } - if ((pg->pg_flags & PQ_INACTIVE) == 0) { - KASSERT(pg->wire_count == 0); - TAILQ_INSERT_TAIL(&uvm.page_inactive, pg, pageq); - atomic_setbits_int(&pg->pg_flags, PQ_INACTIVE); - uvmexp.inactive++; - pmap_clear_reference(pg); - /* - * update the "clean" bit. this isn't 100% - * accurate, and doesn't have to be. we'll - * re-sync it after we zap all mappings when - * scanning the inactive list. - */ - if ((pg->pg_flags & PG_CLEAN) != 0 && - pmap_is_modified(pg)) - atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); - } + TAILQ_INSERT_TAIL(&uvm.page_inactive, pg, pageq); + atomic_setbits_int(&pg->pg_flags, PQ_INACTIVE); + uvmexp.inactive++; + uvm_unlock_pageq(); + + pmap_clear_reference(pg); + /* + * update the "clean" bit. 
this isn't 100% + * accurate, and doesn't have to be. we'll + * re-sync it after we zap all mappings when + * scanning the inactive list. + */ + if ((pg->pg_flags & PG_CLEAN) != 0 && pmap_is_modified(pg)) + atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); } /* * uvm_pageactivate: activate page - * - * => caller must lock page queues */ void uvm_pageactivate(struct vm_page *pg) { KASSERT(uvm_page_owner_locked_p(pg, FALSE)); - MUTEX_ASSERT_LOCKED(&uvm.pageqlock); - - uvm_pagedequeue(pg); - if (pg->wire_count == 0) { - TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq); - atomic_setbits_int(&pg->pg_flags, PQ_ACTIVE); - uvmexp.active++; + if (pg->wire_count > 0) { + KASSERT((pg->pg_flags & (PQ_INACTIVE|PQ_ACTIVE)) == 0); + return; } + + uvm_lock_pageq(); + uvm_pagedequeue(pg); + TAILQ_INSERT_TAIL(&uvm.page_active, pg, pageq); + atomic_setbits_int(&pg->pg_flags, PQ_ACTIVE); + uvmexp.active++; + uvm_unlock_pageq(); } /* @@ -1369,7 +1377,9 @@ uvm_page_owner_locked_p(struct vm_page * : rw_lock_held(pg->uobject->vmobjlock); } if (pg->uanon != NULL) { - return rw_write_held(pg->uanon->an_lock); + return exclusive + ? rw_write_held(pg->uanon->an_lock) + : rw_lock_held(pg->uanon->an_lock); } return 1; } Index: uvm/uvm_page.h =================================================================== RCS file: /cvs/src/sys/uvm/uvm_page.h,v diff -u -p -r1.72 uvm_page.h --- uvm/uvm_page.h 19 Feb 2025 11:07:47 -0000 1.72 +++ uvm/uvm_page.h 4 Mar 2025 13:02:35 -0000 @@ -147,13 +147,12 @@ struct vm_page { #define PG_RDONLY 0x00000080 /* page must be mapped read-only */ #define PG_ZERO 0x00000100 /* page is pre-zero'd */ #define PG_DEV 0x00000200 /* page is in device space, lay off */ - -#define PG_PAGER1 0x00001000 /* pager-specific flag */ #define PG_MASK 0x0000ffff #define PQ_FREE 0x00010000 /* page is on free list */ #define PQ_INACTIVE 0x00020000 /* page is in inactive list */ #define PQ_ACTIVE 0x00040000 /* page is in active list */ +#define PQ_ITER 0x00080000 /* page is an iterator marker */ #define PQ_ANON 0x00100000 /* page is part of an anon, rather than an uvm_object */ #define PQ_AOBJ 0x00200000 /* page is part of an anonymous Index: uvm/uvm_pager.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_pager.c,v diff -u -p -r1.93 uvm_pager.c --- uvm/uvm_pager.c 25 Nov 2024 12:51:00 -0000 1.93 +++ uvm/uvm_pager.c 4 Mar 2025 13:02:35 -0000 @@ -761,7 +761,6 @@ uvm_aio_aiodone_pages(struct vm_page **p anon_disposed = (pg->pg_flags & PG_RELEASED) != 0; KASSERT(!anon_disposed || pg->uobject != NULL || pg->uanon->an_ref == 0); - uvm_lock_pageq(); /* * if this was a successful write, @@ -777,11 +776,9 @@ uvm_aio_aiodone_pages(struct vm_page **p * unlock everything for this page now. */ if (pg->uobject == NULL && anon_disposed) { - uvm_unlock_pageq(); uvm_anon_release(pg->uanon); } else { uvm_page_unbusy(&pg, 1); - uvm_unlock_pageq(); rw_exit(slock); } } Index: uvm/uvm_pdaemon.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v diff -u -p -r1.134 uvm_pdaemon.c --- uvm/uvm_pdaemon.c 25 Jan 2025 08:55:52 -0000 1.134 +++ uvm/uvm_pdaemon.c 4 Mar 2025 13:02:35 -0000 @@ -453,6 +453,29 @@ uvmpd_match_constraint(struct vm_page *p return 0; } +struct vm_page * +uvmpd_iterator(struct pglist *pglst, struct vm_page *p, struct vm_page *iter) +{ + struct vm_page *nextpg = NULL; + + MUTEX_ASSERT_LOCKED(&uvm.pageqlock); + + /* p is null to signal final swap i/o. 
*/ + if (p == NULL) + return NULL; + + do { + nextpg = TAILQ_NEXT(iter, pageq); + } while (nextpg && (nextpg->pg_flags & PQ_ITER)); + + if (nextpg) { + TAILQ_REMOVE(pglst, iter, pageq); + TAILQ_INSERT_AFTER(pglst, nextpg, iter, pageq); + } + + return nextpg; +} + /* * uvmpd_scan_inactive: scan an inactive list for pages to clean or free. * @@ -467,7 +490,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc * { struct pglist *pglst = &uvm.page_inactive; int result, freed = 0; - struct vm_page *p, *nextpg; + struct vm_page *p, iter = { .pg_flags = PQ_ITER }; struct uvm_object *uobj; struct vm_page *pps[SWCLUSTPAGES], **ppsp; int npages; @@ -501,7 +524,12 @@ uvmpd_scan_inactive(struct uvm_pmalloc * break; } - for (; p != NULL || swslot != 0; p = nextpg) { + if (p == NULL) + return 0; + + /* Insert iterator. */ + TAILQ_INSERT_AFTER(pglst, p, &iter, pageq); + for (; p != NULL || swslot != 0; p = uvmpd_iterator(pglst, p, &iter)) { /* * note that p can be NULL iff we have traversed the whole * list and need to do one final swap-backed clustered pageout. @@ -522,7 +550,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc * /* set p to null to signal final swap i/o */ p = NULL; - nextpg = NULL; } } if (p) { /* if (we have a new page to consider) */ @@ -530,7 +557,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc * * we are below target and have a new page to consider. */ uvmexp.pdscans++; - nextpg = TAILQ_NEXT(p, pageq); /* * If we are not short on memory and only interested @@ -563,8 +589,10 @@ uvmpd_scan_inactive(struct uvm_pmalloc * * and skip to next page. */ if (pmap_is_referenced(p)) { + uvm_unlock_pageq(); uvm_pageactivate(p); rw_exit(slock); + uvm_lock_pageq(); uvmexp.pdreact++; continue; } @@ -596,6 +624,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc * /* zap all mappings with pmap_page_protect... */ pmap_page_protect(p, PROT_NONE); + /* dequeue first to prevent lock recursion */ + uvm_pagedequeue(p); uvm_pagefree(p); freed++; @@ -633,8 +663,10 @@ uvmpd_scan_inactive(struct uvm_pmalloc * */ if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) { dirtyreacts++; + uvm_unlock_pageq(); uvm_pageactivate(p); rw_exit(slock); + uvm_lock_pageq(); continue; } @@ -772,26 +804,11 @@ uvmpd_scan_inactive(struct uvm_pmalloc * * async I/O is in progress and the async I/O done routine * will clean up after us. in this case we move on to the * next page. - * - * there is a very remote chance that the pending async i/o can - * finish _before_ we get here. if that happens, our page "p" - * may no longer be on the inactive queue. so we verify this - * when determining the next page (starting over at the head if - * we've lost our inactive page). */ - if (result == VM_PAGER_PEND) { atomic_add_int(&uvmexp.paging, npages); uvm_lock_pageq(); uvmexp.pdpending++; - if (p) { - if (p->pg_flags & PQ_INACTIVE) - nextpg = TAILQ_NEXT(p, pageq); - else - nextpg = TAILQ_FIRST(pglst); - } else { - nextpg = NULL; - } continue; } @@ -851,13 +868,9 @@ uvmpd_scan_inactive(struct uvm_pmalloc * uvm_anfree(anon); /* kills anon */ pmap_page_protect(p, PROT_NONE); anon = NULL; - uvm_lock_pageq(); - nextpg = TAILQ_NEXT(p, pageq); /* free released page */ uvm_pagefree(p); } else { /* page was not released during I/O */ - uvm_lock_pageq(); - nextpg = TAILQ_NEXT(p, pageq); if (result != VM_PAGER_OK) { /* pageout was a failure... */ if (result != VM_PAGER_AGAIN) @@ -871,33 +884,15 @@ uvmpd_scan_inactive(struct uvm_pmalloc * PG_CLEAN); } } - - /* - * drop object lock (if there is an object left). 
do - * a safety check of nextpg to make sure it is on the - * inactive queue (it should be since PG_BUSY pages on - * the inactive queue can't be re-queued [note: not - * true for active queue]). - */ rw_exit(slock); - - if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) { - nextpg = TAILQ_FIRST(pglst); /* reload! */ - } - } else { - /* - * if p is null in this loop, make sure it stays null - * in the next loop. - */ - nextpg = NULL; - - /* - * lock page queues here just so they're always locked - * at the end of the loop. - */ - uvm_lock_pageq(); } + /* + * lock page queues here just so they're always locked + * at the end of the loop. + */ + uvm_lock_pageq(); } + TAILQ_REMOVE(pglst, &iter, pageq); return freed; } @@ -1019,8 +1014,9 @@ uvmpd_scan_active(struct uvm_pmalloc *pm * inactive pages. */ if (inactive_shortage > 0) { - /* no need to check wire_count as pg is "active" */ + uvm_unlock_pageq(); uvm_pagedeactivate(p); + uvm_lock_pageq(); uvmexp.pddeact++; inactive_shortage--; } @@ -1055,7 +1051,6 @@ uvmpd_drop(struct pglist *pglst) struct uvm_object * uobj = p->uobject; rw_enter(uobj->vmobjlock, RW_WRITE); - uvm_lock_pageq(); /* * we now have the page queues locked. * the page is not busy. if the page is clean we @@ -1071,7 +1066,6 @@ uvmpd_drop(struct pglist *pglst) pmap_page_protect(p, PROT_NONE); uvm_pagefree(p); } - uvm_unlock_pageq(); rw_exit(uobj->vmobjlock); } } Index: uvm/uvm_swap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_swap.c,v diff -u -p -r1.173 uvm_swap.c --- uvm/uvm_swap.c 7 Nov 2024 09:04:55 -0000 1.173 +++ uvm/uvm_swap.c 4 Mar 2025 13:02:35 -0000 @@ -395,10 +395,8 @@ uvm_swap_freepages(struct vm_page **pps, return; } - uvm_lock_pageq(); for (i = 0; i < npages; i++) uvm_pagefree(pps[i]); - uvm_unlock_pageq(); } Index: uvm/uvm_vnode.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v diff -u -p -r1.138 uvm_vnode.c --- uvm/uvm_vnode.c 27 Dec 2024 12:04:40 -0000 1.138 +++ uvm/uvm_vnode.c 4 Mar 2025 13:02:35 -0000 @@ -602,13 +602,11 @@ uvn_flush(struct uvm_object *uobj, voff_ struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; struct vm_page *pp, *ptmp; struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; - struct pglist dead; int npages, result, lcv; boolean_t retval, need_iosync, needs_clean; voff_t curoff; KASSERT(rw_write_held(uobj->vmobjlock)); - TAILQ_INIT(&dead); /* get init vals and determine how we are going to traverse object */ need_iosync = FALSE; @@ -682,9 +680,9 @@ uvn_flush(struct uvm_object *uobj, voff_ /* if we don't need a clean, deactivate/free pages then cont. 
*/ if (!needs_clean) { if (flags & PGO_DEACTIVATE) { - if (pp->wire_count == 0) { - uvm_pagedeactivate(pp); - } + uvm_unlock_pageq(); + uvm_pagedeactivate(pp); + uvm_lock_pageq(); } else if (flags & PGO_FREE) { if (pp->pg_flags & PG_BUSY) { uvm_unlock_pageq(); @@ -696,9 +694,9 @@ uvn_flush(struct uvm_object *uobj, voff_ continue; } else { pmap_page_protect(pp, PROT_NONE); - /* removed page from object */ - uvm_pageclean(pp); - TAILQ_INSERT_HEAD(&dead, pp, pageq); + /* dequeue to prevent lock recursion */ + uvm_pagedequeue(pp); + uvm_pagefree(pp); } } continue; @@ -807,9 +805,9 @@ ReTry: /* dispose of page */ if (flags & PGO_DEACTIVATE) { - if (ptmp->wire_count == 0) { - uvm_pagedeactivate(ptmp); - } + uvm_unlock_pageq(); + uvm_pagedeactivate(ptmp); + uvm_lock_pageq(); } else if (flags & PGO_FREE && result != VM_PAGER_PEND) { if (result != VM_PAGER_OK) { @@ -830,8 +828,9 @@ ReTry: retval = FALSE; } pmap_page_protect(ptmp, PROT_NONE); - uvm_pageclean(ptmp); - TAILQ_INSERT_TAIL(&dead, ptmp, pageq); + /* dequeue first to prevent lock recursion */ + uvm_pagedequeue(ptmp); + uvm_pagefree(ptmp); } } /* end of "lcv" for loop */ @@ -852,8 +851,6 @@ ReTry: wakeup(&uvn->u_flags); uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED); } - - uvm_pglistfree(&dead); return retval; } Index: uvm/uvmexp.h =================================================================== RCS file: /cvs/src/sys/uvm/uvmexp.h,v diff -u -p -r1.17 uvmexp.h --- uvm/uvmexp.h 25 Feb 2025 11:29:17 -0000 1.17 +++ uvm/uvmexp.h 4 Mar 2025 13:02:35 -0000 @@ -61,7 +61,7 @@ struct uvmexp { int active; /* [L] # of active pages */ int inactive; /* [L] # of pages that we free'd but may want back */ int paging; /* [a] # of pages in the process of being paged out */ - int wired; /* number of wired pages */ + int wired; /* [a] number of wired pages */ int zeropages; /* [F] number of zero'd pages */ int reserve_pagedaemon; /* [I] # of pages reserved for pagedaemon */
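
P.S. For anyone reviewing the locking changes, below is a minimal sketch of
the "try to upgrade, otherwise restart with a writer lock" pattern that
uvm_fault_upper_upgrade() and uvm_fault_lower_upgrade() implement in the
diff above.  It is an illustration only, not part of the diff: struct
fault_ctx and fault_upgrade() are made-up names, and the only interfaces
assumed are the rwlock(9) calls the diff already uses (rw_enter() with
RW_UPGRADE|RW_NOSLEEP).

	/*
	 * Sketch only -- not part of the diff.  fault_ctx and
	 * fault_upgrade() are hypothetical names used to show the
	 * upgrade-or-restart idea in isolation.
	 */
	#include <sys/param.h>
	#include <sys/rwlock.h>
	#include <sys/errno.h>

	struct fault_ctx {
		struct rwlock	*fc_lock;	/* amap->am_lock or uobj->vmobjlock */
		int		 fc_lock_type;	/* RW_READ or RW_WRITE */
	};

	static int
	fault_upgrade(struct fault_ctx *fc)
	{
		/* Fast path: the lock is already held exclusively. */
		if (fc->fc_lock_type == RW_WRITE)
			return 0;

		/*
		 * Record that a writer lock is wanted so that, if the
		 * non-sleeping upgrade fails, the restarted fault takes
		 * an exclusive lock from the start.
		 */
		fc->fc_lock_type = RW_WRITE;
		if (rw_enter(fc->fc_lock, RW_UPGRADE | RW_NOSLEEP))
			return ERESTART;	/* caller unlocks and refaults */

		return 0;
	}

In the diff itself a failed upgrade additionally bumps the flt_noup counter
and the caller drops all fault locks with uvmfault_unlockall() before
returning ERESTART, so the fault is retried from scratch with
flt->upper_lock_type (or lower_lock_type) already set to RW_WRITE.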