From: Martin Pieuchot
Subject: Re: Parallel upper fault
To: tech@openbsd.org
Date: Sun, 30 Mar 2025 15:39:46 +0200

On 12/03/25(Wed) 14:33, Martin Pieuchot wrote:
> Diff below includes all the necessary bits, ported mostly from NetBSD,
> to run the upper part of the fault handler in parallel.  It has been
> tested by many as part of a larger diff and I'd like to commit it
> without enabling it by default.
>
> Running the fault handler in parallel does work and it reduces the
> contention and sleeping time related to access of UVM data structures.
>
> That said, running the handler in parallel leads to a contention shift
> onto the KERNEL_LOCK at the boundary of the VFS.  Obviously this
> doesn't help performance and even degrades it if the number of CPUs is
> too high.
>
> If you want to properly test this diff you'll need the "anon pool
> cache" diff as well, otherwise all anon allocations will contend on
> the pool's mutex.
>
> Comments?  Oks?

Anyone?
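
To make the diff easier to review, here is the pattern it introduces,
boiled down to a sketch.  This is an illustration only, not part of
the diff, and fault_needs_write() is a made-up placeholder for the
decisions taken in uvm_fault_upper() and uvm_fault_lower():

	int
	fault_upper_once(struct uvm_faultctx *flt, struct vm_amap *amap)
	{
		/* read faults start shared, write faults exclusive */
		amap_lock(amap, flt->upper_lock_type);

		if (flt->upper_lock_type == RW_READ &&
		    fault_needs_write(flt)) {
			/* remember the decision across a restart */
			flt->upper_lock_type = RW_WRITE;
			if (rw_enter(amap->am_lock,
			    RW_UPGRADE|RW_NOSLEEP)) {
				/* lost the race: unwind and refault */
				amap_unlock(amap);
				return ERESTART;
			}
		}

		/* ... modify the amap/anons under the write lock ... */
		amap_unlock(amap);
		return 0;
	}

Since the fault context is preserved across ERESTART inside
uvm_fault(), the retry takes the write lock directly and cannot fail
the same way twice.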
> Index: uvm/uvm_anon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> diff -u -p -r1.62 uvm_anon.c
> --- uvm/uvm_anon.c	10 Mar 2025 14:13:58 -0000	1.62
> +++ uvm/uvm_anon.c	12 Mar 2025 13:14:33 -0000
> @@ -177,6 +177,8 @@ uvm_anon_pagein(struct vm_amap *amap, st
>  		 * anon was freed.
>  		 */
>  		return FALSE;
> +	case ENOLCK:
> +		/* Should not be possible. */
>  	default:
>  #ifdef DIAGNOSTIC
>  		panic("anon_pagein: uvmfault_anonget -> %d", rv);
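
A note on the new error: uvmfault_anonget() now returns ENOLCK exactly
when it was entered with the anon lock held shared but has to allocate
a page, which requires the write lock.  uvm_anon_pagein() is only
reached with the amap write-locked, hence the "Should not be possible"
comment above.  The caller-side protocol, condensed from the
uvm_fault_upper() hunk below (illustration only):

	retry:
		error = uvmfault_anonget(ufi, amap, anon);
		if (error == ENOLCK) {
			/* upgrade shared -> exclusive, or refault */
			error = uvm_fault_upper_upgrade(flt, amap);
			if (error != 0) {
				uvmfault_unlockall(ufi, amap, NULL);
				return error;	/* ERESTART */
			}
			goto retry;
		}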
> Index: uvm/uvm_fault.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> diff -u -p -r1.165 uvm_fault.c
> --- uvm/uvm_fault.c	10 Mar 2025 14:13:58 -0000	1.165
> +++ uvm/uvm_fault.c	12 Mar 2025 13:19:00 -0000
> @@ -277,6 +277,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
>      struct vm_anon *anon)
>  {
>  	struct vm_page *pg;
> +	int lock_type;
>  	int error;
>
>  	KASSERT(rw_lock_held(anon->an_lock));
> @@ -305,6 +306,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
>  		/*
>  		 * Is page resident?  Make sure it is not busy/released.
>  		 */
> +		lock_type = rw_status(anon->an_lock);
>  		if (pg) {
>  			KASSERT(pg->pg_flags & PQ_ANON);
>  			KASSERT(pg->uanon == anon);
> @@ -326,8 +328,13 @@ uvmfault_anonget(struct uvm_faultinfo *u
>  				uvm_pagewait(pg, anon->an_lock, "anonget");
>  		} else {
>  			/*
> -			 * No page, therefore allocate one.
> +			 * No page, therefore allocate one.  A write lock is
> +			 * required for this.  If the caller didn't supply
> +			 * one, fail now and have them retry.
>  			 */
> +			if (lock_type == RW_READ) {
> +				return ENOLCK;
> +			}
>  			pg = uvm_pagealloc(NULL, 0, anon, 0);
>  			if (pg == NULL) {
>  				/* Out of memory.  Wait a little. */
> @@ -498,6 +505,7 @@ uvmfault_promote(struct uvm_faultinfo *u
>  	if (uobjpage != PGO_DONTCARE)
>  		uobj = uobjpage->uobject;
>
> +	KASSERT(rw_write_held(amap->am_lock));
>  	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
>
>  	anon = uvm_analloc();
> @@ -609,6 +617,7 @@ struct uvm_faultctx {
>  	boolean_t wired;
>  	paddr_t pa_flags;
>  	boolean_t promote;
> +	int upper_lock_type;
>  	int lower_lock_type;
>  };
>
> @@ -653,6 +662,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
>  	flt.access_type = access_type;
>  	flt.narrow = FALSE;		/* assume normal fault for now */
>  	flt.wired = FALSE;		/* assume non-wired fault for now */
> +	flt.upper_lock_type = RW_READ;
>  #if notyet
>  	flt.lower_lock_type = RW_READ;	/* shared lock for now */
>  #else
> @@ -840,7 +850,13 @@ uvm_fault_check(struct uvm_faultinfo *uf
>  	 * if we've got an amap then lock it and extract current anons.
>  	 */
>  	if (amap) {
> -		amap_lock(amap, RW_WRITE);
> +		if ((flt->access_type & PROT_WRITE) != 0) {
> +			/*
> +			 * assume we're about to COW.
> +			 */
> +			flt->upper_lock_type = RW_WRITE;
> +		}
> +		amap_lock(amap, flt->upper_lock_type);
>  		amap_lookups(&ufi->entry->aref,
>  		    flt->startva - ufi->entry->start, *ranons, flt->npages);
>  	} else {
> @@ -892,6 +908,36 @@ uvm_fault_check(struct uvm_faultinfo *uf
>  }
>
>  /*
> + * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
> + */
> +static inline int
> +uvm_fault_upper_upgrade(struct uvm_faultctx *flt, struct vm_amap *amap)
> +{
> +	KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
> +
> +	/*
> +	 * fast path.
> +	 */
> +	if (flt->upper_lock_type == RW_WRITE) {
> +		return 0;
> +	}
> +
> +	/*
> +	 * otherwise try for the upgrade.  if we don't get it, unlock
> +	 * everything, restart the fault and next time around get a writer
> +	 * lock.
> +	 */
> +	flt->upper_lock_type = RW_WRITE;
> +	if (rw_enter(amap->am_lock, RW_UPGRADE|RW_NOSLEEP)) {
> +		counters_inc(uvmexp_counters, flt_noup);
> +		return ERESTART;
> +	}
> +	counters_inc(uvmexp_counters, flt_up);
> +	KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
> +	return 0;
> +}
> +
> +/*
>   * uvm_fault_upper_lookup: look up existing h/w mapping and amap.
>   *
>   * iterate range of interest:
> @@ -914,9 +960,8 @@ uvm_fault_upper_lookup(struct uvm_faulti
>  	paddr_t pa;
>  	int lcv, entered = 0;
>
> -	/* locked: maps(read), amap(if there) */
>  	KASSERT(amap == NULL ||
> -	    rw_write_held(amap->am_lock));
> +	    rw_status(amap->am_lock) == flt->upper_lock_type);
>
>  	/*
>  	 * map in the backpages and frontpages we found in the amap in hopes
> @@ -998,8 +1043,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
>  	struct vm_page *pg = NULL;
>  	int error, ret;
>
> -	/* locked: maps(read), amap, anon */
> -	KASSERT(rw_write_held(amap->am_lock));
> +	KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
>  	KASSERT(anon->an_lock == amap->am_lock);
>
>  	/*
> @@ -1012,6 +1056,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
>  	 * if it succeeds, locks are still valid and locked.
>  	 * also, if it is OK, then the anon's page is on the queues.
>  	 */
> +retry:
>  	error = uvmfault_anonget(ufi, amap, anon);
>  	switch (error) {
>  	case 0:
>  		break;
>
> @@ -1020,11 +1065,21 @@ uvm_fault_upper(struct uvm_faultinfo *uf
>  	case ERESTART:
>  		return ERESTART;
>
> +	case ENOLCK:
> +		/* it needs a write lock: retry */
> +		error = uvm_fault_upper_upgrade(flt, amap);
> +		if (error != 0) {
> +			uvmfault_unlockall(ufi, amap, NULL);
> +			return error;
> +		}
> +		KASSERT(rw_write_held(amap->am_lock));
> +		goto retry;
> +
>  	default:
>  		return error;
>  	}
>
> -	KASSERT(rw_write_held(amap->am_lock));
> +	KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
>  	KASSERT(anon->an_lock == amap->am_lock);
>
>  	/*
> @@ -1039,9 +1094,13 @@ uvm_fault_upper(struct uvm_faultinfo *uf
>  	 *
>  	 * if we are out of anon VM we wait for RAM to become available.
>  	 */
> -
>  	if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
>  		/* promoting requires a write lock. */
> +		error = uvm_fault_upper_upgrade(flt, amap);
> +		if (error != 0) {
> +			uvmfault_unlockall(ufi, amap, NULL);
> +			return error;
> +		}
>  		KASSERT(rw_write_held(amap->am_lock));
>
>  		counters_inc(uvmexp_counters, flt_acow);
> @@ -1064,6 +1123,14 @@ uvm_fault_upper(struct uvm_faultinfo *uf
>  		KASSERT(oanon->an_ref > 1);
>  		oanon->an_ref--;
>
> +		/*
> +		 * note: oanon is still locked, as is the new anon.  we
> +		 * need to check for this later when we unlock oanon; if
> +		 * oanon != anon, we'll have to unlock anon, too.
> +		 */
> +		KASSERT(anon->an_lock == amap->am_lock);
> +		KASSERT(oanon->an_lock == amap->am_lock);
> +
>  #if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
>  		/*
>  		 * If there are multiple threads, either uvm or the
> @@ -1078,12 +1145,6 @@ uvm_fault_upper(struct uvm_faultinfo *uf
>  			flt->access_type &= ~PROT_WRITE;
>  		}
>  #endif
> -
> -		/*
> -		 * note: anon is _not_ locked, but we have the sole references
> -		 * to in from amap.
> -		 * thus, no one can get at it until we are done with it.
> -		 */
>  	} else {
>  		counters_inc(uvmexp_counters, flt_anon);
>  		oanon = anon;
> @@ -1246,10 +1307,8 @@ uvm_fault_lower_lookup(
>   * uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
>   */
>  static inline int
> -uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
> -    struct vm_amap *amap, struct uvm_object *uobj)
> +uvm_fault_lower_upgrade(struct uvm_faultctx *flt, struct uvm_object *uobj)
>  {
> -	KASSERT(uobj != NULL);
>  	KASSERT(flt->lower_lock_type == rw_status(uobj->vmobjlock));
>
>  	/*
> @@ -1265,7 +1324,6 @@ uvm_fault_lower_upgrade(struct uvm_fault
>  	 */
>  	flt->lower_lock_type = RW_WRITE;
>  	if (rw_enter(uobj->vmobjlock, RW_UPGRADE|RW_NOSLEEP)) {
> -		uvmfault_unlockall(ufi, amap, uobj);
>  		counters_inc(uvmexp_counters, flt_noup);
>  		return ERESTART;
>  	}
> @@ -1319,11 +1377,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf
>  	 * made it BUSY.
>  	 */
>
> -	/*
> -	 * locked:
> -	 */
>  	KASSERT(amap == NULL ||
> -	    rw_write_held(amap->am_lock));
> +	    rw_status(amap->am_lock) == flt->upper_lock_type);
>  	KASSERT(uobj == NULL ||
>  	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
>
> @@ -1392,6 +1447,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf
>  	KASSERT(amap != NULL);
>
>  	/* promoting requires a write lock. */
> +	error = uvm_fault_upper_upgrade(flt, amap);
> +	if (error != 0) {
> +		uvmfault_unlockall(ufi, amap, uobj);
> +		return error;
> +	}
>  	KASSERT(rw_write_held(amap->am_lock));
>  	KASSERT(uobj == NULL ||
>  	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
> @@ -1468,7 +1528,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
>  	 * Note: pg is either the uobjpage or the new page in the new anon.
>  	 */
>  	KASSERT(amap == NULL ||
> -	    rw_write_held(amap->am_lock));
> +	    rw_status(amap->am_lock) == flt->upper_lock_type);
>  	KASSERT(uobj == NULL ||
>  	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
>  	KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
> @@ -1572,9 +1632,11 @@ uvm_fault_lower_io(
>  	advice = ufi->entry->advice;
>
>  	/* Upgrade to a write lock if needed. */
> -	error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj);
> -	if (error != 0)
> +	error = uvm_fault_lower_upgrade(flt, uobj);
> +	if (error != 0) {
> +		uvmfault_unlockall(ufi, amap, uobj);
>  		return error;
> +	}
>  	uvmfault_unlockall(ufi, amap, NULL);
>
>  	/* update rusage counters */
> @@ -1610,7 +1672,7 @@ uvm_fault_lower_io(
>  	/* re-verify the state of the world.  */
>  	locked = uvmfault_relock(ufi);
>  	if (locked && amap != NULL)
> -		amap_lock(amap, RW_WRITE);
> +		amap_lock(amap, flt->upper_lock_type);
>
>  	/* might be changed */
>  	if (pg != PGO_DONTCARE) {
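
Also note that uvm_fault_lower_upgrade() lost its unlock-on-failure
side effect: both upgrade helpers now only try the upgrade and leave
the unwinding to their callers, so the locking protocol stays explicit
at each call site.  Every caller now follows the same shape (sketch,
illustration only; uobj is NULL where there is no backing object):

	error = uvm_fault_upper_upgrade(flt, amap);
	if (error != 0) {
		/* drop the maps, amap and object before refaulting */
		uvmfault_unlockall(ufi, amap, uobj);
		return error;
	}
	KASSERT(rw_write_held(amap->am_lock));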
> Index: uvm/uvm_page.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_page.c,v
> diff -u -p -r1.182 uvm_page.c
> --- uvm/uvm_page.c	10 Mar 2025 14:13:58 -0000	1.182
> +++ uvm/uvm_page.c	12 Mar 2025 13:14:33 -0000
> @@ -1366,7 +1366,9 @@ uvm_page_owner_locked_p(struct vm_page *
>  		    : rw_lock_held(pg->uobject->vmobjlock);
>  	}
>  	if (pg->uanon != NULL) {
> -		return rw_write_held(pg->uanon->an_lock);
> +		return exclusive
> +		    ? rw_write_held(pg->uanon->an_lock)
> +		    : rw_lock_held(pg->uanon->an_lock);
>  	}
>  	return 1;
>  }
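
Finally, the uvm_page.c chunk is needed because a page's owning anon
may now be locked shared while the page is being worked on: assertions
that only read page state have to accept a reader as the owner lock,
while code about to modify the page keeps insisting on the writer.
Usage looks like this (illustration only):

	/* inspecting the page: any owner lock will do */
	KASSERT(uvm_page_owner_locked_p(pg, FALSE));

	/* changing identity or flags: exclusive owner lock */
	KASSERT(uvm_page_owner_locked_p(pg, TRUE));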