From: Mark Kettenis
Subject: Re: Parallel upper fault
To: Martin Pieuchot
Cc: tech@openbsd.org
Date: Sun, 30 Mar 2025 16:02:08 +0200

> Date: Sun, 30 Mar 2025 15:39:46 +0200
> From: Martin Pieuchot
>
> On 12/03/25(Wed) 14:33, Martin Pieuchot wrote:
> > Diff below includes all the necessary bits, ported mostly from NetBSD,
> > to run the upper part of the fault handler in parallel.  It has been
> > tested by many as part of a larger diff and I'd like to commit it
> > without enabling it by default.
> >
> > Running the fault handler in parallel does work and it reduces the
> > contention and sleeping time related to access of UVM data structures.
> >
> > That said, running the handler in parallel leads to a contention shift
> > onto the KERNEL_LOCK at the boundary of the VFS.  Obviously this doesn't
> > help performance, and can even degrade it if the number of CPUs is too
> > high.
> >
> > If you want to properly test this diff you'll need the "anon pool cache"
> > diff as well, otherwise all anon allocations will contend on the pool's
> > mutex.
> >
> > Comments?  Oks?
>
> Anyone?

I think this should wait until after the release.
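
To make the retry protocol concrete, here is a minimal sketch of the
upgrade-or-restart pattern the diff uses for the upper layer.  This is
an illustration only, assuming rw_enter(9) semantics; fault_once() and
need_write() are hypothetical names, not part of the diff:

	#include <sys/rwlock.h>
	#include <sys/errno.h>

	int	need_write(void);	/* hypothetical: fault must modify state */

	int
	fault_once(struct rwlock *lock, int *lock_type)
	{
		/* The first pass takes a read lock; a retry takes a write lock. */
		rw_enter(lock, *lock_type);

		if (*lock_type == RW_READ && need_write()) {
			/*
			 * Record RW_WRITE before attempting the atomic
			 * upgrade, so that a restarted fault asks for a
			 * write lock from the start instead of failing
			 * the same way again.
			 */
			*lock_type = RW_WRITE;
			if (rw_enter(lock, RW_UPGRADE|RW_NOSLEEP)) {
				/* Upgrade contended; we still hold the read lock. */
				rw_exit(lock);
				return ERESTART;
			}
		}

		/* ... resolve the fault under the now-sufficient lock ... */
		rw_exit(lock);
		return 0;
	}

uvm_fault() already loops on ERESTART, so a failed upgrade costs one
extra walk of the maps but never sleeps waiting for the rwlock.
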
> > Index: uvm/uvm_anon.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
> > diff -u -p -r1.62 uvm_anon.c
> > --- uvm/uvm_anon.c	10 Mar 2025 14:13:58 -0000	1.62
> > +++ uvm/uvm_anon.c	12 Mar 2025 13:14:33 -0000
> > @@ -177,6 +177,8 @@ uvm_anon_pagein(struct vm_amap *amap, st
> >  		 * anon was freed.
> >  		 */
> >  		return FALSE;
> > +	case ENOLCK:
> > +		/* Should not be possible. */
> >  	default:
> >  #ifdef DIAGNOSTIC
> >  		panic("anon_pagein: uvmfault_anonget -> %d", rv);
> > Index: uvm/uvm_fault.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
> > diff -u -p -r1.165 uvm_fault.c
> > --- uvm/uvm_fault.c	10 Mar 2025 14:13:58 -0000	1.165
> > +++ uvm/uvm_fault.c	12 Mar 2025 13:19:00 -0000
> > @@ -277,6 +277,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> >      struct vm_anon *anon)
> >  {
> >  	struct vm_page *pg;
> > +	int lock_type;
> >  	int error;
> >  
> >  	KASSERT(rw_lock_held(anon->an_lock));
> > @@ -305,6 +306,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
> >  		/*
> >  		 * Is page resident?  Make sure it is not busy/released.
> >  		 */
> > +		lock_type = rw_status(anon->an_lock);
> >  		if (pg) {
> >  			KASSERT(pg->pg_flags & PQ_ANON);
> >  			KASSERT(pg->uanon == anon);
> > @@ -326,8 +328,13 @@ uvmfault_anonget(struct uvm_faultinfo *u
> >  			uvm_pagewait(pg, anon->an_lock, "anonget");
> >  		} else {
> >  			/*
> > -			 * No page, therefore allocate one.
> > +			 * No page, therefore allocate one.  A write lock is
> > +			 * required for this.  If the caller didn't supply
> > +			 * one, fail now and have them retry.
> >  			 */
> > +			if (lock_type == RW_READ) {
> > +				return ENOLCK;
> > +			}
> >  			pg = uvm_pagealloc(NULL, 0, anon, 0);
> >  			if (pg == NULL) {
> >  				/* Out of memory.  Wait a little. */
> > @@ -498,6 +505,7 @@ uvmfault_promote(struct uvm_faultinfo *u
> >  	if (uobjpage != PGO_DONTCARE)
> >  		uobj = uobjpage->uobject;
> >  
> > +	KASSERT(rw_write_held(amap->am_lock));
> >  	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
> >  
> >  	anon = uvm_analloc();
> > @@ -609,6 +617,7 @@ struct uvm_faultctx {
> >  	boolean_t wired;
> >  	paddr_t pa_flags;
> >  	boolean_t promote;
> > +	int upper_lock_type;
> >  	int lower_lock_type;
> >  };
> >  
> > @@ -653,6 +662,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
> >  	flt.access_type = access_type;
> >  	flt.narrow = FALSE;		/* assume normal fault for now */
> >  	flt.wired = FALSE;		/* assume non-wired fault for now */
> > +	flt.upper_lock_type = RW_READ;
> >  #if notyet
> >  	flt.lower_lock_type = RW_READ;	/* shared lock for now */
> >  #else
> > @@ -840,7 +850,13 @@ uvm_fault_check(struct uvm_faultinfo *uf
> >  	 * if we've got an amap then lock it and extract current anons.
> >  	 */
> >  	if (amap) {
> > -		amap_lock(amap, RW_WRITE);
> > +		if ((flt->access_type & PROT_WRITE) != 0) {
> > +			/*
> > +			 * assume we're about to COW.
> > +			 */
> > +			flt->upper_lock_type = RW_WRITE;
> > +		}
> > +		amap_lock(amap, flt->upper_lock_type);
> >  		amap_lookups(&ufi->entry->aref,
> >  		    flt->startva - ufi->entry->start, *ranons, flt->npages);
> >  	} else {
> > @@ -892,6 +908,36 @@ uvm_fault_check(struct uvm_faultinfo *uf
> >  }
> >  
> >  /*
> > + * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
> > + */
> > +static inline int
> > +uvm_fault_upper_upgrade(struct uvm_faultctx *flt, struct vm_amap *amap)
> > +{
> > +	KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
> > +
> > +	/*
> > +	 * fast path.
> > +	 */
> > +	if (flt->upper_lock_type == RW_WRITE) {
> > +		return 0;
> > +	}
> > +
> > +	/*
> > +	 * otherwise try for the upgrade.  if we don't get it, unlock
> > +	 * everything, restart the fault and next time around get a writer
> > +	 * lock.
> > +	 */
> > +	flt->upper_lock_type = RW_WRITE;
> > +	if (rw_enter(amap->am_lock, RW_UPGRADE|RW_NOSLEEP)) {
> > +		counters_inc(uvmexp_counters, flt_noup);
> > +		return ERESTART;
> > +	}
> > +	counters_inc(uvmexp_counters, flt_up);
> > +	KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
> > +	return 0;
> > +}
> > +
> > +/*
> >   * uvm_fault_upper_lookup: look up existing h/w mapping and amap.
> >   *
> >   * iterate range of interest:
> > @@ -914,9 +960,8 @@ uvm_fault_upper_lookup(struct uvm_faulti
> >  	paddr_t pa;
> >  	int lcv, entered = 0;
> >  
> > -	/* locked: maps(read), amap(if there) */
> >  	KASSERT(amap == NULL ||
> > -	    rw_write_held(amap->am_lock));
> > +	    rw_status(amap->am_lock) == flt->upper_lock_type);
> >  
> >  	/*
> >  	 * map in the backpages and frontpages we found in the amap in hopes
> > @@ -998,8 +1043,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> >  	struct vm_page *pg = NULL;
> >  	int error, ret;
> >  
> > -	/* locked: maps(read), amap, anon */
> > -	KASSERT(rw_write_held(amap->am_lock));
> > +	KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
> >  	KASSERT(anon->an_lock == amap->am_lock);
> >  
> >  	/*
> > @@ -1012,6 +1056,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> >  	 * if it succeeds, locks are still valid and locked.
> >  	 * also, if it is OK, then the anon's page is on the queues.
> >  	 */
> > +retry:
> >  	error = uvmfault_anonget(ufi, amap, anon);
> >  	switch (error) {
> >  	case 0:
> > @@ -1020,11 +1065,21 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> >  	case ERESTART:
> >  		return ERESTART;
> >  
> > +	case ENOLCK:
> > +		/* it needs a write lock: retry */
> > +		error = uvm_fault_upper_upgrade(flt, amap);
> > +		if (error != 0) {
> > +			uvmfault_unlockall(ufi, amap, NULL);
> > +			return error;
> > +		}
> > +		KASSERT(rw_write_held(amap->am_lock));
> > +		goto retry;
> > +
> >  	default:
> >  		return error;
> >  	}
> >  
> > -	KASSERT(rw_write_held(amap->am_lock));
> > +	KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
> >  	KASSERT(anon->an_lock == amap->am_lock);
> >  
> >  	/*
> > @@ -1039,9 +1094,13 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> >  	 *
> >  	 * if we are out of anon VM we wait for RAM to become available.
> >  	 */
> > -
> >  	if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
> >  		/* promoting requires a write lock. */
> > +		error = uvm_fault_upper_upgrade(flt, amap);
> > +		if (error != 0) {
> > +			uvmfault_unlockall(ufi, amap, NULL);
> > +			return error;
> > +		}
> >  		KASSERT(rw_write_held(amap->am_lock));
> >  
> >  		counters_inc(uvmexp_counters, flt_acow);
> > @@ -1064,6 +1123,14 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> >  		KASSERT(oanon->an_ref > 1);
> >  		oanon->an_ref--;
> >  
> > +		/*
> > +		 * note: oanon is still locked, as is the new anon.  we
> > +		 * need to check for this later when we unlock oanon; if
> > +		 * oanon != anon, we'll have to unlock anon, too.
> > +		 */
> > +		KASSERT(anon->an_lock == amap->am_lock);
> > +		KASSERT(oanon->an_lock == amap->am_lock);
> > +
> >  #if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
> >  		/*
> >  		 * If there are multiple threads, either uvm or the
> > @@ -1078,12 +1145,6 @@ uvm_fault_upper(struct uvm_faultinfo *uf
> >  			flt->access_type &= ~PROT_WRITE;
> >  		}
> >  #endif
> > -
> > -		/*
> > -		 * note: anon is _not_ locked, but we have the sole references
> > -		 * to in from amap.
> > -		 * thus, no one can get at it until we are done with it.
> > -		 */
> >  	} else {
> >  		counters_inc(uvmexp_counters, flt_anon);
> >  		oanon = anon;
> > @@ -1246,10 +1307,8 @@ uvm_fault_lower_lookup(
> >   * uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
> >   */
> >  static inline int
> > -uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
> > -    struct vm_amap *amap, struct uvm_object *uobj)
> > +uvm_fault_lower_upgrade(struct uvm_faultctx *flt, struct uvm_object *uobj)
> >  {
> > -	KASSERT(uobj != NULL);
> >  	KASSERT(flt->lower_lock_type == rw_status(uobj->vmobjlock));
> >  
> >  	/*
> > @@ -1265,7 +1324,6 @@ uvm_fault_lower_upgrade(struct uvm_fault
> >  	 */
> >  	flt->lower_lock_type = RW_WRITE;
> >  	if (rw_enter(uobj->vmobjlock, RW_UPGRADE|RW_NOSLEEP)) {
> > -		uvmfault_unlockall(ufi, amap, uobj);
> >  		counters_inc(uvmexp_counters, flt_noup);
> >  		return ERESTART;
> >  	}
> > @@ -1319,11 +1377,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> >  	 * made it BUSY.
> >  	 */
> >  
> > -	/*
> > -	 * locked:
> > -	 */
> >  	KASSERT(amap == NULL ||
> > -	    rw_write_held(amap->am_lock));
> > +	    rw_status(amap->am_lock) == flt->upper_lock_type);
> >  	KASSERT(uobj == NULL ||
> >  	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
> >  
> > @@ -1392,6 +1447,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> >  	KASSERT(amap != NULL);
> >  
> >  	/* promoting requires a write lock. */
> > +	error = uvm_fault_upper_upgrade(flt, amap);
> > +	if (error != 0) {
> > +		uvmfault_unlockall(ufi, amap, uobj);
> > +		return error;
> > +	}
> >  	KASSERT(rw_write_held(amap->am_lock));
> >  	KASSERT(uobj == NULL ||
> >  	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
> > @@ -1468,7 +1528,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
> >  	 * Note: pg is either the uobjpage or the new page in the new anon.
> >  	 */
> >  	KASSERT(amap == NULL ||
> > -	    rw_write_held(amap->am_lock));
> > +	    rw_status(amap->am_lock) == flt->upper_lock_type);
> >  	KASSERT(uobj == NULL ||
> >  	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
> >  	KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
> > @@ -1572,9 +1632,11 @@ uvm_fault_lower_io(
> >  	advice = ufi->entry->advice;
> >  
> >  	/* Upgrade to a write lock if needed. */
> > -	error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj);
> > -	if (error != 0)
> > +	error = uvm_fault_lower_upgrade(flt, uobj);
> > +	if (error != 0) {
> > +		uvmfault_unlockall(ufi, amap, uobj);
> >  		return error;
> > +	}
> >  	uvmfault_unlockall(ufi, amap, NULL);
> >  
> >  	/* update rusage counters */
> > @@ -1610,7 +1672,7 @@ uvm_fault_lower_io(
> >  	/* re-verify the state of the world. */
> >  	locked = uvmfault_relock(ufi);
> >  	if (locked && amap != NULL)
> > -		amap_lock(amap, RW_WRITE);
> > +		amap_lock(amap, flt->upper_lock_type);
> >  
> >  	/* might be changed */
> >  	if (pg != PGO_DONTCARE) {
> > Index: uvm/uvm_page.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_page.c,v
> > diff -u -p -r1.182 uvm_page.c
> > --- uvm/uvm_page.c	10 Mar 2025 14:13:58 -0000	1.182
> > +++ uvm/uvm_page.c	12 Mar 2025 13:14:33 -0000
> > @@ -1366,7 +1366,9 @@ uvm_page_owner_locked_p(struct vm_page *
> >  		    : rw_lock_held(pg->uobject->vmobjlock);
> >  	}
> >  	if (pg->uanon != NULL) {
> > -		return rw_write_held(pg->uanon->an_lock);
> > +		return exclusive
> > +		    ? rw_write_held(pg->uanon->an_lock)
> > +		    : rw_lock_held(pg->uanon->an_lock);
> >  	}
> >  	return 1;
> >  }
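
The reason a failed upgrade restarts the whole fault instead of simply
dropping the read lock and relocking for write is that the unlock/relock
window lets other threads change the amap, invalidating everything that
was looked up under the read lock.  The same hazard is easy to show in
userland, where POSIX rwlocks have no atomic upgrade at all.  Sketch
only; generation and update() are hypothetical names, not UVM code:

	#include <pthread.h>
	#include <stdio.h>

	pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
	int generation;			/* bumped by every writer */

	void
	update(void)
	{
		int seen;

		pthread_rwlock_rdlock(&lock);
		seen = generation;		/* state observed as a reader */
		pthread_rwlock_unlock(&lock);	/* no upgrade: must fully drop */

		pthread_rwlock_wrlock(&lock);
		if (seen != generation) {
			/*
			 * Another writer got in between unlock and wrlock;
			 * every conclusion drawn under the read lock is
			 * stale, so a fault handler must start over here.
			 */
			printf("stale, restart\n");
		}
		generation++;
		pthread_rwlock_unlock(&lock);
	}

	int
	main(void)
	{
		update();
		return 0;
	}

An atomic RW_UPGRADE avoids that window entirely when it succeeds, which
is why the diff prefers it and only falls back to ERESTART on contention.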