From: Martin Pieuchot
Subject: Re: Parallel upper fault
To: tech@openbsd.org
Date: Mon, 14 Apr 2025 09:47:34 +0200

On 12/03/25(Wed) 14:33, Martin Pieuchot wrote:
> Diff below includes all the necessary bits, ported mostly from NetBSD,
> to run the upper part of the fault handler in parallel.  It has been
> tested by many as part of a larger diff and I'd like to commit it
> without enabling it by default.
> 
> Running the fault handler in parallel does work and it reduces the
> contention and sleeping time related to accesses of UVM data structures.
> 
> That said, running the handler in parallel leads to a contention shift
> onto the KERNEL_LOCK at the VFS boundary.  Obviously this doesn't help
> performance, and can even degrade it if the number of CPUs is too high.
> 
> If you want to properly test this diff you'll need the "anon pool
> cache" diff as well, otherwise all anon allocations will contend on
> the pool's mutex.

ok?
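For readers new to this code: the heart of the diff is an
upgrade-or-restart idiom on the amap lock.  The fault handler now takes
a read lock by default, upgrades it to a write lock without sleeping
when a promotion requires one, and, if the upgrade is contended,
unwinds and redoes the fault with a write lock from the start.  A
minimal standalone sketch of the idiom (illustrative only, not part of
the diff; the counter bookkeeping is omitted):

	#include <sys/param.h>
	#include <sys/rwlock.h>
	#include <sys/errno.h>

	/*
	 * Sketch: try to turn a shared hold on `lock' into an
	 * exclusive one without sleeping.  On contention, return
	 * ERESTART so the caller unwinds all its locks; `*lock_type'
	 * is left at RW_WRITE so that the retried fault grabs a
	 * write lock from the beginning.
	 */
	static inline int
	upgrade_or_restart(struct rwlock *lock, int *lock_type)
	{
		if (*lock_type == RW_WRITE)
			return 0;		/* already exclusive */

		*lock_type = RW_WRITE;
		if (rw_enter(lock, RW_UPGRADE|RW_NOSLEEP))
			return ERESTART;	/* contended: redo the fault */

		return 0;
	}

Failing the upgrade instead of sleeping keeps the lock ordering simple:
the faulting thread never waits for a write lock while holding other
locks, it retries from a clean slate instead.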
Index: uvm/uvm_anon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
diff -u -p -r1.62 uvm_anon.c
--- uvm/uvm_anon.c	10 Mar 2025 14:13:58 -0000	1.62
+++ uvm/uvm_anon.c	12 Mar 2025 13:14:33 -0000
@@ -177,6 +177,8 @@ uvm_anon_pagein(struct vm_amap *amap, st
 		 * anon was freed.
 		 */
 		return FALSE;
+	case ENOLCK:
+		/* Should not be possible. */
 	default:
 #ifdef DIAGNOSTIC
 		panic("anon_pagein: uvmfault_anonget -> %d", rv);
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
diff -u -p -r1.165 uvm_fault.c
--- uvm/uvm_fault.c	10 Mar 2025 14:13:58 -0000	1.165
+++ uvm/uvm_fault.c	12 Mar 2025 13:19:00 -0000
@@ -277,6 +277,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
     struct vm_anon *anon)
 {
 	struct vm_page *pg;
+	int lock_type;
 	int error;
 
 	KASSERT(rw_lock_held(anon->an_lock));
@@ -305,6 +306,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
 	/*
 	 * Is page resident?  Make sure it is not busy/released.
 	 */
+	lock_type = rw_status(anon->an_lock);
 	if (pg) {
 		KASSERT(pg->pg_flags & PQ_ANON);
 		KASSERT(pg->uanon == anon);
@@ -326,8 +328,13 @@ uvmfault_anonget(struct uvm_faultinfo *u
 			uvm_pagewait(pg, anon->an_lock, "anonget");
 		} else {
 			/*
-			 * No page, therefore allocate one.
+			 * No page, therefore allocate one.  A write lock is
+			 * required for this.  If the caller didn't supply
+			 * one, fail now and have them retry.
 			 */
+			if (lock_type == RW_READ) {
+				return ENOLCK;
+			}
 			pg = uvm_pagealloc(NULL, 0, anon, 0);
 			if (pg == NULL) {
 				/* Out of memory.  Wait a little. */
@@ -498,6 +505,7 @@ uvmfault_promote(struct uvm_faultinfo *u
 	if (uobjpage != PGO_DONTCARE)
 		uobj = uobjpage->uobject;
 
+	KASSERT(rw_write_held(amap->am_lock));
 	KASSERT(uobj == NULL ||
 	    rw_lock_held(uobj->vmobjlock));
 	anon = uvm_analloc();
@@ -609,6 +617,7 @@ struct uvm_faultctx {
 	boolean_t wired;
 	paddr_t pa_flags;
 	boolean_t promote;
+	int upper_lock_type;
 	int lower_lock_type;
 };
 
@@ -653,6 +662,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
 	flt.access_type = access_type;
 	flt.narrow = FALSE;		/* assume normal fault for now */
 	flt.wired = FALSE;		/* assume non-wired fault for now */
+	flt.upper_lock_type = RW_READ;
 #if notyet
 	flt.lower_lock_type = RW_READ;	/* shared lock for now */
 #else
@@ -840,7 +850,13 @@ uvm_fault_check(struct uvm_faultinfo *uf
 	 * if we've got an amap then lock it and extract current anons.
 	 */
 	if (amap) {
-		amap_lock(amap, RW_WRITE);
+		if ((flt->access_type & PROT_WRITE) != 0) {
+			/*
+			 * assume we're about to COW.
+			 */
+			flt->upper_lock_type = RW_WRITE;
+		}
+		amap_lock(amap, flt->upper_lock_type);
 		amap_lookups(&ufi->entry->aref,
 		    flt->startva - ufi->entry->start, *ranons, flt->npages);
 	} else {
@@ -892,6 +908,36 @@ uvm_fault_check(struct uvm_faultinfo *uf
 }
 
 /*
+ * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
+ */
+static inline int
+uvm_fault_upper_upgrade(struct uvm_faultctx *flt, struct vm_amap *amap)
+{
+	KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
+
+	/*
+	 * fast path.
+	 */
+	if (flt->upper_lock_type == RW_WRITE) {
+		return 0;
+	}
+
+	/*
+	 * otherwise try for the upgrade.  if we don't get it, unlock
+	 * everything, restart the fault and next time around get a writer
+	 * lock.
+	 */
+	flt->upper_lock_type = RW_WRITE;
+	if (rw_enter(amap->am_lock, RW_UPGRADE|RW_NOSLEEP)) {
+		counters_inc(uvmexp_counters, flt_noup);
+		return ERESTART;
+	}
+	counters_inc(uvmexp_counters, flt_up);
+	KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
+	return 0;
+}
+
+/*
  * uvm_fault_upper_lookup: look up existing h/w mapping and amap.
  *
  * iterate range of interest:
@@ -914,9 +960,8 @@ uvm_fault_upper_lookup(struct uvm_faulti
 	paddr_t pa;
 	int lcv, entered = 0;
 
-	/* locked: maps(read), amap(if there) */
 	KASSERT(amap == NULL ||
-	    rw_write_held(amap->am_lock));
+	    rw_status(amap->am_lock) == flt->upper_lock_type);
 
 	/*
 	 * map in the backpages and frontpages we found in the amap in hopes
@@ -998,8 +1043,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 	struct vm_page *pg = NULL;
 	int error, ret;
 
-	/* locked: maps(read), amap, anon */
-	KASSERT(rw_write_held(amap->am_lock));
+	KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
 	KASSERT(anon->an_lock == amap->am_lock);
 
 	/*
@@ -1012,6 +1056,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 	 * if it succeeds, locks are still valid and locked.
 	 * also, if it is OK, then the anon's page is on the queues.
 	 */
+retry:
 	error = uvmfault_anonget(ufi, amap, anon);
 	switch (error) {
 	case 0:
@@ -1020,11 +1065,21 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 	case ERESTART:
 		return ERESTART;
 
+	case ENOLCK:
+		/* it needs a write lock: retry */
+		error = uvm_fault_upper_upgrade(flt, amap);
+		if (error != 0) {
+			uvmfault_unlockall(ufi, amap, NULL);
+			return error;
+		}
+		KASSERT(rw_write_held(amap->am_lock));
+		goto retry;
+
 	default:
 		return error;
 	}
 
-	KASSERT(rw_write_held(amap->am_lock));
+	KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
 	KASSERT(anon->an_lock == amap->am_lock);
 
 	/*
@@ -1039,9 +1094,13 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 	 *
 	 * if we are out of anon VM we wait for RAM to become available.
 	 */
-
 	if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
 		/* promoting requires a write lock. */
+		error = uvm_fault_upper_upgrade(flt, amap);
+		if (error != 0) {
+			uvmfault_unlockall(ufi, amap, NULL);
+			return error;
+		}
 		KASSERT(rw_write_held(amap->am_lock));
 
 		counters_inc(uvmexp_counters, flt_acow);
@@ -1064,6 +1123,14 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 		KASSERT(oanon->an_ref > 1);
 		oanon->an_ref--;
 
+		/*
+		 * note: oanon is still locked, as is the new anon.  we
+		 * need to check for this later when we unlock oanon; if
+		 * oanon != anon, we'll have to unlock anon, too.
+		 */
+		KASSERT(anon->an_lock == amap->am_lock);
+		KASSERT(oanon->an_lock == amap->am_lock);
+
 #if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
 		/*
 		 * If there are multiple threads, either uvm or the
@@ -1078,12 +1145,6 @@ uvm_fault_upper(struct uvm_faultinfo *uf
 			flt->access_type &= ~PROT_WRITE;
 		}
 #endif
-
-		/*
-		 * note: anon is _not_ locked, but we have the sole references
-		 * to in from amap.
-		 * thus, no one can get at it until we are done with it.
-		 */
 	} else {
 		counters_inc(uvmexp_counters, flt_anon);
 		oanon = anon;
@@ -1246,10 +1307,8 @@ uvm_fault_lower_lookup(
  * uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
  */
 static inline int
-uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
-    struct vm_amap *amap, struct uvm_object *uobj)
+uvm_fault_lower_upgrade(struct uvm_faultctx *flt, struct uvm_object *uobj)
 {
-	KASSERT(uobj != NULL);
 	KASSERT(flt->lower_lock_type == rw_status(uobj->vmobjlock));
 
 	/*
@@ -1265,7 +1324,6 @@ uvm_fault_lower_upgrade(struct uvm_fault
 	 */
 	flt->lower_lock_type = RW_WRITE;
 	if (rw_enter(uobj->vmobjlock, RW_UPGRADE|RW_NOSLEEP)) {
-		uvmfault_unlockall(ufi, amap, uobj);
 		counters_inc(uvmexp_counters, flt_noup);
 		return ERESTART;
 	}
@@ -1319,11 +1377,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf
 	 * made it BUSY.
 	 */
 
-	/*
-	 * locked:
-	 */
 	KASSERT(amap == NULL ||
-	    rw_write_held(amap->am_lock));
+	    rw_status(amap->am_lock) == flt->upper_lock_type);
 	KASSERT(uobj == NULL ||
 	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
 
@@ -1392,6 +1447,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf
 		KASSERT(amap != NULL);
 
 		/* promoting requires a write lock. */
+		error = uvm_fault_upper_upgrade(flt, amap);
+		if (error != 0) {
+			uvmfault_unlockall(ufi, amap, uobj);
+			return error;
+		}
 		KASSERT(rw_write_held(amap->am_lock));
 		KASSERT(uobj == NULL ||
 		    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
@@ -1468,7 +1528,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
 	 * Note: pg is either the uobjpage or the new page in the new anon.
 	 */
 	KASSERT(amap == NULL ||
-	    rw_write_held(amap->am_lock));
+	    rw_status(amap->am_lock) == flt->upper_lock_type);
 	KASSERT(uobj == NULL ||
 	    rw_status(uobj->vmobjlock) == flt->lower_lock_type);
 	KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
@@ -1572,9 +1632,11 @@ uvm_fault_lower_io(
 	advice = ufi->entry->advice;
 
 	/* Upgrade to a write lock if needed. */
-	error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj);
-	if (error != 0)
+	error = uvm_fault_lower_upgrade(flt, uobj);
+	if (error != 0) {
+		uvmfault_unlockall(ufi, amap, uobj);
 		return error;
+	}
 	uvmfault_unlockall(ufi, amap, NULL);
 
 	/* update rusage counters */
@@ -1610,7 +1672,7 @@ uvm_fault_lower_io(
 	/* re-verify the state of the world. */
 	locked = uvmfault_relock(ufi);
 	if (locked && amap != NULL)
-		amap_lock(amap, RW_WRITE);
+		amap_lock(amap, flt->upper_lock_type);
 
 	/* might be changed */
 	if (pg != PGO_DONTCARE) {
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
diff -u -p -r1.182 uvm_page.c
--- uvm/uvm_page.c	10 Mar 2025 14:13:58 -0000	1.182
+++ uvm/uvm_page.c	12 Mar 2025 13:14:33 -0000
@@ -1366,7 +1366,9 @@ uvm_page_owner_locked_p(struct vm_page *
 		    : rw_lock_held(pg->uobject->vmobjlock);
 	}
 	if (pg->uanon != NULL) {
-		return rw_write_held(pg->uanon->an_lock);
+		return exclusive
+		    ? rw_write_held(pg->uanon->an_lock)
+		    : rw_lock_held(pg->uanon->an_lock);
 	}
 	return 1;
 }
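To make the new uvmfault_anonget() contract explicit, here is the
caller side condensed from the uvm_fault_upper() hunks above (sketch
only, error paths trimmed):

	retry:
		error = uvmfault_anonget(ufi, amap, anon);
		switch (error) {
		case 0:
			break;		/* page resident, locks still held */
		case ENOLCK:
			/*
			 * A read lock was not enough to allocate the
			 * page: upgrade to a write lock and retry...
			 */
			error = uvm_fault_upper_upgrade(flt, amap);
			if (error != 0) {
				/* ...or unwind and redo the whole fault. */
				uvmfault_unlockall(ufi, amap, NULL);
				return error;
			}
			goto retry;
		default:
			return error;	/* ERESTART and real errors */
		}

In other words, ENOLCK never escapes uvmfault_anonget()'s callers; it
only signals "come back with a write lock".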