Parallel upper fault
The diff below includes all the necessary bits, ported mostly from NetBSD,
to run the upper part of the fault handler in parallel. It has been
tested by many as part of a larger diff and I'd like to commit it
without enabling it by default.
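In outline, the upper part of the fault now takes the amap lock shared
and upgrades to an exclusive lock only when it has to modify something
(allocating a page for an anon, COW, promotion). A simplified sketch of
the protocol, condensed from the diff below:

	amap_lock(amap, flt->upper_lock_type); /* RW_READ unless write fault */
retry:
	error = uvmfault_anonget(ufi, amap, anon);
	if (error == ENOLCK) {
		/* the anon has no page; allocating one needs the write lock */
		error = uvm_fault_upper_upgrade(flt, amap);
		if (error != 0) {
			/* upgrade contended: unlock everything, restart the fault */
			uvmfault_unlockall(ufi, amap, NULL);
			return error;
		}
		goto retry;
	}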
Running the fault handler in parallel does work and it reduces the
contention and sleeping time related to accessing UVM data structures.
That said, running the handler in parallel shifts contention to the
KERNEL_LOCK at the VFS boundary. Obviously this doesn't help
performance, and can even degrade it if the number of CPUs is too high.
If you want to properly test this diff you'll need the "anon pool cache"
diff as well, otherwise all anon allocations will contend on the pool's
mutex.
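For completeness, here is roughly what I assume that diff boils down to:
turning on the per-CPU cache for the anon pool so parallel faults stop
serializing on the pool mutex. A sketch only (uvm_anon_init() and
uvm_anon_pool as in uvm/uvm_anon.c), not the actual diff:

void
uvm_anon_init(void)
{
	pool_init(&uvm_anon_pool, sizeof(struct vm_anon), 0, IPL_MPFLOOR,
	    PR_WAITOK, "anonpl", NULL);
	/* assumed addition: enable the per-CPU cache, see pool_cache_init(9) */
	pool_cache_init(&uvm_anon_pool);
}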
Comments? Oks?
Index: uvm/uvm_anon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_anon.c,v
diff -u -p -r1.62 uvm_anon.c
--- uvm/uvm_anon.c 10 Mar 2025 14:13:58 -0000 1.62
+++ uvm/uvm_anon.c 12 Mar 2025 13:14:33 -0000
@@ -177,6 +177,8 @@ uvm_anon_pagein(struct vm_amap *amap, st
* anon was freed.
*/
return FALSE;
+ case ENOLCK:
+ /* Should not be possible. */
default:
#ifdef DIAGNOSTIC
panic("anon_pagein: uvmfault_anonget -> %d", rv);
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
diff -u -p -r1.165 uvm_fault.c
--- uvm/uvm_fault.c 10 Mar 2025 14:13:58 -0000 1.165
+++ uvm/uvm_fault.c 12 Mar 2025 13:19:00 -0000
@@ -277,6 +277,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
struct vm_anon *anon)
{
struct vm_page *pg;
+ int lock_type;
int error;
KASSERT(rw_lock_held(anon->an_lock));
@@ -305,6 +306,7 @@ uvmfault_anonget(struct uvm_faultinfo *u
/*
* Is page resident? Make sure it is not busy/released.
*/
+ lock_type = rw_status(anon->an_lock);
if (pg) {
KASSERT(pg->pg_flags & PQ_ANON);
KASSERT(pg->uanon == anon);
@@ -326,8 +328,13 @@ uvmfault_anonget(struct uvm_faultinfo *u
uvm_pagewait(pg, anon->an_lock, "anonget");
} else {
/*
- * No page, therefore allocate one.
+ * No page, therefore allocate one. A write lock is
+ * required for this. If the caller didn't supply
+ * one, fail now and have them retry.
*/
+ if (lock_type == RW_READ) {
+ return ENOLCK;
+ }
pg = uvm_pagealloc(NULL, 0, anon, 0);
if (pg == NULL) {
/* Out of memory. Wait a little. */
@@ -498,6 +505,7 @@ uvmfault_promote(struct uvm_faultinfo *u
if (uobjpage != PGO_DONTCARE)
uobj = uobjpage->uobject;
+ KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
anon = uvm_analloc();
@@ -609,6 +617,7 @@ struct uvm_faultctx {
boolean_t wired;
paddr_t pa_flags;
boolean_t promote;
+ int upper_lock_type;
int lower_lock_type;
};
@@ -653,6 +662,7 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
flt.access_type = access_type;
flt.narrow = FALSE; /* assume normal fault for now */
flt.wired = FALSE; /* assume non-wired fault for now */
+ flt.upper_lock_type = RW_READ;
#if notyet
flt.lower_lock_type = RW_READ; /* shared lock for now */
#else
@@ -840,7 +850,13 @@ uvm_fault_check(struct uvm_faultinfo *uf
* if we've got an amap then lock it and extract current anons.
*/
if (amap) {
- amap_lock(amap, RW_WRITE);
+ if ((flt->access_type & PROT_WRITE) != 0) {
+ /*
+ * assume we're about to COW.
+ */
+ flt->upper_lock_type = RW_WRITE;
+ }
+ amap_lock(amap, flt->upper_lock_type);
amap_lookups(&ufi->entry->aref,
flt->startva - ufi->entry->start, *ranons, flt->npages);
} else {
@@ -892,6 +908,36 @@ uvm_fault_check(struct uvm_faultinfo *uf
}
/*
+ * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
+ */
+static inline int
+uvm_fault_upper_upgrade(struct uvm_faultctx *flt, struct vm_amap *amap)
+{
+ KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
+
+ /*
+ * fast path.
+ */
+ if (flt->upper_lock_type == RW_WRITE) {
+ return 0;
+ }
+
+ /*
+ * otherwise try for the upgrade. if we don't get it, unlock
+ * everything, restart the fault and next time around get a writer
+ * lock.
+ */
+ flt->upper_lock_type = RW_WRITE;
+ if (rw_enter(amap->am_lock, RW_UPGRADE|RW_NOSLEEP)) {
+ counters_inc(uvmexp_counters, flt_noup);
+ return ERESTART;
+ }
+ counters_inc(uvmexp_counters, flt_up);
+ KASSERT(flt->upper_lock_type == rw_status(amap->am_lock));
+ return 0;
+}
+
+/*
* uvm_fault_upper_lookup: look up existing h/w mapping and amap.
*
* iterate range of interest:
@@ -914,9 +960,8 @@ uvm_fault_upper_lookup(struct uvm_faulti
paddr_t pa;
int lcv, entered = 0;
- /* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
- rw_write_held(amap->am_lock));
+ rw_status(amap->am_lock) == flt->upper_lock_type);
/*
* map in the backpages and frontpages we found in the amap in hopes
@@ -998,8 +1043,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
struct vm_page *pg = NULL;
int error, ret;
- /* locked: maps(read), amap, anon */
- KASSERT(rw_write_held(amap->am_lock));
+ KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
/*
@@ -1012,6 +1056,7 @@ uvm_fault_upper(struct uvm_faultinfo *uf
* if it succeeds, locks are still valid and locked.
* also, if it is OK, then the anon's page is on the queues.
*/
+retry:
error = uvmfault_anonget(ufi, amap, anon);
switch (error) {
case 0:
@@ -1020,11 +1065,21 @@ uvm_fault_upper(struct uvm_faultinfo *uf
case ERESTART:
return ERESTART;
+ case ENOLCK:
+ /* it needs a write lock: retry */
+ error = uvm_fault_upper_upgrade(flt, amap);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, NULL);
+ return error;
+ }
+ KASSERT(rw_write_held(amap->am_lock));
+ goto retry;
+
default:
return error;
}
- KASSERT(rw_write_held(amap->am_lock));
+ KASSERT(rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
/*
@@ -1039,9 +1094,13 @@ uvm_fault_upper(struct uvm_faultinfo *uf
*
* if we are out of anon VM we wait for RAM to become available.
*/
-
if ((flt->access_type & PROT_WRITE) != 0 && anon->an_ref > 1) {
/* promoting requires a write lock. */
+ error = uvm_fault_upper_upgrade(flt, amap);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, NULL);
+ return error;
+ }
KASSERT(rw_write_held(amap->am_lock));
counters_inc(uvmexp_counters, flt_acow);
@@ -1064,6 +1123,14 @@ uvm_fault_upper(struct uvm_faultinfo *uf
KASSERT(oanon->an_ref > 1);
oanon->an_ref--;
+ /*
+ * note: oanon is still locked, as is the new anon. we
+ * need to check for this later when we unlock oanon; if
+ * oanon != anon, we'll have to unlock anon, too.
+ */
+ KASSERT(anon->an_lock == amap->am_lock);
+ KASSERT(oanon->an_lock == amap->am_lock);
+
#if defined(MULTIPROCESSOR) && !defined(__HAVE_PMAP_MPSAFE_ENTER_COW)
/*
* If there are multiple threads, either uvm or the
@@ -1078,12 +1145,6 @@ uvm_fault_upper(struct uvm_faultinfo *uf
flt->access_type &= ~PROT_WRITE;
}
#endif
-
- /*
- * note: anon is _not_ locked, but we have the sole references
- * to in from amap.
- * thus, no one can get at it until we are done with it.
- */
} else {
counters_inc(uvmexp_counters, flt_anon);
oanon = anon;
@@ -1246,10 +1307,8 @@ uvm_fault_lower_lookup(
* uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
*/
static inline int
-uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
- struct vm_amap *amap, struct uvm_object *uobj)
+uvm_fault_lower_upgrade(struct uvm_faultctx *flt, struct uvm_object *uobj)
{
- KASSERT(uobj != NULL);
KASSERT(flt->lower_lock_type == rw_status(uobj->vmobjlock));
/*
@@ -1265,7 +1324,6 @@ uvm_fault_lower_upgrade(struct uvm_fault
*/
flt->lower_lock_type = RW_WRITE;
if (rw_enter(uobj->vmobjlock, RW_UPGRADE|RW_NOSLEEP)) {
- uvmfault_unlockall(ufi, amap, uobj);
counters_inc(uvmexp_counters, flt_noup);
return ERESTART;
}
@@ -1319,11 +1377,8 @@ uvm_fault_lower(struct uvm_faultinfo *uf
* made it BUSY.
*/
- /*
- * locked:
- */
KASSERT(amap == NULL ||
- rw_write_held(amap->am_lock));
+ rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_status(uobj->vmobjlock) == flt->lower_lock_type);
@@ -1392,6 +1447,11 @@ uvm_fault_lower(struct uvm_faultinfo *uf
KASSERT(amap != NULL);
/* promoting requires a write lock. */
+ error = uvm_fault_upper_upgrade(flt, amap);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, uobj);
+ return error;
+ }
KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL ||
rw_status(uobj->vmobjlock) == flt->lower_lock_type);
@@ -1468,7 +1528,7 @@ uvm_fault_lower(struct uvm_faultinfo *uf
* Note: pg is either the uobjpage or the new page in the new anon.
*/
KASSERT(amap == NULL ||
- rw_write_held(amap->am_lock));
+ rw_status(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_status(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
@@ -1572,9 +1632,11 @@ uvm_fault_lower_io(
advice = ufi->entry->advice;
/* Upgrade to a write lock if needed. */
- error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj);
- if (error != 0)
+ error = uvm_fault_lower_upgrade(flt, uobj);
+ if (error != 0) {
+ uvmfault_unlockall(ufi, amap, uobj);
return error;
+ }
uvmfault_unlockall(ufi, amap, NULL);
/* update rusage counters */
@@ -1610,7 +1672,7 @@ uvm_fault_lower_io(
/* re-verify the state of the world. */
locked = uvmfault_relock(ufi);
if (locked && amap != NULL)
- amap_lock(amap, RW_WRITE);
+ amap_lock(amap, flt->upper_lock_type);
/* might be changed */
if (pg != PGO_DONTCARE) {
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
diff -u -p -r1.182 uvm_page.c
--- uvm/uvm_page.c 10 Mar 2025 14:13:58 -0000 1.182
+++ uvm/uvm_page.c 12 Mar 2025 13:14:33 -0000
@@ -1366,7 +1366,9 @@ uvm_page_owner_locked_p(struct vm_page *
: rw_lock_held(pg->uobject->vmobjlock);
}
if (pg->uanon != NULL) {
- return rw_write_held(pg->uanon->an_lock);
+ return exclusive
+ ? rw_write_held(pg->uanon->an_lock)
+ : rw_lock_held(pg->uanon->an_lock);
}
return 1;
}