Download raw body.
improve UVM performance on unmap
While working on prometheus I had a version where the daemon did a
munmap(2) and mmap(2) for every write to that memory map. This resulted in
high spin and system time.
Out of curiosity I used dtrace kprofile and it pointed the problem
straight at a very inefficient use of uvm_pagelookup(), e.g.:
uvm_objtree_RBT_COMPARE+0xb
uvm_pagelookup+0x3e
uvn_flush+0x17e
uvn_detach+0x7e
uvm_unmap_detach+0xf1
sys_munmap+0x185
syscall+0x5f9
Xsyscall+0x128
kernel
So uvn_flush() is very inefficient at unmapping the file and burns most of
its time in uvm_pagelookup().
There are a few UVM functions that roughly do:
for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
pp = uvm_pagelookup(uobj, curoff);
if (pp == NULL)
continue;
...
}
This is a very expensive way to write something that is almost an
RB_FOREACH(). I went and implemented uvm_pagerangefirst() and
uvm_pagerangenext() to build an iterator that is more efficient.
As usual with these iterators if the page is removed from the RB tree the
code needs to prefetch the next element similar to how RB_FOREACH_SAFE()
works.
On top of that we have the following issue:
If there is a sleep point (which releases the object lock) prefetching is no
longer enough. Once we release the lock someone else can modify the
tree and the page we cached. So after such a sleep the loop needs to be
restarted with a new start point by calling uvm_pagerangefirst().
This is what uvm_km_pgremove() does for the PG_BUSY case. uao_flush() is
very similar to uvm_km_pgremove() but was written in a very strange way.
uvm_obj_unwire() is the most trivial conversion since it does not alter
the RB tree at all.
Finally uvn_flush(), that function is a beast and the clustered page out
alters the tree as well and so any pageout requires a restart like the
PG_BUSY sleep.
For reference here are the GENERIC.MP build times for make -j 16 and
make -j 32 on my amd64 box:
Before:
build with 16 jobs (smt = 0)
run 1 98.94 real 773.49 user 468.25 sys
run 2 98.31 real 773.19 user 462.84 sys
run 3 98.49 real 771.10 user 466.63 sys
run 4 98.60 real 773.76 user 464.28 sys
run 5 98.52 real 770.54 user 468.98 sys
avg over 5 runs: 98.572 real 772.416 user 466.196 sys
build with 32 jobs (smt = 1)
run 1 102.44 real 1001.73 user 1471.51 sys
run 2 102.56 real 999.52 user 1474.85 sys
run 3 102.23 real 999.06 user 1467.81 sys
run 4 102.30 real 1000.40 user 1468.36 sys
run 5 102.44 real 1001.60 user 1474.12 sys
avg over 5 runs: 102.394 real 1000.46 user 1471.33 sys
With diff:
build with 16 jobs (smt = 0)
run 1 95.79 real 769.67 user 436.77 sys
run 2 95.97 real 770.48 user 435.73 sys
run 3 95.73 real 769.60 user 436.77 sys
run 4 95.65 real 769.33 user 433.68 sys
run 5 96.29 real 770.49 user 435.78 sys
avg over 5 runs: 95.886 real 769.914 user 435.746 sys
build with 32 jobs (smt = 1)
run 1 93.70 real 1018.68 user 1250.32 sys
run 2 93.94 real 1014.58 user 1261.76 sys
run 3 93.89 real 1015.09 user 1260.52 sys
run 4 93.65 real 1016.37 user 1253.42 sys
run 5 94.04 real 1014.79 user 1262.75 sys
avg over 5 runs: 93.844 real 1015.9 user 1257.75 sys
With the diff and 16 jobs the build time is about 3sec faster and around
30sec (~7%) of system time is saved.
For 32 jobs on 32 CPUS the results are even better. Realtime drops by
9sec (~10%) and system time drops by >200sec (>15%).
uvn_flush() is a big abuser of the page queue lock and so making that
loop better reduces contention and therefore helps to reduce spin time on
one of the busiest mutexes in the system. The page queue lock usage in
uvn_flush() has a lot of bad smell. This is something to look into in a
future diff.
--
:wq Claudio
? uvm/udiff-1
Index: uvm/uvm_aobj.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
diff -u -p -r1.122 uvm_aobj.c
--- uvm/uvm_aobj.c 11 Feb 2026 22:34:40 -0000 1.122
+++ uvm/uvm_aobj.c 27 Mar 2026 12:28:08 -0000
@@ -858,8 +858,7 @@ boolean_t
uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *) uobj;
- struct vm_page *pg;
- voff_t curoff;
+ struct vm_page *pg, *npg;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
@@ -885,23 +884,15 @@ uao_flush(struct uvm_object *uobj, voff_
return TRUE;
}
- curoff = start;
- for (;;) {
- if (curoff < stop) {
- pg = uvm_pagelookup(uobj, curoff);
- curoff += PAGE_SIZE;
- if (pg == NULL)
- continue;
- } else {
- break;
- }
-
+ again:
+ for (pg = uvm_pagerangefirst(uobj, start, stop); pg != NULL; pg = npg) {
+ npg = uvm_pagerangenext(pg, stop);
/* Make sure page is unbusy, else wait for it. */
if (pg->pg_flags & PG_BUSY) {
+ start = pg->offset; /* loop back to us */
uvm_pagewait(pg, uobj->vmobjlock, "uaoflsh");
rw_enter(uobj->vmobjlock, RW_WRITE);
- curoff -= PAGE_SIZE;
- continue;
+ goto again;
}
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
Index: uvm/uvm_km.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
diff -u -p -r1.159 uvm_km.c
--- uvm/uvm_km.c 13 Nov 2025 10:55:51 -0000 1.159
+++ uvm/uvm_km.c 27 Mar 2026 12:28:08 -0000
@@ -247,10 +247,9 @@ uvm_km_suballoc(struct vm_map *map, vadd
void
uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva)
{
- const voff_t start = startva - vm_map_min(kernel_map);
- const voff_t end = endva - vm_map_min(kernel_map);
- struct vm_page *pp;
- voff_t curoff;
+ voff_t start = startva - vm_map_min(kernel_map);
+ const voff_t stop = endva - vm_map_min(kernel_map);
+ struct vm_page *pp, *npp;
int slot;
int swpgonlydelta = 0;
@@ -258,17 +257,18 @@ uvm_km_pgremove(struct uvm_object *uobj,
KASSERT(rw_write_held(uobj->vmobjlock));
pmap_remove(pmap_kernel(), startva, endva);
- for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) {
- pp = uvm_pagelookup(uobj, curoff);
- if (pp && pp->pg_flags & PG_BUSY) {
+ again:
+ for (pp = uvm_pagerangefirst(uobj, start, stop); pp != NULL; pp = npp) {
+ npp = uvm_pagerangenext(pp, stop);
+ if (pp->pg_flags & PG_BUSY) {
+ start = pp->offset; /* loop back to us */
uvm_pagewait(pp, uobj->vmobjlock, "km_pgrm");
rw_enter(uobj->vmobjlock, RW_WRITE);
- curoff -= PAGE_SIZE; /* loop back to us */
- continue;
+ goto again;
}
/* free the swap slot, then the page */
- slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT);
+ slot = uao_dropswap(uobj, pp->offset >> PAGE_SHIFT);
if (pp != NULL) {
uvm_pagefree(pp);
Index: uvm/uvm_object.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_object.c,v
diff -u -p -r1.28 uvm_object.c
--- uvm/uvm_object.c 10 Dec 2025 08:38:18 -0000 1.28
+++ uvm/uvm_object.c 27 Mar 2026 12:28:08 -0000
@@ -190,15 +190,13 @@ error:
* => caller must pass page-aligned start and end values
*/
void
-uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t end)
+uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t stop)
{
struct vm_page *pg;
- off_t offset;
rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK);
- for (offset = start; offset < end; offset += PAGE_SIZE) {
- pg = uvm_pagelookup(uobj, offset);
-
+ for (pg = uvm_pagerangefirst(uobj, start, stop); pg != NULL;
+ pg = uvm_pagerangenext(pg, stop)) {
KASSERT(pg != NULL);
KASSERT(!(pg->pg_flags & PG_RELEASED));
Index: uvm/uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
diff -u -p -r1.188 uvm_page.c
--- uvm/uvm_page.c 11 Feb 2026 22:34:41 -0000 1.188
+++ uvm/uvm_page.c 27 Mar 2026 12:28:08 -0000
@@ -1219,6 +1219,36 @@ uvm_pagelookup(struct uvm_object *obj, v
return (pg);
}
+struct vm_page *
+uvm_pagerangefirst(struct uvm_object *obj, voff_t start, voff_t stop)
+{
+ /* XXX if stack is too much, handroll */
+ struct vm_page p, *pg;
+
+ p.offset = start;
+ pg = RBT_NFIND(uvm_objtree, &obj->memt, &p);
+ if (pg == NULL || pg->offset >= stop)
+ return (NULL);
+
+ KASSERT(obj->uo_npages != 0);
+ KASSERT((pg->pg_flags & PG_RELEASED) == 0 ||
+ (pg->pg_flags & PG_BUSY) != 0);
+ return (pg);
+}
+
+struct vm_page *
+uvm_pagerangenext(struct vm_page *prev, voff_t stop)
+{
+ struct vm_page *pg;
+
+ pg = RBT_NEXT(uvm_objtree, prev);
+ if (pg == NULL || pg->offset >= stop)
+ return (NULL);
+ KASSERT((pg->pg_flags & PG_RELEASED) == 0 ||
+ (pg->pg_flags & PG_BUSY) != 0);
+ return (pg);
+}
+
/*
* uvm_pagewire: wire the page, thus removing it from the daemon's grasp
*/
Index: uvm/uvm_page.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.h,v
diff -u -p -r1.73 uvm_page.h
--- uvm/uvm_page.h 10 Mar 2025 18:54:38 -0000 1.73
+++ uvm/uvm_page.h 27 Mar 2026 12:28:08 -0000
@@ -228,6 +228,8 @@ void uvm_pageclean(struct vm_page *);
void uvm_pagefree(struct vm_page *);
void uvm_page_unbusy(struct vm_page **, int);
struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t);
+struct vm_page *uvm_pagerangefirst(struct uvm_object *, voff_t, voff_t);
+struct vm_page *uvm_pagerangenext(struct vm_page *, voff_t);
void uvm_pageunwire(struct vm_page *);
void uvm_pagewait(struct vm_page *, struct rwlock *, const char *);
void uvm_pagewire(struct vm_page *);
Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
diff -u -p -r1.151 uvm_vnode.c
--- uvm/uvm_vnode.c 29 Dec 2025 16:07:14 -0000 1.151
+++ uvm/uvm_vnode.c 27 Mar 2026 12:28:08 -0000
@@ -583,11 +583,10 @@ boolean_t
uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
- struct vm_page *pp, *ptmp;
+ struct vm_page *pp, *npp, *ptmp;
struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
int npages, result, lcv;
boolean_t retval, need_iosync, needs_clean;
- voff_t curoff;
KASSERT(rw_write_held(uobj->vmobjlock));
@@ -614,19 +613,19 @@ uvn_flush(struct uvm_object *uobj, voff_
*/
if ((flags & PGO_CLEANIT) != 0) {
KASSERT(uobj->pgops->pgo_mk_pcluster != 0);
- for (curoff = start ; curoff < stop; curoff += PAGE_SIZE) {
- if ((pp = uvm_pagelookup(uobj, curoff)) != NULL)
- atomic_clearbits_int(&pp->pg_flags,
- PG_CLEANCHK);
+ for (pp = uvm_pagerangefirst(uobj, start, stop); pp != NULL;
+ pp = uvm_pagerangenext(pp, stop)) {
+ atomic_clearbits_int(&pp->pg_flags, PG_CLEANCHK);
}
}
ppsp = NULL; /* XXX: shut up gcc */
uvm_lock_pageq();
+ again:
/* locked: both page queues */
- for (curoff = start; curoff < stop; curoff += PAGE_SIZE) {
- if ((pp = uvm_pagelookup(uobj, curoff)) == NULL)
- continue;
+ for (pp = uvm_pagerangefirst(uobj, start, stop); pp != NULL; pp = npp) {
+ npp = uvm_pagerangenext(pp, stop);
+
/*
* handle case where we do not need to clean page (either
* because we are not clean or because page is not dirty or
@@ -668,13 +667,13 @@ uvn_flush(struct uvm_object *uobj, voff_
uvm_lock_pageq();
} else if (flags & PGO_FREE) {
if (pp->pg_flags & PG_BUSY) {
+ start = pp->offset;
uvm_unlock_pageq();
uvm_pagewait(pp, uobj->vmobjlock,
"uvn_flsh");
rw_enter(uobj->vmobjlock, RW_WRITE);
uvm_lock_pageq();
- curoff -= PAGE_SIZE;
- continue;
+ goto again;
} else {
pmap_page_protect(pp, PROT_NONE);
/* dequeue to prevent lock recursion */
@@ -696,6 +695,7 @@ ReTry:
* let uvm_pager_put attempted a clustered page out.
* note: locked: page queues.
*/
+ start = pp->offset + PAGE_SIZE;
atomic_setbits_int(&pp->pg_flags, PG_BUSY);
UVM_PAGE_OWN(pp, "uvn_flush");
pmap_page_protect(pp, PROT_READ);
@@ -803,6 +803,7 @@ ReTry:
}
} /* end of "lcv" for loop */
+ goto again;
} /* end of "pp" for loop */
improve UVM performance on unmap