From: Claudio Jeker Subject: improve UVM performance on unmap To: tech@openbsd.org Date: Fri, 27 Mar 2026 14:03:44 +0100 While working on prometheus I had a version where the daemon did a munmap(2) and mmap(2) for every write to that memory map. This resulted in high spin and system time. Out of curiosity I used dtrace kprofile and it pointed the problem straight at a very inefficient use of uvm_pagelookup(), e.g.: uvm_objtree_RBT_COMPARE+0xb uvm_pagelookup+0x3e uvn_flush+0x17e uvn_detach+0x7e uvm_unmap_detach+0xf1 sys_munmap+0x185 syscall+0x5f9 Xsyscall+0x128 kernel So uvn_flush() is very inefficient at unmapping the file and burns most of its time in uvm_pagelookup(). There are a few UVM functions that roughly do: for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { pp = uvm_pagelookup(uobj, curoff); if (pp == NULL) continue; ... } This is a very expensive way to write something that is almost an RB_FOREACH(). I went and implemented uvm_pagerangefirst() and uvm_pagerangenext() to build an iterator that is more efficient. As usual with these iterators, if the page is removed from the RB tree the code needs to prefetch the next element, similar to how RB_FOREACH_SAFE() works. On top of that we have the following issue: If there is a sleep point (which releases the object lock) prefetching is no longer enough. Once we release the lock someone else can modify the tree and the page we cached. So after such a sleep the loop needs to be restarted with a new start point by calling uvm_pagerangefirst(). This is what uvm_km_pgremove() does for the PG_BUSY case. uao_flush() is very similar to uvm_km_pgremove() but was written in a very strange way. uvm_obj_unwire() is the most trivial conversion since it does not alter the RB tree at all. Finally uvn_flush(), that function is a beast and the clustered page out alters the tree as well, and so any pageout requires a restart like the PG_BUSY sleep. 
For reference here are the GENERIC.MP build times for make -j 16 and make -j 32 on my amd64 box: Before: build with 16 jobs (smt = 0) run 1 98.94 real 773.49 user 468.25 sys run 2 98.31 real 773.19 user 462.84 sys run 3 98.49 real 771.10 user 466.63 sys run 4 98.60 real 773.76 user 464.28 sys run 5 98.52 real 770.54 user 468.98 sys avg over 5 runs: 98.572 real 772.416 user 466.196 sys build with 32 jobs (smt = 1) run 1 102.44 real 1001.73 user 1471.51 sys run 2 102.56 real 999.52 user 1474.85 sys run 3 102.23 real 999.06 user 1467.81 sys run 4 102.30 real 1000.40 user 1468.36 sys run 5 102.44 real 1001.60 user 1474.12 sys avg over 5 runs: 102.394 real 1000.46 user 1471.33 sys With diff: build with 16 jobs (smt = 0) run 1 95.79 real 769.67 user 436.77 sys run 2 95.97 real 770.48 user 435.73 sys run 3 95.73 real 769.60 user 436.77 sys run 4 95.65 real 769.33 user 433.68 sys run 5 96.29 real 770.49 user 435.78 sys avg over 5 runs: 95.886 real 769.914 user 435.746 sys build with 32 jobs (smt = 1) run 1 93.70 real 1018.68 user 1250.32 sys run 2 93.94 real 1014.58 user 1261.76 sys run 3 93.89 real 1015.09 user 1260.52 sys run 4 93.65 real 1016.37 user 1253.42 sys run 5 94.04 real 1014.79 user 1262.75 sys avg over 5 runs: 93.844 real 1015.9 user 1257.75 sys With the diff and 16 jobs the build time is about 3sec faster and around 30sec (~7%) of system time is saved. For 32 jobs on 32 CPUs the results are even better. Realtime drops by 9sec (~10%) and system time drops by >200sec (>15%). uvn_flush() is a big abuser of the page queue lock and so making that loop better reduces contention and therefore helps to reduce spin time on one of the busiest mutexes in the system. The page queue lock usage in uvn_flush() has a lot of bad smell. This is something to look into in a future diff. -- :wq Claudio ? 
uvm/udiff-1 Index: uvm/uvm_aobj.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v diff -u -p -r1.122 uvm_aobj.c --- uvm/uvm_aobj.c 11 Feb 2026 22:34:40 -0000 1.122 +++ uvm/uvm_aobj.c 27 Mar 2026 12:28:08 -0000 @@ -858,8 +858,7 @@ boolean_t uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) { struct uvm_aobj *aobj = (struct uvm_aobj *) uobj; - struct vm_page *pg; - voff_t curoff; + struct vm_page *pg, *npg; KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_write_held(uobj->vmobjlock)); @@ -885,23 +884,15 @@ uao_flush(struct uvm_object *uobj, voff_ return TRUE; } - curoff = start; - for (;;) { - if (curoff < stop) { - pg = uvm_pagelookup(uobj, curoff); - curoff += PAGE_SIZE; - if (pg == NULL) - continue; - } else { - break; - } - + again: + for (pg = uvm_pagerangefirst(uobj, start, stop); pg != NULL; pg = npg) { + npg = uvm_pagerangenext(pg, stop); /* Make sure page is unbusy, else wait for it. */ if (pg->pg_flags & PG_BUSY) { + start = pg->offset; /* loop back to us */ uvm_pagewait(pg, uobj->vmobjlock, "uaoflsh"); rw_enter(uobj->vmobjlock, RW_WRITE); - curoff -= PAGE_SIZE; - continue; + goto again; } switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) { Index: uvm/uvm_km.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_km.c,v diff -u -p -r1.159 uvm_km.c --- uvm/uvm_km.c 13 Nov 2025 10:55:51 -0000 1.159 +++ uvm/uvm_km.c 27 Mar 2026 12:28:08 -0000 @@ -247,10 +247,9 @@ uvm_km_suballoc(struct vm_map *map, vadd void uvm_km_pgremove(struct uvm_object *uobj, vaddr_t startva, vaddr_t endva) { - const voff_t start = startva - vm_map_min(kernel_map); - const voff_t end = endva - vm_map_min(kernel_map); - struct vm_page *pp; - voff_t curoff; + voff_t start = startva - vm_map_min(kernel_map); + const voff_t stop = endva - vm_map_min(kernel_map); + struct vm_page *pp, *npp; int slot; int swpgonlydelta = 0; @@ -258,17 +257,18 @@ uvm_km_pgremove(struct 
uvm_object *uobj, KASSERT(rw_write_held(uobj->vmobjlock)); pmap_remove(pmap_kernel(), startva, endva); - for (curoff = start ; curoff < end ; curoff += PAGE_SIZE) { - pp = uvm_pagelookup(uobj, curoff); - if (pp && pp->pg_flags & PG_BUSY) { + again: + for (pp = uvm_pagerangefirst(uobj, start, stop); pp != NULL; pp = npp) { + npp = uvm_pagerangenext(pp, stop); + if (pp->pg_flags & PG_BUSY) { + start = pp->offset; /* loop back to us */ uvm_pagewait(pp, uobj->vmobjlock, "km_pgrm"); rw_enter(uobj->vmobjlock, RW_WRITE); - curoff -= PAGE_SIZE; /* loop back to us */ - continue; + goto again; } /* free the swap slot, then the page */ - slot = uao_dropswap(uobj, curoff >> PAGE_SHIFT); + slot = uao_dropswap(uobj, pp->offset >> PAGE_SHIFT); if (pp != NULL) { uvm_pagefree(pp); Index: uvm/uvm_object.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_object.c,v diff -u -p -r1.28 uvm_object.c --- uvm/uvm_object.c 10 Dec 2025 08:38:18 -0000 1.28 +++ uvm/uvm_object.c 27 Mar 2026 12:28:08 -0000 @@ -190,15 +190,13 @@ error: * => caller must pass page-aligned start and end values */ void -uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t end) +uvm_obj_unwire(struct uvm_object *uobj, voff_t start, voff_t stop) { struct vm_page *pg; - off_t offset; rw_enter(uobj->vmobjlock, RW_WRITE | RW_DUPOK); - for (offset = start; offset < end; offset += PAGE_SIZE) { - pg = uvm_pagelookup(uobj, offset); - + for (pg = uvm_pagerangefirst(uobj, start, stop); pg != NULL; + pg = uvm_pagerangenext(pg, stop)) { KASSERT(pg != NULL); KASSERT(!(pg->pg_flags & PG_RELEASED)); Index: uvm/uvm_page.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_page.c,v diff -u -p -r1.188 uvm_page.c --- uvm/uvm_page.c 11 Feb 2026 22:34:41 -0000 1.188 +++ uvm/uvm_page.c 27 Mar 2026 12:28:08 -0000 @@ -1219,6 +1219,36 @@ uvm_pagelookup(struct uvm_object *obj, v return (pg); } +struct vm_page * +uvm_pagerangefirst(struct 
uvm_object *obj, voff_t start, voff_t stop) +{ + /* XXX if stack is too much, handroll */ + struct vm_page p, *pg; + + p.offset = start; + pg = RBT_NFIND(uvm_objtree, &obj->memt, &p); + if (pg == NULL || pg->offset >= stop) + return (NULL); + + KASSERT(obj->uo_npages != 0); + KASSERT((pg->pg_flags & PG_RELEASED) == 0 || + (pg->pg_flags & PG_BUSY) != 0); + return (pg); +} + +struct vm_page * +uvm_pagerangenext(struct vm_page *prev, voff_t stop) +{ + struct vm_page *pg; + + pg = RBT_NEXT(uvm_objtree, prev); + if (pg == NULL || pg->offset >= stop) + return (NULL); + KASSERT((pg->pg_flags & PG_RELEASED) == 0 || + (pg->pg_flags & PG_BUSY) != 0); + return (pg); +} + /* * uvm_pagewire: wire the page, thus removing it from the daemon's grasp */ Index: uvm/uvm_page.h =================================================================== RCS file: /cvs/src/sys/uvm/uvm_page.h,v diff -u -p -r1.73 uvm_page.h --- uvm/uvm_page.h 10 Mar 2025 18:54:38 -0000 1.73 +++ uvm/uvm_page.h 27 Mar 2026 12:28:08 -0000 @@ -228,6 +228,8 @@ void uvm_pageclean(struct vm_page *); void uvm_pagefree(struct vm_page *); void uvm_page_unbusy(struct vm_page **, int); struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t); +struct vm_page *uvm_pagerangefirst(struct uvm_object *, voff_t, voff_t); +struct vm_page *uvm_pagerangenext(struct vm_page *, voff_t); void uvm_pageunwire(struct vm_page *); void uvm_pagewait(struct vm_page *, struct rwlock *, const char *); void uvm_pagewire(struct vm_page *); Index: uvm/uvm_vnode.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v diff -u -p -r1.151 uvm_vnode.c --- uvm/uvm_vnode.c 29 Dec 2025 16:07:14 -0000 1.151 +++ uvm/uvm_vnode.c 27 Mar 2026 12:28:08 -0000 @@ -583,11 +583,10 @@ boolean_t uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) { struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; - struct vm_page *pp, *ptmp; + struct vm_page *pp, *npp, *ptmp; struct vm_page *pps[MAXBSIZE 
>> PAGE_SHIFT], **ppsp; int npages, result, lcv; boolean_t retval, need_iosync, needs_clean; - voff_t curoff; KASSERT(rw_write_held(uobj->vmobjlock)); @@ -614,19 +613,19 @@ uvn_flush(struct uvm_object *uobj, voff_ */ if ((flags & PGO_CLEANIT) != 0) { KASSERT(uobj->pgops->pgo_mk_pcluster != 0); - for (curoff = start ; curoff < stop; curoff += PAGE_SIZE) { - if ((pp = uvm_pagelookup(uobj, curoff)) != NULL) - atomic_clearbits_int(&pp->pg_flags, - PG_CLEANCHK); + for (pp = uvm_pagerangefirst(uobj, start, stop); pp != NULL; + pp = uvm_pagerangenext(pp, stop)) { + atomic_clearbits_int(&pp->pg_flags, PG_CLEANCHK); } } ppsp = NULL; /* XXX: shut up gcc */ uvm_lock_pageq(); + again: /* locked: both page queues */ - for (curoff = start; curoff < stop; curoff += PAGE_SIZE) { - if ((pp = uvm_pagelookup(uobj, curoff)) == NULL) - continue; + for (pp = uvm_pagerangefirst(uobj, start, stop); pp != NULL; pp = npp) { + npp = uvm_pagerangenext(pp, stop); + /* * handle case where we do not need to clean page (either * because we are not clean or because page is not dirty or @@ -668,13 +667,13 @@ uvn_flush(struct uvm_object *uobj, voff_ uvm_lock_pageq(); } else if (flags & PGO_FREE) { if (pp->pg_flags & PG_BUSY) { + start = pp->offset; uvm_unlock_pageq(); uvm_pagewait(pp, uobj->vmobjlock, "uvn_flsh"); rw_enter(uobj->vmobjlock, RW_WRITE); uvm_lock_pageq(); - curoff -= PAGE_SIZE; - continue; + goto again; } else { pmap_page_protect(pp, PROT_NONE); /* dequeue to prevent lock recursion */ @@ -696,6 +695,7 @@ ReTry: * let uvm_pager_put attempted a clustered page out. * note: locked: page queues. */ + start = pp->offset + PAGE_SIZE; atomic_setbits_int(&pp->pg_flags, PG_BUSY); UVM_PAGE_OWN(pp, "uvn_flush"); pmap_page_protect(pp, PROT_READ); @@ -803,6 +803,7 @@ ReTry: } } /* end of "lcv" for loop */ + goto again; } /* end of "pp" for loop */