From: Martin Pieuchot
Subject: Consolidate building & cleaning swap clusters
To: bluhm@openbsd.org, sthen@openbsd.org, tb@openbsd.org
Cc: tech@openbsd.org
Date: Fri, 5 Dec 2025 17:25:58 +0100

Diff below is a cleanup that untangles the swapping code from the
"pager layer".  It also simplifies error handling in the page daemon.
This is a necessary step towards a more efficient swapping process.

It includes:

- Make the page daemon call uvm_swap_put() directly with the cluster it
  just built.

- Let it call uvm_swap_free() on transient failures to unobfuscate the
  code (symmetry with uvm_swap_alloc()).

- Rename uvm_aio_aiodone_pages() to uvm_swap_dropcluster() and
  consolidate it with the logic from uvm_pager_dropcluster() that
  applies to swap-backed pages.

- Use uvm_swap_dropcluster() instead of uvm_pager_dropcluster() to clean
  up clusters built by the page daemon.  Note that with the current page
  daemon design, pages are never freed at this stage.

- Simplify uvm_pager_put() and uvm_pager_dropcluster() by removing all
  swap-related code.  These functions are now only used for vnode-backed
  pages.

- Clean all pages related to a given UVM object in
  uvm_pager_dropcluster().  Since the page daemon uses an iterator to
  select which page needs to be recycled from the inactive LRU, there is
  no real reason to return the last one.

- Get rid of the custom & ugly error cleanup after uvm_pager_put().

A rough sketch of the resulting swap-backed write path is appended
after the diff.

ok?

Index: uvm/uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
diff -u -p -r1.95 uvm_pager.c
--- uvm/uvm_pager.c	13 Nov 2025 11:06:13 -0000	1.95
+++ uvm/uvm_pager.c	5 Dec 2025 15:41:28 -0000
@@ -448,22 +448,17 @@ uvm_mk_pcluster(struct uvm_object *uobj,
  * possible.
  *
  * => page queues must be locked by caller
- * => if page is not swap-backed, then "uobj" points to the object
- *	backing it.
- * => if page is swap-backed, then "uobj" should be NULL.
+ * => "uobj" points to the object backing it.
  * => "pg" should be PG_BUSY (by caller), and !PG_CLEAN
- *	for swap-backed memory, "pg" can be NULL if there is no page
- *	of interest [sometimes the case for the pagedaemon]
  * => "ppsp_ptr" should point to an array of npages vm_page pointers
  *	for possible cluster building
- * => flags (first two for non-swap-backed pages)
+ * => flags
  *	PGO_ALLPAGES: all pages in uobj are valid targets
  *	PGO_DOACTCLUST: include "PQ_ACTIVE" pages as valid targets
  *	PGO_SYNCIO: do SYNC I/O (no async)
  *	PGO_PDFREECLUST: pagedaemon: drop cluster on successful I/O
  *	PGO_FREE: tell the aio daemon to free pages in the async case.
- * => start/stop: if (uobj && !PGO_ALLPAGES) limit targets to this range
- *	if (!uobj) start is the (daddr_t) of the starting swapblk
+ * => start/stop: if !PGO_ALLPAGES limit targets to this range
  * => return state:
  *	1. we return the VM_PAGER status code of the pageout
  *	2. we return with the page queues unlocked
@@ -480,40 +475,24 @@ uvm_pager_put(struct uvm_object *uobj, s
     voff_t start, voff_t stop)
 {
 	int result;
-	daddr_t swblk;
 	struct vm_page **ppsp = *ppsp_ptr;
 
 	/*
-	 * note that uobj is null if we are doing a swap-backed pageout.
-	 * note that uobj is !null if we are doing normal object pageout.
	 * note that the page queues must be locked to cluster.
	 */
-	if (uobj) {	/* if !swap-backed */
-		/*
-		 * attempt to build a cluster for pageout using its
-		 * make-put-cluster function (if it has one).
-		 */
-		if (uobj->pgops->pgo_mk_pcluster) {
-			ppsp = uobj->pgops->pgo_mk_pcluster(uobj, ppsp,
-			    npages, pg, flags, start, stop);
-			*ppsp_ptr = ppsp;	/* update caller's pointer */
-		} else {
-			ppsp[0] = pg;
-			*npages = 1;
-		}
+	KASSERT(uobj != NULL && pg != NULL);
 
-		swblk = 0;		/* XXX: keep gcc happy */
+	/*
+	 * attempt to build a cluster for pageout using its
+	 * make-put-cluster function (if it has one).
+	 */
+	if (uobj->pgops->pgo_mk_pcluster) {
+		ppsp = uobj->pgops->pgo_mk_pcluster(uobj, ppsp,
+		    npages, pg, flags, start, stop);
+		*ppsp_ptr = ppsp;	/* update caller's pointer */
 	} else {
-		/*
-		 * for swap-backed pageout, the caller (the pagedaemon) has
-		 * already built the cluster for us.  the starting swap
-		 * block we are writing to has been passed in as "start."
-		 * "pg" could be NULL if there is no page we are especially
-		 * interested in (in which case the whole cluster gets dropped
-		 * in the event of an error or a sync "done").
-		 */
-		swblk = start;
-		/* ppsp and npages should be ok */
+		ppsp[0] = pg;
+		*npages = 1;
 	}
 
 	/* now that we've clustered we can unlock the page queues */
@@ -523,12 +502,7 @@ uvm_pager_put(struct uvm_object *uobj, s
	 * now attempt the I/O.  if we have a failure and we are
	 * clustered, we will drop the cluster and try again.
	 */
-	if (uobj) {
-		result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
-	} else {
-		/* XXX daddr_t -> int */
-		result = uvm_swap_put(swblk, ppsp, *npages, flags);
-	}
+	result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
 
 	/*
	 * we have attempted the I/O.
@@ -545,63 +519,18 @@ uvm_pager_put(struct uvm_object *uobj, s
 	if (result == VM_PAGER_PEND || result == VM_PAGER_OK) {
 		if (result == VM_PAGER_OK && (flags & PGO_PDFREECLUST)) {
 			/* drop cluster */
-			if (*npages > 1 || pg == NULL)
-				uvm_pager_dropcluster(uobj, pg, ppsp, npages,
-				    PGO_PDFREECLUST);
+			uvm_pager_dropcluster(uobj, ppsp, npages,
+			    PGO_PDFREECLUST);
 		}
 		return (result);
 	}
 
 	/*
	 * a pager error occurred (even after dropping the cluster, if there
-	 * was one).  give up!  the caller only has one page ("pg")
-	 * to worry about.
+	 * was one).
	 */
-	if (*npages > 1 || pg == NULL) {
-		uvm_pager_dropcluster(uobj, pg, ppsp, npages, PGO_REALLOCSWAP);
+	uvm_pager_dropcluster(uobj, ppsp, npages, 0);
 
-		/*
-		 * for failed swap-backed pageouts with a "pg",
-		 * we need to reset pg's swslot to either:
-		 * "swblk" (for transient errors, so we can retry),
-		 * or 0 (for hard errors).
-		 */
-		if (uobj == NULL) {
-			if (pg != NULL) {
-				if (pg->pg_flags & PQ_ANON) {
-					rw_enter(pg->uanon->an_lock, RW_WRITE);
-					pg->uanon->an_swslot = 0;
-					rw_exit(pg->uanon->an_lock);
-				} else {
-					rw_enter(pg->uobject->vmobjlock, RW_WRITE);
-					uao_set_swslot(pg->uobject,
-					    pg->offset >> PAGE_SHIFT, 0);
-					rw_exit(pg->uobject->vmobjlock);
-				}
-			}
-			/*
-			 * for transient failures, free all the swslots
-			 */
-			if (result == VM_PAGER_AGAIN) {
-				/* XXX daddr_t -> int */
-				uvm_swap_free(swblk, *npages);
-			} else {
-				/*
-				 * for hard errors on swap-backed pageouts,
-				 * mark the swslots as bad.  note that we do not
-				 * free swslots that we mark bad.
-				 */
-				/* XXX daddr_t -> int */
-				uvm_swap_markbad(swblk, *npages);
-			}
-		}
-	}
-
-	/*
-	 * a pager error occurred (even after dropping the cluster, if there
-	 * was one).  give up!  the caller only has one page ("pg")
-	 * to worry about.
-	 */
 	return result;
 }
 
@@ -610,51 +539,25 @@ uvm_pager_put(struct uvm_object *uobj, s
  * uvm_pager_dropcluster: drop a cluster we have built (because we
  * got an error, or, if PGO_PDFREECLUST we are un-busying the
  * cluster pages on behalf of the pagedaemon).
  *
- * => uobj, if non-null, is a non-swap-backed object
+ * => uobj is a non-swap-backed object
  * => page queues are not locked
  * => pg is our page of interest (the one we clustered around, can be null)
  * => ppsp/npages is our current cluster
  * => flags: PGO_PDFREECLUST: pageout was a success: un-busy cluster
  *	pages on behalf of the pagedaemon.
- *	PGO_REALLOCSWAP: drop previously allocated swap slots for
- *		clustered swap-backed pages (except for "pg" if !NULL)
- *		"swblk" is the start of swap alloc (e.g. for ppsp[0])
- *		[only meaningful if swap-backed (uobj == NULL)]
  */
-
 void
-uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg,
-    struct vm_page **ppsp, int *npages, int flags)
+uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page **ppsp,
+    int *npages, int flags)
 {
 	int lcv;
 
-	KASSERT(uobj == NULL || rw_write_held(uobj->vmobjlock));
+	KASSERT(rw_write_held(uobj->vmobjlock));
 
-	/* drop all pages but "pg" */
 	for (lcv = 0 ; lcv < *npages ; lcv++) {
-		/* skip "pg" or empty slot */
-		if (ppsp[lcv] == pg || ppsp[lcv] == NULL)
+		/* skip empty slot */
+		if (ppsp[lcv] == NULL)
 			continue;
-
-		/*
-		 * Note that PQ_ANON bit can't change as long as we are holding
-		 * the PG_BUSY bit (so there is no need to lock the page
-		 * queues to test it).
-		 */
-		if (!uobj) {
-			if (ppsp[lcv]->pg_flags & PQ_ANON) {
-				rw_enter(ppsp[lcv]->uanon->an_lock, RW_WRITE);
-				if (flags & PGO_REALLOCSWAP)
-					/* zap swap block */
-					ppsp[lcv]->uanon->an_swslot = 0;
-			} else {
-				rw_enter(ppsp[lcv]->uobject->vmobjlock,
-				    RW_WRITE);
-				if (flags & PGO_REALLOCSWAP)
-					uao_set_swslot(ppsp[lcv]->uobject,
-					    ppsp[lcv]->offset >> PAGE_SHIFT, 0);
-			}
-		}
 
 		/* did someone want the page while we had it busy-locked? */
 		if (ppsp[lcv]->pg_flags & PG_WANTED) {
@@ -686,14 +589,6 @@ uvm_pager_dropcluster(struct uvm_object
 			pmap_clear_modify(ppsp[lcv]);
 			atomic_setbits_int(&ppsp[lcv]->pg_flags, PG_CLEAN);
 		}
-
-		/* if anonymous cluster, unlock object and move on */
-		if (!uobj) {
-			if (ppsp[lcv]->pg_flags & PQ_ANON)
-				rw_exit(ppsp[lcv]->uanon->an_lock);
-			else
-				rw_exit(ppsp[lcv]->uobject->vmobjlock);
-		}
 	}
 }
 
@@ -718,39 +613,25 @@ uvm_aio_biodone(struct buf *bp)
 	mtx_leave(&uvm.aiodoned_lock);
 }
 
-void
-uvm_aio_aiodone_pages(struct vm_page **pgs, int npages, boolean_t write,
-    int error)
+/*
+ * uvm_swap_dropcluster: drop a cluster we have built (because we
+ * got an error, or, we are un-busying the cluster pages on behalf
+ * of the pagedaemon).
+ */
+int
+uvm_swap_dropcluster(struct vm_page **pgs, int npages, int error)
 {
 	struct vm_page *pg;
-	struct rwlock *slock;
-	boolean_t swap;
-	int i, swslot;
-
-	slock = NULL;
-	pg = pgs[0];
-	swap = (pg->uanon != NULL && pg->uobject == NULL) ||
-	    (pg->pg_flags & PQ_AOBJ) != 0;
-
-	KASSERT(swap);
-	KASSERT(write);
-
-	if (error) {
-		if (pg->uobject != NULL) {
-			swslot = uao_find_swslot(pg->uobject,
-			    pg->offset >> PAGE_SHIFT);
-		} else {
-			swslot = pg->uanon->an_swslot;
-		}
-		KASSERT(swslot);
-	}
+	struct rwlock *slock = NULL;
+	int i, slot = -1;
 
 	for (i = 0; i < npages; i++) {
 		int anon_disposed = 0;
 
 		pg = pgs[i];
 		KASSERT((pg->pg_flags & PG_FAKE) == 0);
-
+		KASSERT((pg->uanon != NULL && pg->uobject == NULL) ||
+		    (pg->pg_flags & PQ_AOBJ) != 0);
 		/*
		 * lock each page's object (or anon) individually since
		 * each page may need a different lock.
@@ -760,14 +641,37 @@ uvm_aio_aiodone_pages(struct vm_page **p
 		} else {
 			slock = pg->uanon->an_lock;
 		}
+
 		rw_enter(slock, RW_WRITE);
+		if (error) {
+			/* for hard failures, return the first slot. */
+			if (error != ENOMEM && i == 0) {
+				if (pg->uobject != NULL) {
+					slot = uao_find_swslot(pg->uobject,
+					    pg->offset >> PAGE_SHIFT);
+				} else {
+					slot = pg->uanon->an_swslot;
+				}
+				KASSERT(slot);
+			}
+			/*
+			 * for failed swap-backed pageouts we need to
+			 * reset pg's swslot to 0.
+			 */
+			if (pg->uobject != NULL)
+				uao_set_swslot(pg->uobject,
+				    pg->offset >> PAGE_SHIFT, 0);
+			else
+				pg->uanon->an_swslot = 0;
+		}
+
 		anon_disposed = (pg->pg_flags & PG_RELEASED) != 0;
 		KASSERT(!anon_disposed || pg->uobject != NULL ||
 		    pg->uanon->an_ref == 0);
 
 		/*
-		 * if this was a successful write,
-		 * mark the page PG_CLEAN.
+		 * if we are operating on behalf of the pagedaemon and
+		 * we had a successful pageout, update the page!
		 */
 		if (!error) {
 			pmap_clear_reference(pg);
@@ -775,20 +679,16 @@ uvm_aio_aiodone_pages(struct vm_page **p
 			atomic_setbits_int(&pg->pg_flags, PG_CLEAN);
 		}
 
-		/*
-		 * unlock everything for this page now.
-		 */
 		if (pg->uobject == NULL && anon_disposed) {
 			uvm_anon_release(pg->uanon);
+			continue;
 		} else {
 			uvm_page_unbusy(&pg, 1);
-			rw_exit(slock);
 		}
+		rw_exit(slock);
 	}
 
-	if (error) {
-		uvm_swap_markbad(swslot, npages);
-	}
+	return slot;
 }
 
 /*
@@ -800,15 +700,13 @@ uvm_aio_aiodone(struct buf *bp)
 {
 	int npages = bp->b_bufsize >> PAGE_SHIFT;
 	struct vm_page *pgs[MAXPHYS >> PAGE_SHIFT];
-	int i, error;
-	boolean_t write;
+	int i, error, slot;
 
 	KASSERT(npages <= MAXPHYS >> PAGE_SHIFT);
+	KASSERT((bp->b_flags & B_READ) == 0);
 	splassert(IPL_BIO);
 
-	error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0;
-	write = (bp->b_flags & B_READ) == 0;
-
+	error = (bp->b_flags & B_ERROR) ? EIO : 0;
 	for (i = 0; i < npages; i++)
 		pgs[i] = uvm_atopg((vaddr_t)bp->b_data +
 		    ((vsize_t)i << PAGE_SHIFT));
@@ -823,8 +721,15 @@ uvm_aio_aiodone(struct buf *bp)
 	}
 #endif /* UVM_SWAP_ENCRYPT */
 
-	uvm_aio_aiodone_pages(pgs, npages, write, error);
-
+	slot = uvm_swap_dropcluster(pgs, npages, error);
+	/*
+	 * for hard errors on swap-backed pageouts, mark the swslots as
+	 * bad.  note that we do not free swslots that we mark bad.
+	 */
+	if (error) {
+		/* XXX daddr_t -> int */
+		uvm_swap_markbad(slot, npages);
+	}
 #ifdef UVM_SWAP_ENCRYPT
 freed:
 #endif
Index: uvm/uvm_pager.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.h,v
diff -u -p -r1.33 uvm_pager.h
--- uvm/uvm_pager.h	12 Oct 2021 07:38:22 -0000	1.33
+++ uvm/uvm_pager.h	5 Dec 2025 15:37:44 -0000
@@ -110,7 +110,6 @@ struct uvm_pagerops {
 #define PGO_DOACTCLUST	0x020	/* flag to mk_pcluster to include active */
 #define PGO_LOCKED	0x040	/* fault data structures are locked [get] */
 #define PGO_PDFREECLUST	0x080	/* daemon's free cluster flag [uvm_pager_put] */
-#define PGO_REALLOCSWAP	0x100	/* reallocate swap area [pager_dropcluster] */
 #define PGO_NOWAIT	0x200	/* do not wait for inode lock */
 
 /* page we are not interested in getting */
@@ -120,8 +119,8 @@ struct uvm_pagerops {
  * prototypes
  */
 
-void		uvm_pager_dropcluster(struct uvm_object *, struct vm_page *,
-		    struct vm_page **, int *, int);
+void		uvm_pager_dropcluster(struct uvm_object *, struct vm_page **,
+		    int *, int);
 void		uvm_pager_init(void);
 int		uvm_pager_put(struct uvm_object *, struct vm_page *,
 		    struct vm_page ***, int *, int, voff_t, voff_t);
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.139 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	3 Dec 2025 09:47:44 -0000	1.139
+++ uvm/uvm_pdaemon.c	5 Dec 2025 15:37:44 -0000
@@ -477,15 +477,14 @@ swapcluster_add(struct swapcluster *swc,
 	return 0;
 }
 
-void
+int
 swapcluster_flush(struct swapcluster *swc)
 {
-	int slot;
-	int nused;
-	int nallocated;
+	int slot, nused, nallocated;
+	int result;
 
 	if (swc->swc_slot == 0)
-		return;
+		return 0; // XXX
 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
 
 	slot = swc->swc_slot;
@@ -494,6 +493,24 @@ swapcluster_flush(struct swapcluster *sw
 
 	if (nused < nallocated)
 		uvm_swap_free(slot + nused, nallocated - nused);
+
+	uvmexp.pdpageouts++;
+	result = uvm_swap_put(slot, swc->swc_pages, nused, 0);
+	if (result != VM_PAGER_PEND) {
+		KASSERT(result == VM_PAGER_AGAIN);
+		uvm_swap_dropcluster(swc->swc_pages, nused, ENOMEM);
+		/* for transient failures, free all the swslots */
+		/* XXX daddr_t -> int */
+		uvm_swap_free(slot, nused);
+	}
+
+	/*
+	 * zero swslot to indicate that we are
+	 * no longer building a swap-backed cluster.
+	 */
+	swapcluster_init(swc);
+
+	return result;
 }
 
 static inline int
@@ -589,7 +606,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 	struct rwlock *slock;
 	struct vm_anon *anon;
 	boolean_t swap_backed;
-	vaddr_t start;
 	int dirtyreacts;
 
 	/*
@@ -822,155 +838,40 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
		 * now consider doing the pageout.
		 *
		 * for swap-backed pages, we do the pageout if we have either
-		 * filled the cluster (in which case (swnpages == swcpages) or
-		 * run out of pages (p == NULL).
+		 * filled the cluster or run out of pages.
		 *
		 * for object pages, we always do the pageout.
		 */
+		uvmexp.pdpageouts++;
 		if (swap_backed) {
+			uvm_unlock_pageq();
 			/* starting I/O now... set up for it */
 			npages = swc.swc_nused;
-			ppsp = swc.swc_pages;
-			/* for swap-backed pages only */
-			start = (vaddr_t) swc.swc_slot;
-
-			/* if this is final pageout we could have a few
-			 * extra swap blocks */
-			swapcluster_flush(&swc);
+			result = swapcluster_flush(&swc);
 		} else {
 			/* normal object pageout */
 			ppsp = pps;
-			npages = sizeof(pps) / sizeof(struct vm_page *);
-			/* not looked at because PGO_ALLPAGES is set */
-			start = 0;
-		}
+			npages = nitems(pps);
 
-		/*
-		 * now do the pageout.
-		 *
-		 * for swap_backed pages we have already built the cluster.
-		 * for !swap_backed pages, uvm_pager_put will call the object's
-		 * "make put cluster" function to build a cluster on our behalf.
-		 *
-		 * we pass the PGO_PDFREECLUST flag to uvm_pager_put to instruct
-		 * it to free the cluster pages for us on a successful I/O (it
-		 * always does this for un-successful I/O requests).  this
-		 * allows us to do clustered pageout without having to deal
-		 * with cluster pages at this level.
-		 *
-		 * note locking semantics of uvm_pager_put with PGO_PDFREECLUST:
-		 *  IN: locked: page queues
-		 * OUT: locked:
-		 *     !locked: pageqs
-		 */
-
-		uvmexp.pdpageouts++;
-		result = uvm_pager_put(swap_backed ? NULL : uobj, p,
-		    &ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0);
-
-		/*
-		 * if we did i/o to swap, zero swslot to indicate that we are
-		 * no longer building a swap-backed cluster.
-		 */
-		if (swap_backed)
-			swapcluster_init(&swc);	/* done with this cluster */
+			/*
+			 * uvm_pager_put() will call the object's "make put
+			 * cluster" function to build a cluster on our behalf.
+			 * we pass the PGO_PDFREECLUST flag to uvm_pager_put()
+			 * to instruct it to free the cluster pages for us on
+			 * a successful I/O (it always does this for un-
+			 * successful I/O requests).  this allows us to do
+			 * clustered pageout without having to deal with
+			 * cluster pages at this level.
+			 */
+			result = uvm_pager_put(uobj, p, &ppsp, &npages,
+			    PGO_ALLPAGES|PGO_PDFREECLUST, 0, 0);
+			rw_exit(slock);
+		}
 
-		/*
-		 * first, we check for VM_PAGER_PEND which means that the
-		 * async I/O is in progress and the async I/O done routine
-		 * will clean up after us.  in this case we move on to the
-		 * next page.
-		 */
+		uvm_lock_pageq();
 		if (result == VM_PAGER_PEND) {
 			atomic_add_int(&uvmexp.paging, npages);
-			uvm_lock_pageq();
 			uvmexp.pdpending++;
-			continue;
-		}
-
-		/* clean up "p" if we have one */
-		if (p) {
-			/*
-			 * the I/O request to "p" is done and uvm_pager_put
-			 * has freed any cluster pages it may have allocated
-			 * during I/O.  all that is left for us to do is
-			 * clean up page "p" (which is still PG_BUSY).
-			 *
-			 * our result could be one of the following:
-			 *   VM_PAGER_OK: successful pageout
-			 *
-			 *   VM_PAGER_AGAIN: tmp resource shortage, we skip
-			 *	to next page
-			 *   VM_PAGER_{FAIL,ERROR,BAD}: an error.  we
-			 *	"reactivate" page to get it out of the way (it
-			 *	will eventually drift back into the inactive
-			 *	queue for a retry).
-			 *   VM_PAGER_UNLOCK: should never see this as it is
-			 *	only valid for "get" operations
-			 */
-
-			/* relock p's object: page queues not lock yet, so
-			 * no need for "try" */
-
-			/* !swap_backed case: already locked... */
-			if (swap_backed) {
-				rw_enter(slock, RW_WRITE);
-			}
-
-#ifdef DIAGNOSTIC
-			if (result == VM_PAGER_UNLOCK)
-				panic("pagedaemon: pageout returned "
-				    "invalid 'unlock' code");
-#endif
-
-			/* handle PG_WANTED now */
-			if (p->pg_flags & PG_WANTED)
-				wakeup(p);
-
-			atomic_clearbits_int(&p->pg_flags, PG_BUSY|PG_WANTED);
-			UVM_PAGE_OWN(p, NULL);
-
-			/* released during I/O? Can only happen for anons */
-			if (p->pg_flags & PG_RELEASED) {
-				KASSERT(anon != NULL);
-				/*
-				 * remove page so we can get nextpg,
-				 * also zero out anon so we don't use
-				 * it after the free.
-				 */
-				anon->an_page = NULL;
-				p->uanon = NULL;
-
-				uvm_anfree(anon);	/* kills anon */
-				pmap_page_protect(p, PROT_NONE);
-				anon = NULL;
-				uvm_lock_pageq();
-				/* dequeue first to prevent lock recursion */
-				uvm_pagedequeue(p);
-				/* free released page */
-				uvm_pagefree(p);
-			} else {	/* page was not released during I/O */
-				uvm_lock_pageq();
-				if (result != VM_PAGER_OK) {
-					/* pageout was a failure... */
-					if (result != VM_PAGER_AGAIN)
-						uvm_pageactivate(p);
-					pmap_clear_reference(p);
-				} else {
-					/* pageout was a success... */
-					pmap_clear_reference(p);
-					pmap_clear_modify(p);
-					atomic_setbits_int(&p->pg_flags,
-					    PG_CLEAN);
-				}
-			}
-			rw_exit(slock);
-		} else {
-			/*
-			 * lock page queues here just so they're always locked
-			 * at the end of the loop.
-			 */
-			uvm_lock_pageq();
-		}
 		}
 
 	TAILQ_REMOVE(pglst, &iter, pageq);
Index: uvm/uvm_swap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.c,v
diff -u -p -r1.176 uvm_swap.c
--- uvm/uvm_swap.c	20 Sep 2025 13:53:36 -0000	1.176
+++ uvm/uvm_swap.c	5 Dec 2025 15:37:44 -0000
@@ -1831,8 +1831,7 @@ uvm_swap_io(struct vm_page **pps, int st
 
 		/* dispose of pages we dont use anymore */
 		opages = npages;
-		uvm_pager_dropcluster(NULL, NULL, pps, &opages,
-		    PGO_PDFREECLUST);
+		uvm_swap_dropcluster(pps, opages, 0);
 		kva = bouncekva;
 	}
 
Index: uvm/uvm_swap.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_swap.h,v
diff -u -p -r1.20 uvm_swap.h
--- uvm/uvm_swap.h	27 Oct 2023 19:18:53 -0000	1.20
+++ uvm/uvm_swap.h	5 Dec 2025 15:37:44 -0000
@@ -37,6 +37,7 @@
 
 #ifdef _KERNEL
 
+int			uvm_swap_dropcluster(struct vm_page **, int, int);
 int			uvm_swap_get(struct vm_page *, int, int);
 int			uvm_swap_put(int, struct vm_page **, int, int);
 int			uvm_swap_alloc(int *, boolean_t);
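
Appendix for reviewers: the rough sketch promised above.  It shows the
swap-backed write path after this diff.  example_pageout() is a
hypothetical wrapper for illustration only (locking, page selection and
swap encryption are elided); swapcluster_add(), swapcluster_flush(),
uvm_swap_dropcluster(), uvm_swap_free(), uvm_swap_markbad() and the
VM_PAGER_* values are the real ones touched by the diff.

/*
 * Illustrative sketch, not part of the diff.
 */
int
example_pageout(struct swapcluster *swc, struct vm_page *pg)
{
	int result;

	/*
	 * The page daemon accumulates busy, dirty pages into the
	 * cluster; swapcluster_add() returns 0 on success.
	 */
	if (swapcluster_add(swc, pg) != 0)
		return VM_PAGER_AGAIN;	/* e.g. no swap slot available */

	/*
	 * swapcluster_flush() now calls uvm_swap_put() itself.  It
	 * returns VM_PAGER_PEND when the async I/O is in flight; in
	 * that case uvm_aio_aiodone() will later call
	 * uvm_swap_dropcluster() to unbusy the pages and, on hard
	 * errors, uvm_swap_markbad() on the slot it returns.
	 *
	 * On VM_PAGER_AGAIN (transient failure) the pages have already
	 * been unbusied via uvm_swap_dropcluster(..., ENOMEM) and the
	 * slots released with uvm_swap_free(), the symmetric undo of
	 * the earlier uvm_swap_alloc().
	 */
	result = swapcluster_flush(swc);

	return result;
}

The point of the restructuring is visible here: all swap-slot
bookkeeping now lives behind the uvm_swap_*() calls, and
uvm_pager_put() no longer needs to know about swap at all.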