From: Martin Pieuchot
Subject: Re: Make pdaemon understandable
To: Theo Buehler
Cc: sthen@openbsd.org, bluhm@openbsd.org, tech@openbsd.org
Date: Tue, 23 Dec 2025 12:54:34 +0100

On 22/12/25(Mon) 20:39, Theo Buehler wrote:
> On Mon, Dec 22, 2025 at 12:11:47PM +0100, Martin Pieuchot wrote:
> > Diff below has been generated with "-b" to be easily reviewable.
> > Attached is the one I intend to commit which mostly includes indent
> > changes.
> > 
> > It moves the final pageout of swap cluster outside of the loop.  This
> > allows to remove the NULL checks and make the body of the loop readable.
> > 
> > I intentionally didn't change anything *inside* the loop even if we now
> > see incoherency.
> > 
> 
> This reads mostly fine. One thing:
> 
> > @@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> > 
> >  	/* Insert iterator. */
> >  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
> > -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
> > -		/*
> > -		 * note that p can be NULL iff we have traversed the whole
> > -		 * list and need to do one final swap-backed clustered pageout.
> > -		 */
> > -		uobj = NULL;
> > -		anon = NULL;
> > -		if (p) {
> > +	while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL) {
> 
> I'm confused by this change. I would have expected a
> 
> 	do { ... } while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL);
> 
> loop. The first time around in the for loop things are applied to p
> itself, not to nextpg as returned by the iterator.

Thanks a lot.  You're right, otherwise we skip the first page.

Here's an updated version of the diff that keeps the for () loop for
now.

ok?

Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -b -r1.143 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
+++ uvm/uvm_pdaemon.c	23 Dec 2025 11:49:53 -0000
@@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
 	int result;
 
 	if (swc->swc_slot == 0)
-		return 0; // XXX
+		return 0;
 
 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
 	slot = swc->swc_slot;
@@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 	/* Insert iterator. */
 	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
-	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
-		/*
-		 * note that p can be NULL iff we have traversed the whole
-		 * list and need to do one final swap-backed clustered pageout.
-		 */
-		uobj = NULL;
-		anon = NULL;
-		if (p) {
+	for (; p != NULL; p = uvmpd_iterator(pglst, p, &iter)) {
 			/*
 			 * see if we've met our target
 			 */
@@ -648,16 +641,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 			    (uvmexp.paging + swapcluster_nused(&swc)
 			    >= (shortage - freed))) ||
 			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
-				if (swc.swc_slot == 0) {
-					/* exit now if no swap-i/o pending */
 					break;
 				}
-
-				/* set p to null to signal final swap i/o */
-				p = NULL;
-			}
-		}
-		if (p) {	/* if (we have a new page to consider) */
 			/*
 			 * we are below target and have a new page to consider.
 			 */
@@ -792,7 +777,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 			 * of the page so that no one touches it while it is
 			 * in I/O.
 			 */
-
 			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
 			atomic_setbits_int(&p->pg_flags, PG_BUSY);
 			UVM_PAGE_OWN(p, "scan_inactive");
@@ -809,8 +793,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 				/* start new cluster (if necessary) */
 				if (swapcluster_allocslots(&swc)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
+					atomic_clearbits_int(&p->pg_flags, PG_BUSY);
 					UVM_PAGE_OWN(p, NULL);
 					dirtyreacts++;
 					uvm_unlock_pageq();
@@ -822,8 +805,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 				/* add block to cluster */
 				if (swapcluster_add(&swc, p)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
+					atomic_clearbits_int(&p->pg_flags, PG_BUSY);
 					UVM_PAGE_OWN(p, NULL);
 					dirtyreacts++;
 					uvm_unlock_pageq();
@@ -838,10 +820,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 				if (swc.swc_nused < swc.swc_nallocated)
 					continue;
 			}
-		} else {
-			/* if p == NULL we must be doing a last swap i/o */
-			swap_backed = TRUE;
-		}
 
 		/*
 		 * now consider doing the pageout.
@@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 		}
 	}
 	TAILQ_REMOVE(pglst, &iter, pageq);
+
+	/* final swap-backed clustered pageout */
+	if (swc.swc_slot > 0) {
+		uvm_unlock_pageq();
+		npages = swc.swc_nused;
+		result = swapcluster_flush(&swc);
+		uvm_lock_pageq();
+		if (result == VM_PAGER_PEND) {
+			atomic_add_int(&uvmexp.paging, npages);
+			uvmexp.pdpending++;
+		}
+	}
 	return freed;
 }

Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.143 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
+++ uvm/uvm_pdaemon.c	23 Dec 2025 11:49:53 -0000
@@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
 	int result;
 
 	if (swc->swc_slot == 0)
-		return 0; // XXX
+		return 0;
 
 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
 	slot = swc->swc_slot;
@@ -633,141 +633,180 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 	/* Insert iterator. */
 	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
-	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
+	for (; p != NULL; p = uvmpd_iterator(pglst, p, &iter)) {
 		/*
-		 * note that p can be NULL iff we have traversed the whole
-		 * list and need to do one final swap-backed clustered pageout.
+		 * see if we've met our target
 		 */
-		uobj = NULL;
-		anon = NULL;
-		if (p) {
-			/*
-			 * see if we've met our target
-			 */
-			if ((uvmpd_pma_done(pma) &&
-			    (uvmexp.paging + swapcluster_nused(&swc)
-			    >= (shortage - freed))) ||
-			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
-				if (swc.swc_slot == 0) {
-					/* exit now if no swap-i/o pending */
-					break;
-				}
+		if ((uvmpd_pma_done(pma) &&
+		    (uvmexp.paging + swapcluster_nused(&swc)
+		    >= (shortage - freed))) ||
+		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
+			break;
+		}
+		/*
+		 * we are below target and have a new page to consider.
+		 */
+		uvmexp.pdscans++;
 
-				/* set p to null to signal final swap i/o */
-				p = NULL;
-			}
+		/*
+		 * If we are not short on memory and only interested
+		 * in releasing pages from a given memory range, do not
+		 * bother with other pages.
+		 */
+		if (uvmexp.paging >= (shortage - freed) &&
+		    !uvmpd_pma_done(pma) &&
+		    !uvmpd_match_constraint(p, &pma->pm_constraint))
+			continue;
+
+		anon = p->uanon;
+		uobj = p->uobject;
+
+		/*
+		 * first we attempt to lock the object that this page
+		 * belongs to. if our attempt fails we skip on to
+		 * the next page (no harm done). it is important to
+		 * "try" locking the object as we are locking in the
+		 * wrong order (pageq -> object) and we don't want to
+		 * deadlock.
+		 */
+		slock = uvmpd_trylockowner(p);
+		if (slock == NULL) {
+			continue;
 		}
-		if (p) {	/* if (we have a new page to consider) */
-			/*
-			 * we are below target and have a new page to consider.
-			 */
-			uvmexp.pdscans++;
-			/*
-			 * If we are not short on memory and only interested
-			 * in releasing pages from a given memory range, do not
-			 * bother with other pages.
-			 */
-			if (uvmexp.paging >= (shortage - freed) &&
-			    !uvmpd_pma_done(pma) &&
-			    !uvmpd_match_constraint(p, &pma->pm_constraint))
-				continue;
+		/*
+		 * move referenced pages back to active queue
+		 * and skip to next page.
+		 */
+		if (pmap_is_referenced(p)) {
+			uvm_unlock_pageq();
+			uvm_pageactivate(p);
+			rw_exit(slock);
+			uvm_lock_pageq();
+			uvmexp.pdreact++;
+			continue;
+		}
 
-			anon = p->uanon;
-			uobj = p->uobject;
+		if (p->pg_flags & PG_BUSY) {
+			rw_exit(slock);
+			uvmexp.pdbusy++;
+			continue;
+		}
 
-			/*
-			 * first we attempt to lock the object that this page
-			 * belongs to. if our attempt fails we skip on to
-			 * the next page (no harm done). it is important to
-			 * "try" locking the object as we are locking in the
-			 * wrong order (pageq -> object) and we don't want to
-			 * deadlock.
-			 */
-			slock = uvmpd_trylockowner(p);
-			if (slock == NULL) {
-				continue;
-			}
+		/* does the page belong to an object? */
+		if (uobj != NULL) {
+			uvmexp.pdobscan++;
+		} else {
+			KASSERT(anon != NULL);
+			uvmexp.pdanscan++;
+		}
 
-			/*
-			 * move referenced pages back to active queue
-			 * and skip to next page.
-			 */
-			if (pmap_is_referenced(p)) {
-				uvm_unlock_pageq();
-				uvm_pageactivate(p);
-				rw_exit(slock);
-				uvm_lock_pageq();
-				uvmexp.pdreact++;
-				continue;
+		/*
+		 * we now have the page queues locked.
+		 * the page is not busy. if the page is clean we
+		 * can free it now and continue.
+		 */
+		if (p->pg_flags & PG_CLEAN) {
+			if (p->pg_flags & PQ_SWAPBACKED) {
+				/* this page now lives only in swap */
+				atomic_inc_int(&uvmexp.swpgonly);
 			}
-			if (p->pg_flags & PG_BUSY) {
-				rw_exit(slock);
-				uvmexp.pdbusy++;
-				continue;
-			}
+			/* zap all mappings with pmap_page_protect... */
+			pmap_page_protect(p, PROT_NONE);
+			/* dequeue first to prevent lock recursion */
+			if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
+				uvm_pagedequeue(p);
+			uvm_pagefree(p);
+			freed++;
+
+			if (anon) {
+
+				/*
+				 * an anonymous page can only be clean
+				 * if it has backing store assigned.
+				 */
 
-			/* does the page belong to an object? */
-			if (uobj != NULL) {
-				uvmexp.pdobscan++;
-			} else {
-				KASSERT(anon != NULL);
-				uvmexp.pdanscan++;
-			}
+				KASSERT(anon->an_swslot != 0);
 
-			/*
-			 * we now have the page queues locked.
-			 * the page is not busy. if the page is clean we
-			 * can free it now and continue.
-			 */
-			if (p->pg_flags & PG_CLEAN) {
-				if (p->pg_flags & PQ_SWAPBACKED) {
-					/* this page now lives only in swap */
-					atomic_inc_int(&uvmexp.swpgonly);
-				}
-
-				/* zap all mappings with pmap_page_protect... */
-				pmap_page_protect(p, PROT_NONE);
-				/* dequeue first to prevent lock recursion */
-				if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
-					uvm_pagedequeue(p);
-				uvm_pagefree(p);
-				freed++;
-
-				if (anon) {
-
-					/*
-					 * an anonymous page can only be clean
-					 * if it has backing store assigned.
-					 */
-
-					KASSERT(anon->an_swslot != 0);
-
-					/* remove from object */
-					anon->an_page = NULL;
-				}
-				rw_exit(slock);
-				continue;
+				/* remove from object */
+				anon->an_page = NULL;
 			}
+			rw_exit(slock);
+			continue;
+		}
 
-			/*
-			 * this page is dirty, skip it if we'll have met our
-			 * free target when all the current pageouts complete.
-			 */
-			if (uvmpd_pma_done(pma) &&
-			    (uvmexp.paging > (shortage - freed))) {
+		/*
+		 * this page is dirty, skip it if we'll have met our
+		 * free target when all the current pageouts complete.
+		 */
+		if (uvmpd_pma_done(pma) &&
+		    (uvmexp.paging > (shortage - freed))) {
+			rw_exit(slock);
+			continue;
+		}
+
+		/*
+		 * this page is dirty, but we can't page it out
+		 * since all pages in swap are only in swap.
+		 * reactivate it so that we eventually cycle
+		 * all pages thru the inactive queue.
+		 */
+		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
+			dirtyreacts++;
+			uvm_unlock_pageq();
+			uvm_pageactivate(p);
+			rw_exit(slock);
+			uvm_lock_pageq();
+			continue;
+		}
+
+		/*
+		 * if the page is swap-backed and dirty and swap space
+		 * is full, free any swap allocated to the page
+		 * so that other pages can be paged out.
+		 */
+		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
+			uvmpd_dropswap(p);
+
+		/*
+		 * the page we are looking at is dirty. we must
+		 * clean it before it can be freed. to do this we
+		 * first mark the page busy so that no one else will
+		 * touch the page. we write protect all the mappings
+		 * of the page so that no one touches it while it is
+		 * in I/O.
+		 */
+		swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
+		atomic_setbits_int(&p->pg_flags, PG_BUSY);
+		UVM_PAGE_OWN(p, "scan_inactive");
+		pmap_page_protect(p, PROT_READ);
+		uvmexp.pgswapout++;
+
+		/*
+		 * for swap-backed pages we need to (re)allocate
+		 * swap space.
+		 */
+		if (swap_backed) {
+			/* free old swap slot (if any) */
+			uvmpd_dropswap(p);
+
+			/* start new cluster (if necessary) */
+			if (swapcluster_allocslots(&swc)) {
+				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
+				UVM_PAGE_OWN(p, NULL);
+				dirtyreacts++;
+				uvm_unlock_pageq();
+				uvm_pageactivate(p);
 				rw_exit(slock);
+				uvm_lock_pageq();
 				continue;
 			}
-			/*
-			 * this page is dirty, but we can't page it out
-			 * since all pages in swap are only in swap.
-			 * reactivate it so that we eventually cycle
-			 * all pages thru the inactive queue.
-			 */
-			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
+			/* add block to cluster */
+			if (swapcluster_add(&swc, p)) {
+				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
+				UVM_PAGE_OWN(p, NULL);
+				dirtyreacts++;
 				uvm_unlock_pageq();
 				uvm_pageactivate(p);
@@ -775,72 +814,11 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 				uvm_lock_pageq();
 				continue;
 			}
+			rw_exit(slock);
 
-			/*
-			 * if the page is swap-backed and dirty and swap space
-			 * is full, free any swap allocated to the page
-			 * so that other pages can be paged out.
-			 */
-			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
-				uvmpd_dropswap(p);
-
-			/*
-			 * the page we are looking at is dirty. we must
-			 * clean it before it can be freed. to do this we
-			 * first mark the page busy so that no one else will
-			 * touch the page. we write protect all the mappings
-			 * of the page so that no one touches it while it is
-			 * in I/O.
-			 */
-
-			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
-			atomic_setbits_int(&p->pg_flags, PG_BUSY);
-			UVM_PAGE_OWN(p, "scan_inactive");
-			pmap_page_protect(p, PROT_READ);
-			uvmexp.pgswapout++;
-
-			/*
-			 * for swap-backed pages we need to (re)allocate
-			 * swap space.
-			 */
-			if (swap_backed) {
-				/* free old swap slot (if any) */
-				uvmpd_dropswap(p);
-
-				/* start new cluster (if necessary) */
-				if (swapcluster_allocslots(&swc)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
-					UVM_PAGE_OWN(p, NULL);
-					dirtyreacts++;
-					uvm_unlock_pageq();
-					uvm_pageactivate(p);
-					rw_exit(slock);
-					uvm_lock_pageq();
-					continue;
-				}
-
-				/* add block to cluster */
-				if (swapcluster_add(&swc, p)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
-					UVM_PAGE_OWN(p, NULL);
-					dirtyreacts++;
-					uvm_unlock_pageq();
-					uvm_pageactivate(p);
-					rw_exit(slock);
-					uvm_lock_pageq();
-					continue;
-				}
-				rw_exit(slock);
-
-				/* cluster not full yet? */
-				if (swc.swc_nused < swc.swc_nallocated)
-					continue;
-			}
-		} else {
-			/* if p == NULL we must be doing a last swap i/o */
-			swap_backed = TRUE;
+			/* cluster not full yet? */
+			if (swc.swc_nused < swc.swc_nallocated)
+				continue;
 		}
 
 		/*
@@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 		}
 	}
 	TAILQ_REMOVE(pglst, &iter, pageq);
+
+	/* final swap-backed clustered pageout */
+	if (swc.swc_slot > 0) {
+		uvm_unlock_pageq();
+		npages = swc.swc_nused;
+		result = swapcluster_flush(&swc);
+		uvm_lock_pageq();
+		if (result == VM_PAGER_PEND) {
+			atomic_add_int(&uvmexp.paging, npages);
+			uvmexp.pdpending++;
+		}
+	}
 	return freed;
 }
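
P.S. For anyone skimming the loop discussion above, here is a tiny
standalone sketch (illustration only, not part of the diff; the toy
next() helper merely stands in for uvmpd_iterator()) of why the
while () form skips the first page while the for () form does not:
the while form advances the iterator before doing any work, the for
form works on p first and only advances at the end of each iteration.

/*
 * Illustration only -- userland toy, not kernel code.  next() returns
 * the element after p, or NULL once we run past the end of the array.
 */
#include <stdio.h>

static int pages[] = { 1, 2, 3 };
#define NPAGES	(sizeof(pages) / sizeof(pages[0]))

static int *
next(int *p)
{
	return (p + 1 < pages + NPAGES) ? p + 1 : NULL;
}

int
main(void)
{
	int *p;

	/* while ((p = next(p)) != NULL): advances first, page 1 is never seen. */
	p = pages;
	while ((p = next(p)) != NULL)
		printf("while form visits %d\n", *p);	/* 2, 3 */

	/* for (; p != NULL; p = next(p)): works on p, then advances. */
	for (p = pages; p != NULL; p = next(p))
		printf("for form visits %d\n", *p);	/* 1, 2, 3 */

	return 0;
}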