
From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: Make pdaemon understandable
To: Theo Buehler <tb@openbsd.org>, sthen@openbsd.org, tech@openbsd.org
Date: Tue, 23 Dec 2025 22:42:00 +0100

On Tue, Dec 23, 2025 at 12:54:34PM +0100, Martin Pieuchot wrote:
> On 22/12/25(Mon) 20:39, Theo Buehler wrote:
> > On Mon, Dec 22, 2025 at 12:11:47PM +0100, Martin Pieuchot wrote:
> > > Diff below has been generated with "-b" to be easily reviewable.
> > > Attached is the one I intend to commit which mostly includes indent
> > > changes.
> > > 
> > > It moves the final pageout of the swap cluster outside of the loop.  This
> > > allows removing the NULL checks and makes the body of the loop readable
> > > (see the sketch after the diffs below).
> > > 
> > > I intentionally didn't change anything *inside* the loop even though we
> > > now see some inconsistency.
> > > 
> > 
> > This reads mostly fine. One thing:
> > 
> > > @@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> > >  
> > >  	/* Insert iterator. */
> > >  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
> > > -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
> > > -		/*
> > > -		 * note that p can be NULL iff we have traversed the whole
> > > -		 * list and need to do one final swap-backed clustered pageout.
> > > -		 */
> > > -		uobj = NULL;
> > > -		anon = NULL;
> > > -		if (p) {
> > > +	while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL) {
> > 
> > I'm confused by this change. I would have expected a
> > 
> > do { ... } while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL);
> > 
> > loop. The first time around in the for loop, things are applied to p
> > itself, not to nextpg as returned by the iterator.
> 
> Thanks a lot.  You're right, otherwise we skip the first page.  Here's
> an updated version of the diff that keeps the for () loop for now.
> 
> ok?
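
For what it's worth, the difference between the two loop shapes boils
down to this (minimal sketch; next() and handle() are hypothetical
stand-ins, not the real uvmpd_iterator()):

	/* visits p itself first, then its successors */
	for (; p != NULL; p = next(p))
		handle(p);

	/* advances before the first iteration: p itself is skipped */
	while ((p = next(p)) != NULL)
		handle(p);

So the for () form keeps the original semantics for the page the caller
passed in.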

also passes regress

bluhm

> Index: uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> diff -u -p -b -r1.143 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
> +++ uvm/uvm_pdaemon.c	23 Dec 2025 11:49:53 -0000
> @@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
>  	int result;
>  
>  	if (swc->swc_slot == 0)
> -		return 0; // XXX
> +		return 0;
>  	KASSERT(swc->swc_nused <= swc->swc_nallocated);
>  
>  	slot = swc->swc_slot;
> @@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  
>  	/* Insert iterator. */
>  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
> -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
> -		/*
> -		 * note that p can be NULL iff we have traversed the whole
> -		 * list and need to do one final swap-backed clustered pageout.
> -		 */
> -		uobj = NULL;
> -		anon = NULL;
> -		if (p) {
> +	for (; p != NULL; p = uvmpd_iterator(pglst, p, &iter)) {
>  			/*
>  			 * see if we've met our target
>  			 */
> @@ -648,16 +641,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  			    (uvmexp.paging + swapcluster_nused(&swc)
>  			    >= (shortage - freed))) ||
>  			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
> -				if (swc.swc_slot == 0) {
> -					/* exit now if no swap-i/o pending */
>  					break;
>  				}
> -
> -				/* set p to null to signal final swap i/o */
> -				p = NULL;
> -			}
> -		}
> -		if (p) {	/* if (we have a new page to consider) */
>  			/*
>  			 * we are below target and have a new page to consider.
>  			 */
> @@ -792,7 +777,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  			 * of the page so that no one touches it while it is
>  			 * in I/O.
>  			 */
> -
>  			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
>  			atomic_setbits_int(&p->pg_flags, PG_BUSY);
>  			UVM_PAGE_OWN(p, "scan_inactive");
> @@ -809,8 +793,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  
>  				/* start new cluster (if necessary) */
>  				if (swapcluster_allocslots(&swc)) {
> -					atomic_clearbits_int(&p->pg_flags,
> -					    PG_BUSY);
> +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
>  					UVM_PAGE_OWN(p, NULL);
>  					dirtyreacts++;
>  					uvm_unlock_pageq();
> @@ -822,8 +805,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  
>  				/* add block to cluster */
>  				if (swapcluster_add(&swc, p)) {
> -					atomic_clearbits_int(&p->pg_flags,
> -					    PG_BUSY);
> +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
>  					UVM_PAGE_OWN(p, NULL);
>  					dirtyreacts++;
>  					uvm_unlock_pageq();
> @@ -838,10 +820,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  				if (swc.swc_nused < swc.swc_nallocated)
>  					continue;
>  			}
> -		} else {
> -			/* if p == NULL we must be doing a last swap i/o */
> -			swap_backed = TRUE;
> -		}
>  
>  		/*
>  		 * now consider doing the pageout.
> @@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  		}
>  	}
>  	TAILQ_REMOVE(pglst, &iter, pageq);
> +
> +	/* final swap-backed clustered pageout */
> +	if (swc.swc_slot > 0) {
> +		uvm_unlock_pageq();
> +		npages = swc.swc_nused;
> +		result = swapcluster_flush(&swc);
> +		uvm_lock_pageq();
> +		if (result == VM_PAGER_PEND) {
> +			atomic_add_int(&uvmexp.paging, npages);
> +			uvmexp.pdpending++;
> +		}
> +	}
>  
>  	return freed;
>  }

> Index: uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> diff -u -p -r1.143 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
> +++ uvm/uvm_pdaemon.c	23 Dec 2025 11:49:53 -0000
> @@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
>  	int result;
>  
>  	if (swc->swc_slot == 0)
> -		return 0; // XXX
> +		return 0;
>  	KASSERT(swc->swc_nused <= swc->swc_nallocated);
>  
>  	slot = swc->swc_slot;
> @@ -633,141 +633,180 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  
>  	/* Insert iterator. */
>  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
> -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
> +	for (; p != NULL; p = uvmpd_iterator(pglst, p, &iter)) {
>  		/*
> -		 * note that p can be NULL iff we have traversed the whole
> -		 * list and need to do one final swap-backed clustered pageout.
> +		 * see if we've met our target
>  		 */
> -		uobj = NULL;
> -		anon = NULL;
> -		if (p) {
> -			/*
> -			 * see if we've met our target
> -			 */
> -			if ((uvmpd_pma_done(pma) &&
> -			    (uvmexp.paging + swapcluster_nused(&swc)
> -			    >= (shortage - freed))) ||
> -			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
> -				if (swc.swc_slot == 0) {
> -					/* exit now if no swap-i/o pending */
> -					break;
> -				}
> +		if ((uvmpd_pma_done(pma) &&
> +		    (uvmexp.paging + swapcluster_nused(&swc)
> +		    >= (shortage - freed))) ||
> +		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
> +			break;
> +		}
> +		/*
> +		 * we are below target and have a new page to consider.
> +		 */
> +		uvmexp.pdscans++;
>  
> -				/* set p to null to signal final swap i/o */
> -				p = NULL;
> -			}
> +		/*
> +		 * If we are not short on memory and only interested
> +		 * in releasing pages from a given memory range, do not
> +		 * bother with other pages.
> +		 */
> +		if (uvmexp.paging >= (shortage - freed) &&
> +		    !uvmpd_pma_done(pma) &&
> +		    !uvmpd_match_constraint(p, &pma->pm_constraint))
> +			continue;
> +
> +		anon = p->uanon;
> +		uobj = p->uobject;
> +
> +		/*
> +		 * first we attempt to lock the object that this page
> +		 * belongs to.  if our attempt fails we skip on to
> +		 * the next page (no harm done).  it is important to
> +		 * "try" locking the object as we are locking in the
> +		 * wrong order (pageq -> object) and we don't want to
> +		 * deadlock.
> +		 */
> +		slock = uvmpd_trylockowner(p);
> +		if (slock == NULL) {
> +			continue;
>  		}
> -		if (p) {	/* if (we have a new page to consider) */
> -			/*
> -			 * we are below target and have a new page to consider.
> -			 */
> -			uvmexp.pdscans++;
>  
> -			/*
> -			 * If we are not short on memory and only interested
> -			 * in releasing pages from a given memory range, do not
> -			 * bother with other pages.
> -			 */
> -			if (uvmexp.paging >= (shortage - freed) &&
> -			    !uvmpd_pma_done(pma) &&
> -			    !uvmpd_match_constraint(p, &pma->pm_constraint))
> -				continue;
> +		/*
> +		 * move referenced pages back to active queue
> +		 * and skip to next page.
> +		 */
> +		if (pmap_is_referenced(p)) {
> +			uvm_unlock_pageq();
> +			uvm_pageactivate(p);
> +			rw_exit(slock);
> +			uvm_lock_pageq();
> +			uvmexp.pdreact++;
> +			continue;
> +		}
>  
> -			anon = p->uanon;
> -			uobj = p->uobject;
> +		if (p->pg_flags & PG_BUSY) {
> +			rw_exit(slock);
> +			uvmexp.pdbusy++;
> +			continue;
> +		}
>  
> -			/*
> -			 * first we attempt to lock the object that this page
> -			 * belongs to.  if our attempt fails we skip on to
> -			 * the next page (no harm done).  it is important to
> -			 * "try" locking the object as we are locking in the
> -			 * wrong order (pageq -> object) and we don't want to
> -			 * deadlock.
> -			 */
> -			slock = uvmpd_trylockowner(p);
> -			if (slock == NULL) {
> -				continue;
> -			}
> +		/* does the page belong to an object? */
> +		if (uobj != NULL) {
> +			uvmexp.pdobscan++;
> +		} else {
> +			KASSERT(anon != NULL);
> +			uvmexp.pdanscan++;
> +		}
>  
> -			/*
> -			 * move referenced pages back to active queue
> -			 * and skip to next page.
> -			 */
> -			if (pmap_is_referenced(p)) {
> -				uvm_unlock_pageq();
> -				uvm_pageactivate(p);
> -				rw_exit(slock);
> -				uvm_lock_pageq();
> -				uvmexp.pdreact++;
> -				continue;
> +		/*
> +		 * we now have the page queues locked.
> +		 * the page is not busy.   if the page is clean we
> +		 * can free it now and continue.
> +		 */
> +		if (p->pg_flags & PG_CLEAN) {
> +			if (p->pg_flags & PQ_SWAPBACKED) {
> +				/* this page now lives only in swap */
> +				atomic_inc_int(&uvmexp.swpgonly);
>  			}
>  
> -			if (p->pg_flags & PG_BUSY) {
> -				rw_exit(slock);
> -				uvmexp.pdbusy++;
> -				continue;
> -			}
> +			/* zap all mappings with pmap_page_protect... */
> +			pmap_page_protect(p, PROT_NONE);
> +			/* dequeue first to prevent lock recursion */
> +			if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
> +				uvm_pagedequeue(p);
> +			uvm_pagefree(p);
> +			freed++;
> +
> +			if (anon) {
> +
> +				/*
> +				 * an anonymous page can only be clean
> +				 * if it has backing store assigned.
> +				 */
>  
> -			/* does the page belong to an object? */
> -			if (uobj != NULL) {
> -				uvmexp.pdobscan++;
> -			} else {
> -				KASSERT(anon != NULL);
> -				uvmexp.pdanscan++;
> -			}
> +				KASSERT(anon->an_swslot != 0);
>  
> -			/*
> -			 * we now have the page queues locked.
> -			 * the page is not busy.   if the page is clean we
> -			 * can free it now and continue.
> -			 */
> -			if (p->pg_flags & PG_CLEAN) {
> -				if (p->pg_flags & PQ_SWAPBACKED) {
> -					/* this page now lives only in swap */
> -					atomic_inc_int(&uvmexp.swpgonly);
> -				}
> -
> -				/* zap all mappings with pmap_page_protect... */
> -				pmap_page_protect(p, PROT_NONE);
> -				/* dequeue first to prevent lock recursion */
> -				if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
> -					uvm_pagedequeue(p);
> -				uvm_pagefree(p);
> -				freed++;
> -
> -				if (anon) {
> -
> -					/*
> -					 * an anonymous page can only be clean
> -					 * if it has backing store assigned.
> -					 */
> -
> -					KASSERT(anon->an_swslot != 0);
> -
> -					/* remove from object */
> -					anon->an_page = NULL;
> -				}
> -				rw_exit(slock);
> -				continue;
> +				/* remove from object */
> +				anon->an_page = NULL;
>  			}
> +			rw_exit(slock);
> +			continue;
> +		}
>  
> -			/*
> -			 * this page is dirty, skip it if we'll have met our
> -			 * free target when all the current pageouts complete.
> -			 */
> -			if (uvmpd_pma_done(pma) &&
> -			    (uvmexp.paging > (shortage - freed))) {
> +		/*
> +		 * this page is dirty, skip it if we'll have met our
> +		 * free target when all the current pageouts complete.
> +		 */
> +		if (uvmpd_pma_done(pma) &&
> +		    (uvmexp.paging > (shortage - freed))) {
> +			rw_exit(slock);
> +			continue;
> +		}
> +
> +		/*
> +		 * this page is dirty, but we can't page it out
> +		 * since all pages in swap are only in swap.
> +		 * reactivate it so that we eventually cycle
> +		 * all pages thru the inactive queue.
> +		 */
> +		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
> +			dirtyreacts++;
> +			uvm_unlock_pageq();
> +			uvm_pageactivate(p);
> +			rw_exit(slock);
> +			uvm_lock_pageq();
> +			continue;
> +		}
> +
> +		/*
> +		 * if the page is swap-backed and dirty and swap space
> +		 * is full, free any swap allocated to the page
> +		 * so that other pages can be paged out.
> +		 */
> +		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
> +			uvmpd_dropswap(p);
> +
> +		/*
> +		 * the page we are looking at is dirty.   we must
> +		 * clean it before it can be freed.  to do this we
> +		 * first mark the page busy so that no one else will
> +		 * touch the page.   we write protect all the mappings
> +		 * of the page so that no one touches it while it is
> +		 * in I/O.
> +		 */
> +		swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
> +		atomic_setbits_int(&p->pg_flags, PG_BUSY);
> +		UVM_PAGE_OWN(p, "scan_inactive");
> +		pmap_page_protect(p, PROT_READ);
> +		uvmexp.pgswapout++;
> +
> +		/*
> +		 * for swap-backed pages we need to (re)allocate
> +		 * swap space.
> +		 */
> +		if (swap_backed) {
> +			/* free old swap slot (if any) */
> +			uvmpd_dropswap(p);
> +
> +			/* start new cluster (if necessary) */
> +			if (swapcluster_allocslots(&swc)) {
> +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
> +				UVM_PAGE_OWN(p, NULL);
> +				dirtyreacts++;
> +				uvm_unlock_pageq();
> +				uvm_pageactivate(p);
>  				rw_exit(slock);
> +				uvm_lock_pageq();
>  				continue;
>  			}
>  
> -			/*
> -			 * this page is dirty, but we can't page it out
> -			 * since all pages in swap are only in swap.
> -			 * reactivate it so that we eventually cycle
> -			 * all pages thru the inactive queue.
> -			 */
> -			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
> +			/* add block to cluster */
> +			if (swapcluster_add(&swc, p)) {
> +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
> +				UVM_PAGE_OWN(p, NULL);
>  				dirtyreacts++;
>  				uvm_unlock_pageq();
>  				uvm_pageactivate(p);
> @@ -775,72 +814,11 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  				uvm_lock_pageq();
>  				continue;
>  			}
> +			rw_exit(slock);
>  
> -			/*
> -			 * if the page is swap-backed and dirty and swap space
> -			 * is full, free any swap allocated to the page
> -			 * so that other pages can be paged out.
> -			 */
> -			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
> -				uvmpd_dropswap(p);
> -
> -			/*
> -			 * the page we are looking at is dirty.   we must
> -			 * clean it before it can be freed.  to do this we
> -			 * first mark the page busy so that no one else will
> -			 * touch the page.   we write protect all the mappings
> -			 * of the page so that no one touches it while it is
> -			 * in I/O.
> -			 */
> -
> -			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
> -			atomic_setbits_int(&p->pg_flags, PG_BUSY);
> -			UVM_PAGE_OWN(p, "scan_inactive");
> -			pmap_page_protect(p, PROT_READ);
> -			uvmexp.pgswapout++;
> -
> -			/*
> -			 * for swap-backed pages we need to (re)allocate
> -			 * swap space.
> -			 */
> -			if (swap_backed) {
> -				/* free old swap slot (if any) */
> -				uvmpd_dropswap(p);
> -
> -				/* start new cluster (if necessary) */
> -				if (swapcluster_allocslots(&swc)) {
> -					atomic_clearbits_int(&p->pg_flags,
> -					    PG_BUSY);
> -					UVM_PAGE_OWN(p, NULL);
> -					dirtyreacts++;
> -					uvm_unlock_pageq();
> -					uvm_pageactivate(p);
> -					rw_exit(slock);
> -					uvm_lock_pageq();
> -					continue;
> -				}
> -
> -				/* add block to cluster */
> -				if (swapcluster_add(&swc, p)) {
> -					atomic_clearbits_int(&p->pg_flags,
> -					    PG_BUSY);
> -					UVM_PAGE_OWN(p, NULL);
> -					dirtyreacts++;
> -					uvm_unlock_pageq();
> -					uvm_pageactivate(p);
> -					rw_exit(slock);
> -					uvm_lock_pageq();
> -					continue;
> -				}
> -				rw_exit(slock);
> -
> -				/* cluster not full yet? */
> -				if (swc.swc_nused < swc.swc_nallocated)
> -					continue;
> -			}
> -		} else {
> -			/* if p == NULL we must be doing a last swap i/o */
> -			swap_backed = TRUE;
> +			/* cluster not full yet? */
> +			if (swc.swc_nused < swc.swc_nallocated)
> +				continue;
>  		}
>  
>  		/*
> @@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  		}
>  	}
>  	TAILQ_REMOVE(pglst, &iter, pageq);
> +
> +	/* final swap-backed clustered pageout */
> +	if (swc.swc_slot > 0) {
> +		uvm_unlock_pageq();
> +		npages = swc.swc_nused;
> +		result = swapcluster_flush(&swc);
> +		uvm_lock_pageq();
> +		if (result == VM_PAGER_PEND) {
> +			atomic_add_int(&uvmexp.paging, npages);
> +			uvmexp.pdpending++;
> +		}
> +	}
>  
>  	return freed;
>  }
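
As a sketch of the resulting shape: the old loop kept running with
p == NULL as a sentinel to trigger one last swap-backed pageout from
inside the body; the new version does that flush once after the loop.
Roughly (hypothetical helper names, not the real swapcluster API):

	for (; p != NULL; p = next(p)) {
		if (target_met())
			break;
		/* ... queue dirty swap-backed pages into the cluster ... */
		if (!cluster_full())
			continue;
		/* full clusters are still paged out inside the loop */
		pageout(&cluster);
	}

	/* whatever is left in the cluster goes out in one final pageout */
	if (!cluster_empty())
		pageout(&cluster);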