
From: Martin Pieuchot <mpi@grenadille.net>
Subject: Re: Make pdaemon understandable
To: Theo Buehler <tb@openbsd.org>
Cc: sthen@openbsd.org, bluhm@openbsd.org, tech@openbsd.org
Date: Tue, 23 Dec 2025 12:54:34 +0100

On 22/12/25(Mon) 20:39, Theo Buehler wrote:
> On Mon, Dec 22, 2025 at 12:11:47PM +0100, Martin Pieuchot wrote:
> > The diff below has been generated with "-b" to make it easy to review.
> > Attached is the one I intend to commit, which additionally contains the
> > indent changes.
> > 
> > It moves the final pageout of the swap cluster outside of the loop.  This
> > allows removing the NULL checks and makes the body of the loop readable.
> > 
> > I intentionally didn't change anything *inside* the loop even though some
> > inconsistencies are now visible.
> > 
> 
> This reads mostly fine. One thing:
> 
> > @@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> >  
> >  	/* Insert iterator. */
> >  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
> > -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
> > -		/*
> > -		 * note that p can be NULL iff we have traversed the whole
> > -		 * list and need to do one final swap-backed clustered pageout.
> > -		 */
> > -		uobj = NULL;
> > -		anon = NULL;
> > -		if (p) {
> > +	while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL) {
> 
> I'm confused by this change. I would have expected a
> 
> do { ... } while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL);
> 
> loop. The first time around in the for loop things are applied to p
> itself, not to nextpg as returned by the iterator.

Thanks a lot.  You're right, otherwise we skip the first page.  Here's
an updated version of the diff that keeps the for () loop for now.
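
To make the off-by-one concrete, below is a small userland sketch (plain C,
compiles on its own) of a marker-based walk in the spirit of uvmpd_iterator().
next_after_marker() is a made-up simplification of the real iterator, which
also has to skip other markers and survive queue churn, but the ordering
effect is the same: with the for () form the loop body sees the first entry,
with the while () form it never does.

#include <sys/queue.h>
#include <stdio.h>

struct node {
	TAILQ_ENTRY(node) link;
	int id;			/* 0 = iterator marker, >0 = real entry */
};
TAILQ_HEAD(nodelist, node);

/* Return the entry following the marker and advance the marker past it. */
static struct node *
next_after_marker(struct nodelist *list, struct node *marker)
{
	struct node *n = TAILQ_NEXT(marker, link);

	if (n == NULL)
		return NULL;
	TAILQ_REMOVE(list, marker, link);
	TAILQ_INSERT_AFTER(list, n, marker, link);
	return n;
}

int
main(void)
{
	struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
	struct node entries[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };
	struct node marker = { .id = 0 };
	struct node *p;
	int i;

	for (i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&list, &entries[i], link);

	/* Start on the first entry and insert the marker after it. */
	p = TAILQ_FIRST(&list);
	TAILQ_INSERT_AFTER(&list, p, &marker, link);

	/* for () form: the body runs for entry 1 first, then 2 and 3. */
	for (; p != NULL; p = next_after_marker(&list, &marker))
		printf("for   visits %d\n", p->id);

	/* Reset, then the while () form: entry 1 is never visited. */
	TAILQ_REMOVE(&list, &marker, link);
	p = TAILQ_FIRST(&list);
	TAILQ_INSERT_AFTER(&list, p, &marker, link);
	while ((p = next_after_marker(&list, &marker)) != NULL)
		printf("while visits %d\n", p->id);

	return 0;
}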

ok?
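
For reference, the structural effect of the change quoted above (the final
swap-backed pageout moved out of the loop) is just the usual batch-and-flush
shape: the loop only ever deals with real pages, and whatever is left in a
partially filled cluster is written out once after the loop, so the p == NULL
sentinel pass is no longer needed.  A standalone sketch of that shape, with
made-up names (batch, flush) standing in for the swapcluster code:

#include <stdio.h>

#define BATCHSZ	4

struct batch {
	int nused;
	int slots[BATCHSZ];
};

static void
flush(struct batch *b)
{
	int i;

	if (b->nused == 0)		/* nothing pending */
		return;
	printf("flushing %d item(s):", b->nused);
	for (i = 0; i < b->nused; i++)
		printf(" %d", b->slots[i]);
	printf("\n");
	b->nused = 0;
}

int
main(void)
{
	struct batch b = { 0 };
	int item;

	for (item = 1; item <= 10; item++) {
		b.slots[b.nused++] = item;
		/* cluster not full yet? */
		if (b.nused < BATCHSZ)
			continue;
		flush(&b);
	}

	/* final partial flush, outside the loop */
	flush(&b);
	return 0;
}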

Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -b -r1.143 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
+++ uvm/uvm_pdaemon.c	23 Dec 2025 11:49:53 -0000
@@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
 	int result;
 
 	if (swc->swc_slot == 0)
-		return 0; // XXX
+		return 0;
 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
 
 	slot = swc->swc_slot;
@@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 	/* Insert iterator. */
 	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
-	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
-		/*
-		 * note that p can be NULL iff we have traversed the whole
-		 * list and need to do one final swap-backed clustered pageout.
-		 */
-		uobj = NULL;
-		anon = NULL;
-		if (p) {
+	for (; p != NULL; p = uvmpd_iterator(pglst, p, &iter)) {
 			/*
 			 * see if we've met our target
 			 */
@@ -648,16 +641,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 			    (uvmexp.paging + swapcluster_nused(&swc)
 			    >= (shortage - freed))) ||
 			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
-				if (swc.swc_slot == 0) {
-					/* exit now if no swap-i/o pending */
 					break;
 				}
-
-				/* set p to null to signal final swap i/o */
-				p = NULL;
-			}
-		}
-		if (p) {	/* if (we have a new page to consider) */
 			/*
 			 * we are below target and have a new page to consider.
 			 */
@@ -792,7 +777,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 			 * of the page so that no one touches it while it is
 			 * in I/O.
 			 */
-
 			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
 			atomic_setbits_int(&p->pg_flags, PG_BUSY);
 			UVM_PAGE_OWN(p, "scan_inactive");
@@ -809,8 +793,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 				/* start new cluster (if necessary) */
 				if (swapcluster_allocslots(&swc)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
+				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
 					UVM_PAGE_OWN(p, NULL);
 					dirtyreacts++;
 					uvm_unlock_pageq();
@@ -822,8 +805,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 				/* add block to cluster */
 				if (swapcluster_add(&swc, p)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
+				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
 					UVM_PAGE_OWN(p, NULL);
 					dirtyreacts++;
 					uvm_unlock_pageq();
@@ -838,10 +820,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 				if (swc.swc_nused < swc.swc_nallocated)
 					continue;
 			}
-		} else {
-			/* if p == NULL we must be doing a last swap i/o */
-			swap_backed = TRUE;
-		}
 
 		/*
 		 * now consider doing the pageout.
@@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 		}
 	}
 	TAILQ_REMOVE(pglst, &iter, pageq);
+
+	/* final swap-backed clustered pageout */
+	if (swc.swc_slot > 0) {
+		uvm_unlock_pageq();
+		npages = swc.swc_nused;
+		result = swapcluster_flush(&swc);
+		uvm_lock_pageq();
+		if (result == VM_PAGER_PEND) {
+			atomic_add_int(&uvmexp.paging, npages);
+			uvmexp.pdpending++;
+		}
+	}
 
 	return freed;
 }
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.143 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
+++ uvm/uvm_pdaemon.c	23 Dec 2025 11:49:53 -0000
@@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
 	int result;
 
 	if (swc->swc_slot == 0)
-		return 0; // XXX
+		return 0;
 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
 
 	slot = swc->swc_slot;
@@ -633,141 +633,180 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 
 	/* Insert iterator. */
 	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
-	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
+	for (; p != NULL; p = uvmpd_iterator(pglst, p, &iter)) {
 		/*
-		 * note that p can be NULL iff we have traversed the whole
-		 * list and need to do one final swap-backed clustered pageout.
+		 * see if we've met our target
 		 */
-		uobj = NULL;
-		anon = NULL;
-		if (p) {
-			/*
-			 * see if we've met our target
-			 */
-			if ((uvmpd_pma_done(pma) &&
-			    (uvmexp.paging + swapcluster_nused(&swc)
-			    >= (shortage - freed))) ||
-			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
-				if (swc.swc_slot == 0) {
-					/* exit now if no swap-i/o pending */
-					break;
-				}
+		if ((uvmpd_pma_done(pma) &&
+		    (uvmexp.paging + swapcluster_nused(&swc)
+		    >= (shortage - freed))) ||
+		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
+			break;
+		}
+		/*
+		 * we are below target and have a new page to consider.
+		 */
+		uvmexp.pdscans++;
 
-				/* set p to null to signal final swap i/o */
-				p = NULL;
-			}
+		/*
+		 * If we are not short on memory and only interested
+		 * in releasing pages from a given memory range, do not
+		 * bother with other pages.
+		 */
+		if (uvmexp.paging >= (shortage - freed) &&
+		    !uvmpd_pma_done(pma) &&
+		    !uvmpd_match_constraint(p, &pma->pm_constraint))
+			continue;
+
+		anon = p->uanon;
+		uobj = p->uobject;
+
+		/*
+		 * first we attempt to lock the object that this page
+		 * belongs to.  if our attempt fails we skip on to
+		 * the next page (no harm done).  it is important to
+		 * "try" locking the object as we are locking in the
+		 * wrong order (pageq -> object) and we don't want to
+		 * deadlock.
+		 */
+		slock = uvmpd_trylockowner(p);
+		if (slock == NULL) {
+			continue;
 		}
-		if (p) {	/* if (we have a new page to consider) */
-			/*
-			 * we are below target and have a new page to consider.
-			 */
-			uvmexp.pdscans++;
 
-			/*
-			 * If we are not short on memory and only interested
-			 * in releasing pages from a given memory range, do not
-			 * bother with other pages.
-			 */
-			if (uvmexp.paging >= (shortage - freed) &&
-			    !uvmpd_pma_done(pma) &&
-			    !uvmpd_match_constraint(p, &pma->pm_constraint))
-				continue;
+		/*
+		 * move referenced pages back to active queue
+		 * and skip to next page.
+		 */
+		if (pmap_is_referenced(p)) {
+			uvm_unlock_pageq();
+			uvm_pageactivate(p);
+			rw_exit(slock);
+			uvm_lock_pageq();
+			uvmexp.pdreact++;
+			continue;
+		}
 
-			anon = p->uanon;
-			uobj = p->uobject;
+		if (p->pg_flags & PG_BUSY) {
+			rw_exit(slock);
+			uvmexp.pdbusy++;
+			continue;
+		}
 
-			/*
-			 * first we attempt to lock the object that this page
-			 * belongs to.  if our attempt fails we skip on to
-			 * the next page (no harm done).  it is important to
-			 * "try" locking the object as we are locking in the
-			 * wrong order (pageq -> object) and we don't want to
-			 * deadlock.
-			 */
-			slock = uvmpd_trylockowner(p);
-			if (slock == NULL) {
-				continue;
-			}
+		/* does the page belong to an object? */
+		if (uobj != NULL) {
+			uvmexp.pdobscan++;
+		} else {
+			KASSERT(anon != NULL);
+			uvmexp.pdanscan++;
+		}
 
-			/*
-			 * move referenced pages back to active queue
-			 * and skip to next page.
-			 */
-			if (pmap_is_referenced(p)) {
-				uvm_unlock_pageq();
-				uvm_pageactivate(p);
-				rw_exit(slock);
-				uvm_lock_pageq();
-				uvmexp.pdreact++;
-				continue;
+		/*
+		 * we now have the page queues locked.
+		 * the page is not busy.   if the page is clean we
+		 * can free it now and continue.
+		 */
+		if (p->pg_flags & PG_CLEAN) {
+			if (p->pg_flags & PQ_SWAPBACKED) {
+				/* this page now lives only in swap */
+				atomic_inc_int(&uvmexp.swpgonly);
 			}
 
-			if (p->pg_flags & PG_BUSY) {
-				rw_exit(slock);
-				uvmexp.pdbusy++;
-				continue;
-			}
+			/* zap all mappings with pmap_page_protect... */
+			pmap_page_protect(p, PROT_NONE);
+			/* dequeue first to prevent lock recursion */
+			if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
+				uvm_pagedequeue(p);
+			uvm_pagefree(p);
+			freed++;
+
+			if (anon) {
+
+				/*
+				 * an anonymous page can only be clean
+				 * if it has backing store assigned.
+				 */
 
-			/* does the page belong to an object? */
-			if (uobj != NULL) {
-				uvmexp.pdobscan++;
-			} else {
-				KASSERT(anon != NULL);
-				uvmexp.pdanscan++;
-			}
+				KASSERT(anon->an_swslot != 0);
 
-			/*
-			 * we now have the page queues locked.
-			 * the page is not busy.   if the page is clean we
-			 * can free it now and continue.
-			 */
-			if (p->pg_flags & PG_CLEAN) {
-				if (p->pg_flags & PQ_SWAPBACKED) {
-					/* this page now lives only in swap */
-					atomic_inc_int(&uvmexp.swpgonly);
-				}
-
-				/* zap all mappings with pmap_page_protect... */
-				pmap_page_protect(p, PROT_NONE);
-				/* dequeue first to prevent lock recursion */
-				if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
-					uvm_pagedequeue(p);
-				uvm_pagefree(p);
-				freed++;
-
-				if (anon) {
-
-					/*
-					 * an anonymous page can only be clean
-					 * if it has backing store assigned.
-					 */
-
-					KASSERT(anon->an_swslot != 0);
-
-					/* remove from object */
-					anon->an_page = NULL;
-				}
-				rw_exit(slock);
-				continue;
+				/* remove from object */
+				anon->an_page = NULL;
 			}
+			rw_exit(slock);
+			continue;
+		}
 
-			/*
-			 * this page is dirty, skip it if we'll have met our
-			 * free target when all the current pageouts complete.
-			 */
-			if (uvmpd_pma_done(pma) &&
-			    (uvmexp.paging > (shortage - freed))) {
+		/*
+		 * this page is dirty, skip it if we'll have met our
+		 * free target when all the current pageouts complete.
+		 */
+		if (uvmpd_pma_done(pma) &&
+		    (uvmexp.paging > (shortage - freed))) {
+			rw_exit(slock);
+			continue;
+		}
+
+		/*
+		 * this page is dirty, but we can't page it out
+		 * since all pages in swap are only in swap.
+		 * reactivate it so that we eventually cycle
+		 * all pages thru the inactive queue.
+		 */
+		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
+			dirtyreacts++;
+			uvm_unlock_pageq();
+			uvm_pageactivate(p);
+			rw_exit(slock);
+			uvm_lock_pageq();
+			continue;
+		}
+
+		/*
+		 * if the page is swap-backed and dirty and swap space
+		 * is full, free any swap allocated to the page
+		 * so that other pages can be paged out.
+		 */
+		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
+			uvmpd_dropswap(p);
+
+		/*
+		 * the page we are looking at is dirty.   we must
+		 * clean it before it can be freed.  to do this we
+		 * first mark the page busy so that no one else will
+		 * touch the page.   we write protect all the mappings
+		 * of the page so that no one touches it while it is
+		 * in I/O.
+		 */
+		swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
+		atomic_setbits_int(&p->pg_flags, PG_BUSY);
+		UVM_PAGE_OWN(p, "scan_inactive");
+		pmap_page_protect(p, PROT_READ);
+		uvmexp.pgswapout++;
+
+		/*
+		 * for swap-backed pages we need to (re)allocate
+		 * swap space.
+		 */
+		if (swap_backed) {
+			/* free old swap slot (if any) */
+			uvmpd_dropswap(p);
+
+			/* start new cluster (if necessary) */
+			if (swapcluster_allocslots(&swc)) {
+				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
+				UVM_PAGE_OWN(p, NULL);
+				dirtyreacts++;
+				uvm_unlock_pageq();
+				uvm_pageactivate(p);
 				rw_exit(slock);
+				uvm_lock_pageq();
 				continue;
 			}
 
-			/*
-			 * this page is dirty, but we can't page it out
-			 * since all pages in swap are only in swap.
-			 * reactivate it so that we eventually cycle
-			 * all pages thru the inactive queue.
-			 */
-			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
+			/* add block to cluster */
+			if (swapcluster_add(&swc, p)) {
+				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
+				UVM_PAGE_OWN(p, NULL);
 				dirtyreacts++;
 				uvm_unlock_pageq();
 				uvm_pageactivate(p);
@@ -775,72 +814,11 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 				uvm_lock_pageq();
 				continue;
 			}
+			rw_exit(slock);
 
-			/*
-			 * if the page is swap-backed and dirty and swap space
-			 * is full, free any swap allocated to the page
-			 * so that other pages can be paged out.
-			 */
-			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
-				uvmpd_dropswap(p);
-
-			/*
-			 * the page we are looking at is dirty.   we must
-			 * clean it before it can be freed.  to do this we
-			 * first mark the page busy so that no one else will
-			 * touch the page.   we write protect all the mappings
-			 * of the page so that no one touches it while it is
-			 * in I/O.
-			 */
-
-			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
-			atomic_setbits_int(&p->pg_flags, PG_BUSY);
-			UVM_PAGE_OWN(p, "scan_inactive");
-			pmap_page_protect(p, PROT_READ);
-			uvmexp.pgswapout++;
-
-			/*
-			 * for swap-backed pages we need to (re)allocate
-			 * swap space.
-			 */
-			if (swap_backed) {
-				/* free old swap slot (if any) */
-				uvmpd_dropswap(p);
-
-				/* start new cluster (if necessary) */
-				if (swapcluster_allocslots(&swc)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
-					UVM_PAGE_OWN(p, NULL);
-					dirtyreacts++;
-					uvm_unlock_pageq();
-					uvm_pageactivate(p);
-					rw_exit(slock);
-					uvm_lock_pageq();
-					continue;
-				}
-
-				/* add block to cluster */
-				if (swapcluster_add(&swc, p)) {
-					atomic_clearbits_int(&p->pg_flags,
-					    PG_BUSY);
-					UVM_PAGE_OWN(p, NULL);
-					dirtyreacts++;
-					uvm_unlock_pageq();
-					uvm_pageactivate(p);
-					rw_exit(slock);
-					uvm_lock_pageq();
-					continue;
-				}
-				rw_exit(slock);
-
-				/* cluster not full yet? */
-				if (swc.swc_nused < swc.swc_nallocated)
-					continue;
-			}
-		} else {
-			/* if p == NULL we must be doing a last swap i/o */
-			swap_backed = TRUE;
+			/* cluster not full yet? */
+			if (swc.swc_nused < swc.swc_nallocated)
+				continue;
 		}
 
 		/*
@@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
 		}
 	}
 	TAILQ_REMOVE(pglst, &iter, pageq);
+
+	/* final swap-backed clustered pageout */
+	if (swc.swc_slot > 0) {
+		uvm_unlock_pageq();
+		npages = swc.swc_nused;
+		result = swapcluster_flush(&swc);
+		uvm_lock_pageq();
+		if (result == VM_PAGER_PEND) {
+			atomic_add_int(&uvmexp.paging, npages);
+			uvmexp.pdpending++;
+		}
+	}
 
 	return freed;
 }