
From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: Make pdaemon understandable
To: tech@openbsd.org
Date: Mon, 22 Dec 2025 22:53:03 +0100



    On Mon, Dec 22, 2025 at 12:11:47PM +0100, Martin Pieuchot wrote:
    > The diff below has been generated with "-b" to be easily reviewable.
    > Attached is the one I intend to commit, which mostly includes indent
    > changes.
    > 
    > It moves the final pageout of the swap cluster outside of the loop.
    > This allows removing the NULL checks and makes the body of the loop
    > readable.
    > 
    > I intentionally didn't change anything *inside* the loop even though
    > some incoherence is now visible.
    > 
    > ok?
    
    this diff passed regress with witness on i386
    
    bluhm
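
    The shape of the change, reduced to a skeleton (a simplified sketch,
    not the literal kernel code; the pageout-target test is abbreviated
    here as met_target):

	/* old: the loop also ran with p == NULL to do the final swap i/o */
	for (; p != NULL || swc.swc_slot != 0;
	    p = uvmpd_iterator(pglst, p, &iter)) {
		/* ... NULL checks on p throughout the body ... */
	}

	/* new: the loop only ever sees real pages */
	while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL) {
		if (met_target)
			break;
		/* examine p, possibly adding it to the swap cluster */
	}
	TAILQ_REMOVE(pglst, &iter, pageq);

	/*
	 * the final swap-backed clustered pageout now happens once,
	 * after the loop, instead of via the p == NULL special case
	 */
	if (swc.swc_slot > 0) {
		uvm_unlock_pageq();
		result = swapcluster_flush(&swc);
		uvm_lock_pageq();
	}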
    
    > Index: uvm/uvm_pdaemon.c
    > ===================================================================
    > RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
    > diff -u -p -b -r1.143 uvm_pdaemon.c
    > --- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
    > +++ uvm/uvm_pdaemon.c	22 Dec 2025 11:03:02 -0000
    > @@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
    >  	int result;
    >  
    >  	if (swc->swc_slot == 0)
    > -		return 0; // XXX
    > +		return 0;
    >  	KASSERT(swc->swc_nused <= swc->swc_nallocated);
    >  
    >  	slot = swc->swc_slot;
    > @@ -633,14 +633,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  
    >  	/* Insert iterator. */
    >  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
    > -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
    > -		/*
    > -		 * note that p can be NULL iff we have traversed the whole
    > -		 * list and need to do one final swap-backed clustered pageout.
    > -		 */
    > -		uobj = NULL;
    > -		anon = NULL;
    > -		if (p) {
    > +	while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL) {
    >  			/*
    >  			 * see if we've met our target
    >  			 */
    > @@ -648,16 +641,8 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  			    (uvmexp.paging + swapcluster_nused(&swc)
    >  			    >= (shortage - freed))) ||
    >  			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
    > -				if (swc.swc_slot == 0) {
    > -					/* exit now if no swap-i/o pending */
    >  					break;
    >  				}
    > -
    > -				/* set p to null to signal final swap i/o */
    > -				p = NULL;
    > -			}
    > -		}
    > -		if (p) {	/* if (we have a new page to consider) */
    >  			/*
    >  			 * we are below target and have a new page to consider.
    >  			 */
    > @@ -792,7 +777,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  			 * of the page so that no one touches it while it is
    >  			 * in I/O.
    >  			 */
    > -
    >  			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
    >  			atomic_setbits_int(&p->pg_flags, PG_BUSY);
    >  			UVM_PAGE_OWN(p, "scan_inactive");
    > @@ -809,8 +793,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  
    >  				/* start new cluster (if necessary) */
    >  				if (swapcluster_allocslots(&swc)) {
    > -					atomic_clearbits_int(&p->pg_flags,
    > -					    PG_BUSY);
    > +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
    >  					UVM_PAGE_OWN(p, NULL);
    >  					dirtyreacts++;
    >  					uvm_unlock_pageq();
    > @@ -822,8 +805,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  
    >  				/* add block to cluster */
    >  				if (swapcluster_add(&swc, p)) {
    > -					atomic_clearbits_int(&p->pg_flags,
    > -					    PG_BUSY);
    > +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
    >  					UVM_PAGE_OWN(p, NULL);
    >  					dirtyreacts++;
    >  					uvm_unlock_pageq();
    > @@ -838,10 +820,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  				if (swc.swc_nused < swc.swc_nallocated)
    >  					continue;
    >  			}
    > -		} else {
    > -			/* if p == NULL we must be doing a last swap i/o */
    > -			swap_backed = TRUE;
    > -		}
    >  
    >  		/*
    >  		 * now consider doing the pageout.
    > @@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  		}
    >  	}
    >  	TAILQ_REMOVE(pglst, &iter, pageq);
    > +
    > +	/* final swap-backed clustered pageout */
    > +	if (swc.swc_slot > 0) {
    > +		uvm_unlock_pageq();
    > +		npages = swc.swc_nused;
    > +		result = swapcluster_flush(&swc);
    > +		uvm_lock_pageq();
    > +		if (result == VM_PAGER_PEND) {
    > +			atomic_add_int(&uvmexp.paging, npages);
    > +			uvmexp.pdpending++;
    > +		}
    > +	}
    >  
    >  	return freed;
    >  }
    
    > Index: uvm/uvm_pdaemon.c
    > ===================================================================
    > RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
    > diff -u -p -r1.143 uvm_pdaemon.c
    > --- uvm/uvm_pdaemon.c	18 Dec 2025 16:50:42 -0000	1.143
    > +++ uvm/uvm_pdaemon.c	22 Dec 2025 11:03:02 -0000
    > @@ -484,7 +484,7 @@ swapcluster_flush(struct swapcluster *sw
    >  	int result;
    >  
    >  	if (swc->swc_slot == 0)
    > -		return 0; // XXX
    > +		return 0;
    >  	KASSERT(swc->swc_nused <= swc->swc_nallocated);
    >  
    >  	slot = swc->swc_slot;
    > @@ -633,141 +633,180 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  
    >  	/* Insert iterator. */
    >  	TAILQ_INSERT_AFTER(pglst, p, &iter, pageq);
    > -	for (; p != NULL || swc.swc_slot != 0; p = uvmpd_iterator(pglst, p, &iter)) {
    > +	while ((p = uvmpd_iterator(pglst, p, &iter)) != NULL) {
    >  		/*
    > -		 * note that p can be NULL iff we have traversed the whole
    > -		 * list and need to do one final swap-backed clustered pageout.
    > +		 * see if we've met our target
    >  		 */
    > -		uobj = NULL;
    > -		anon = NULL;
    > -		if (p) {
    > -			/*
    > -			 * see if we've met our target
    > -			 */
    > -			if ((uvmpd_pma_done(pma) &&
    > -			    (uvmexp.paging + swapcluster_nused(&swc)
    > -			    >= (shortage - freed))) ||
    > -			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
    > -				if (swc.swc_slot == 0) {
    > -					/* exit now if no swap-i/o pending */
    > -					break;
    > -				}
    > +		if ((uvmpd_pma_done(pma) &&
    > +		    (uvmexp.paging + swapcluster_nused(&swc)
    > +		    >= (shortage - freed))) ||
    > +		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
    > +			break;
    > +		}
    > +		/*
    > +		 * we are below target and have a new page to consider.
    > +		 */
    > +		uvmexp.pdscans++;
    >  
    > -				/* set p to null to signal final swap i/o */
    > -				p = NULL;
    > -			}
    > +		/*
    > +		 * If we are not short on memory and only interested
    > +		 * in releasing pages from a given memory range, do not
    > +		 * bother with other pages.
    > +		 */
    > +		if (uvmexp.paging >= (shortage - freed) &&
    > +		    !uvmpd_pma_done(pma) &&
    > +		    !uvmpd_match_constraint(p, &pma->pm_constraint))
    > +			continue;
    > +
    > +		anon = p->uanon;
    > +		uobj = p->uobject;
    > +
    > +		/*
    > +		 * first we attempt to lock the object that this page
    > +		 * belongs to.  if our attempt fails we skip on to
    > +		 * the next page (no harm done).  it is important to
    > +		 * "try" locking the object as we are locking in the
    > +		 * wrong order (pageq -> object) and we don't want to
    > +		 * deadlock.
    > +		 */
    > +		slock = uvmpd_trylockowner(p);
    > +		if (slock == NULL) {
    > +			continue;
    >  		}
    > -		if (p) {	/* if (we have a new page to consider) */
    > -			/*
    > -			 * we are below target and have a new page to consider.
    > -			 */
    > -			uvmexp.pdscans++;
    >  
    > -			/*
    > -			 * If we are not short on memory and only interested
    > -			 * in releasing pages from a given memory range, do not
    > -			 * bother with other pages.
    > -			 */
    > -			if (uvmexp.paging >= (shortage - freed) &&
    > -			    !uvmpd_pma_done(pma) &&
    > -			    !uvmpd_match_constraint(p, &pma->pm_constraint))
    > -				continue;
    > +		/*
    > +		 * move referenced pages back to active queue
    > +		 * and skip to next page.
    > +		 */
    > +		if (pmap_is_referenced(p)) {
    > +			uvm_unlock_pageq();
    > +			uvm_pageactivate(p);
    > +			rw_exit(slock);
    > +			uvm_lock_pageq();
    > +			uvmexp.pdreact++;
    > +			continue;
    > +		}
    >  
    > -			anon = p->uanon;
    > -			uobj = p->uobject;
    > +		if (p->pg_flags & PG_BUSY) {
    > +			rw_exit(slock);
    > +			uvmexp.pdbusy++;
    > +			continue;
    > +		}
    >  
    > -			/*
    > -			 * first we attempt to lock the object that this page
    > -			 * belongs to.  if our attempt fails we skip on to
    > -			 * the next page (no harm done).  it is important to
    > -			 * "try" locking the object as we are locking in the
    > -			 * wrong order (pageq -> object) and we don't want to
    > -			 * deadlock.
    > -			 */
    > -			slock = uvmpd_trylockowner(p);
    > -			if (slock == NULL) {
    > -				continue;
    > -			}
    > +		/* does the page belong to an object? */
    > +		if (uobj != NULL) {
    > +			uvmexp.pdobscan++;
    > +		} else {
    > +			KASSERT(anon != NULL);
    > +			uvmexp.pdanscan++;
    > +		}
    >  
    > -			/*
    > -			 * move referenced pages back to active queue
    > -			 * and skip to next page.
    > -			 */
    > -			if (pmap_is_referenced(p)) {
    > -				uvm_unlock_pageq();
    > -				uvm_pageactivate(p);
    > -				rw_exit(slock);
    > -				uvm_lock_pageq();
    > -				uvmexp.pdreact++;
    > -				continue;
    > +		/*
    > +		 * we now have the page queues locked.
    > +		 * the page is not busy.   if the page is clean we
    > +		 * can free it now and continue.
    > +		 */
    > +		if (p->pg_flags & PG_CLEAN) {
    > +			if (p->pg_flags & PQ_SWAPBACKED) {
    > +				/* this page now lives only in swap */
    > +				atomic_inc_int(&uvmexp.swpgonly);
    >  			}
    >  
    > -			if (p->pg_flags & PG_BUSY) {
    > -				rw_exit(slock);
    > -				uvmexp.pdbusy++;
    > -				continue;
    > -			}
    > +			/* zap all mappings with pmap_page_protect... */
    > +			pmap_page_protect(p, PROT_NONE);
    > +			/* dequeue first to prevent lock recursion */
    > +			if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
    > +				uvm_pagedequeue(p);
    > +			uvm_pagefree(p);
    > +			freed++;
    > +
    > +			if (anon) {
    > +
    > +				/*
    > +				 * an anonymous page can only be clean
    > +				 * if it has backing store assigned.
    > +				 */
    >  
    > -			/* does the page belong to an object? */
    > -			if (uobj != NULL) {
    > -				uvmexp.pdobscan++;
    > -			} else {
    > -				KASSERT(anon != NULL);
    > -				uvmexp.pdanscan++;
    > -			}
    > +				KASSERT(anon->an_swslot != 0);
    >  
    > -			/*
    > -			 * we now have the page queues locked.
    > -			 * the page is not busy.   if the page is clean we
    > -			 * can free it now and continue.
    > -			 */
    > -			if (p->pg_flags & PG_CLEAN) {
    > -				if (p->pg_flags & PQ_SWAPBACKED) {
    > -					/* this page now lives only in swap */
    > -					atomic_inc_int(&uvmexp.swpgonly);
    > -				}
    > -
    > -				/* zap all mappings with pmap_page_protect... */
    > -				pmap_page_protect(p, PROT_NONE);
    > -				/* dequeue first to prevent lock recursion */
    > -				if (p->pg_flags & (PQ_ACTIVE|PQ_INACTIVE))
    > -					uvm_pagedequeue(p);
    > -				uvm_pagefree(p);
    > -				freed++;
    > -
    > -				if (anon) {
    > -
    > -					/*
    > -					 * an anonymous page can only be clean
    > -					 * if it has backing store assigned.
    > -					 */
    > -
    > -					KASSERT(anon->an_swslot != 0);
    > -
    > -					/* remove from object */
    > -					anon->an_page = NULL;
    > -				}
    > -				rw_exit(slock);
    > -				continue;
    > +				/* remove from object */
    > +				anon->an_page = NULL;
    >  			}
    > +			rw_exit(slock);
    > +			continue;
    > +		}
    >  
    > -			/*
    > -			 * this page is dirty, skip it if we'll have met our
    > -			 * free target when all the current pageouts complete.
    > -			 */
    > -			if (uvmpd_pma_done(pma) &&
    > -			    (uvmexp.paging > (shortage - freed))) {
    > +		/*
    > +		 * this page is dirty, skip it if we'll have met our
    > +		 * free target when all the current pageouts complete.
    > +		 */
    > +		if (uvmpd_pma_done(pma) &&
    > +		    (uvmexp.paging > (shortage - freed))) {
    > +			rw_exit(slock);
    > +			continue;
    > +		}
    > +
    > +		/*
    > +		 * this page is dirty, but we can't page it out
    > +		 * since all pages in swap are only in swap.
    > +		 * reactivate it so that we eventually cycle
    > +		 * all pages thru the inactive queue.
    > +		 */
    > +		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
    > +			dirtyreacts++;
    > +			uvm_unlock_pageq();
    > +			uvm_pageactivate(p);
    > +			rw_exit(slock);
    > +			uvm_lock_pageq();
    > +			continue;
    > +		}
    > +
    > +		/*
    > +		 * if the page is swap-backed and dirty and swap space
    > +		 * is full, free any swap allocated to the page
    > +		 * so that other pages can be paged out.
    > +		 */
    > +		if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
    > +			uvmpd_dropswap(p);
    > +
    > +		/*
    > +		 * the page we are looking at is dirty.   we must
    > +		 * clean it before it can be freed.  to do this we
    > +		 * first mark the page busy so that no one else will
    > +		 * touch the page.   we write protect all the mappings
    > +		 * of the page so that no one touches it while it is
    > +		 * in I/O.
    > +		 */
    > +		swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
    > +		atomic_setbits_int(&p->pg_flags, PG_BUSY);
    > +		UVM_PAGE_OWN(p, "scan_inactive");
    > +		pmap_page_protect(p, PROT_READ);
    > +		uvmexp.pgswapout++;
    > +
    > +		/*
    > +		 * for swap-backed pages we need to (re)allocate
    > +		 * swap space.
    > +		 */
    > +		if (swap_backed) {
    > +			/* free old swap slot (if any) */
    > +			uvmpd_dropswap(p);
    > +
    > +			/* start new cluster (if necessary) */
    > +			if (swapcluster_allocslots(&swc)) {
    > +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
    > +				UVM_PAGE_OWN(p, NULL);
    > +				dirtyreacts++;
    > +				uvm_unlock_pageq();
    > +				uvm_pageactivate(p);
    >  				rw_exit(slock);
    > +				uvm_lock_pageq();
    >  				continue;
    >  			}
    >  
    > -			/*
    > -			 * this page is dirty, but we can't page it out
    > -			 * since all pages in swap are only in swap.
    > -			 * reactivate it so that we eventually cycle
    > -			 * all pages thru the inactive queue.
    > -			 */
    > -			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfull()) {
    > +			/* add block to cluster */
    > +			if (swapcluster_add(&swc, p)) {
    > +				atomic_clearbits_int(&p->pg_flags, PG_BUSY);
    > +				UVM_PAGE_OWN(p, NULL);
    >  				dirtyreacts++;
    >  				uvm_unlock_pageq();
    >  				uvm_pageactivate(p);
    > @@ -775,72 +814,11 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  				uvm_lock_pageq();
    >  				continue;
    >  			}
    > +			rw_exit(slock);
    >  
    > -			/*
    > -			 * if the page is swap-backed and dirty and swap space
    > -			 * is full, free any swap allocated to the page
    > -			 * so that other pages can be paged out.
    > -			 */
    > -			if ((p->pg_flags & PQ_SWAPBACKED) && uvm_swapisfilled())
    > -				uvmpd_dropswap(p);
    > -
    > -			/*
    > -			 * the page we are looking at is dirty.   we must
    > -			 * clean it before it can be freed.  to do this we
    > -			 * first mark the page busy so that no one else will
    > -			 * touch the page.   we write protect all the mappings
    > -			 * of the page so that no one touches it while it is
    > -			 * in I/O.
    > -			 */
    > -
    > -			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
    > -			atomic_setbits_int(&p->pg_flags, PG_BUSY);
    > -			UVM_PAGE_OWN(p, "scan_inactive");
    > -			pmap_page_protect(p, PROT_READ);
    > -			uvmexp.pgswapout++;
    > -
    > -			/*
    > -			 * for swap-backed pages we need to (re)allocate
    > -			 * swap space.
    > -			 */
    > -			if (swap_backed) {
    > -				/* free old swap slot (if any) */
    > -				uvmpd_dropswap(p);
    > -
    > -				/* start new cluster (if necessary) */
    > -				if (swapcluster_allocslots(&swc)) {
    > -					atomic_clearbits_int(&p->pg_flags,
    > -					    PG_BUSY);
    > -					UVM_PAGE_OWN(p, NULL);
    > -					dirtyreacts++;
    > -					uvm_unlock_pageq();
    > -					uvm_pageactivate(p);
    > -					rw_exit(slock);
    > -					uvm_lock_pageq();
    > -					continue;
    > -				}
    > -
    > -				/* add block to cluster */
    > -				if (swapcluster_add(&swc, p)) {
    > -					atomic_clearbits_int(&p->pg_flags,
    > -					    PG_BUSY);
    > -					UVM_PAGE_OWN(p, NULL);
    > -					dirtyreacts++;
    > -					uvm_unlock_pageq();
    > -					uvm_pageactivate(p);
    > -					rw_exit(slock);
    > -					uvm_lock_pageq();
    > -					continue;
    > -				}
    > -				rw_exit(slock);
    > -
    > -				/* cluster not full yet? */
    > -				if (swc.swc_nused < swc.swc_nallocated)
    > -					continue;
    > -			}
    > -		} else {
    > -			/* if p == NULL we must be doing a last swap i/o */
    > -			swap_backed = TRUE;
    > +			/* cluster not full yet? */
    > +			if (swc.swc_nused < swc.swc_nallocated)
    > +				continue;
    >  		}
    >  
    >  		/*
    > @@ -884,6 +862,18 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
    >  		}
    >  	}
    >  	TAILQ_REMOVE(pglst, &iter, pageq);
    > +
    > +	/* final swap-backed clustered pageout */
    > +	if (swc.swc_slot > 0) {
    > +		uvm_unlock_pageq();
    > +		npages = swc.swc_nused;
    > +		result = swapcluster_flush(&swc);
    > +		uvm_lock_pageq();
    > +		if (result == VM_PAGER_PEND) {
    > +			atomic_add_int(&uvmexp.paging, npages);
    > +			uvmexp.pdpending++;
    > +		}
    > +	}
    >  
    >  	return freed;
    >  }
    
    