From: Martin Pieuchot
Subject: Re: Swap retries & OOM
To: tech@openbsd.org
Date: Wed, 20 Nov 2024 10:17:32 +0100

On 09/11/24(Sat) 11:37, Martin Pieuchot wrote:
> On 08/11/24(Fri) 11:27, Martin Pieuchot wrote:
> > The page daemon always tries to build a 64K cluster of pages to
> > write to the swap partition.  When one of the many uvm_swap_io()
> > allocations fails and returns VM_PAGER_AGAIN, the pager layer
> > retries with a single page.
> >
> > Since r2k22 there is a pre-allocated 64K cluster of pages in the
> > swap layer.  If it is already used, the page daemon will wait for
> > it to be released by the aiodoned thread.
> >
> > In fact this pre-allocated cluster, and the sleep associated with
> > it, are what make the page daemon release the KERNEL_LOCK() and
> > allow the aiodoned thread to release pages written to disk.
> > Without this the page daemon enters an infinite loop holding the
> > KERNEL_LOCK() and never allows the aiodoned thread to release
> > pages.
> >
> > So I'd like to kill this dead code.  This is a step towards
> > cleaning up the uvm_pager_* layer, a requirement for cleaning the
> > uvnode layer.
>
> I've been asked off-list why this is dead code.  Here is why:
>
> VM_PAGER_AGAIN is returned by uvm_swap_put() when an allocation
> failed.  There are at most 3 allocations in uvm_swap_io():
>
>  - buffer descriptor for the VFS
>  - kva mapping for the pages
>  - (optionally) cluster-size pages for bouncing
>
> In case bouncing is necessary, the page daemon uses, or waits for,
> the pre-allocated 64K buffer in uvm_swap_allocpages().  So the
> remaining allocations are independent of the size of the cluster
> being written to disk.  That's why retrying with a smaller cluster
> isn't more likely to succeed.

Anyone?

> > Index: uvm/uvm_pager.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
> > diff -u -p -r1.92 uvm_pager.c
> > --- uvm/uvm_pager.c	24 Jul 2024 12:18:10 -0000	1.92
> > +++ uvm/uvm_pager.c	8 Nov 2024 10:11:25 -0000
> > @@ -520,7 +520,6 @@ uvm_pager_put(struct uvm_object *uobj, s
> >  	 * now attempt the I/O. if we have a failure and we are
> >  	 * clustered, we will drop the cluster and try again.
> >  	 */
> > -ReTry:
> >  	if (uobj) {
> >  		result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
> >  	} else {
> > @@ -564,48 +563,34 @@ ReTry:
> >  	 * "swblk" (for transient errors, so we can retry),
> >  	 * or 0 (for hard errors).
> >  	 */
> > -	if (uobj == NULL && pg != NULL) {
> > -		/* XXX daddr_t -> int */
> > -		int nswblk = (result == VM_PAGER_AGAIN) ? swblk : 0;
> > -		if (pg->pg_flags & PQ_ANON) {
> > -			rw_enter(pg->uanon->an_lock, RW_WRITE);
> > -			pg->uanon->an_swslot = nswblk;
> > -			rw_exit(pg->uanon->an_lock);
> > -		} else {
> > -			rw_enter(pg->uobject->vmobjlock, RW_WRITE);
> > -			uao_set_swslot(pg->uobject,
> > -			    pg->offset >> PAGE_SHIFT,
> > -			    nswblk);
> > -			rw_exit(pg->uobject->vmobjlock);
> > -		}
> > -	}
> > -	if (result == VM_PAGER_AGAIN) {
> > -		/*
> > -		 * for transient failures, free all the swslots that
> > -		 * we're not going to retry with.
> > -		 */
> > -		if (uobj == NULL) {
> > -			if (pg) {
> > -				/* XXX daddr_t -> int */
> > -				uvm_swap_free(swblk + 1, *npages - 1);
> > +	if (uobj == NULL) {
> > +		if (pg != NULL) {
> > +			if (pg->pg_flags & PQ_ANON) {
> > +				rw_enter(pg->uanon->an_lock, RW_WRITE);
> > +				pg->uanon->an_swslot = 0;
> > +				rw_exit(pg->uanon->an_lock);
> >  			} else {
> > -				/* XXX daddr_t -> int */
> > -				uvm_swap_free(swblk, *npages);
> > +				rw_enter(pg->uobject->vmobjlock, RW_WRITE);
> > +				uao_set_swslot(pg->uobject,
> > +				    pg->offset >> PAGE_SHIFT, 0);
> > +				rw_exit(pg->uobject->vmobjlock);
> >  			}
> >  		}
> > -		if (pg) {
> > -			ppsp[0] = pg;
> > -			*npages = 1;
> > -			goto ReTry;
> > -		}
> > -	} else if (uobj == NULL) {
> >  		/*
> > -		 * for hard errors on swap-backed pageouts,
> > -		 * mark the swslots as bad. note that we do not
> > -		 * free swslots that we mark bad.
> > +		 * for transient failures, free all the swslots
> >  		 */
> > -		/* XXX daddr_t -> int */
> > -		uvm_swap_markbad(swblk, *npages);
> > +		if (result == VM_PAGER_AGAIN) {
> > +			/* XXX daddr_t -> int */
> > +			uvm_swap_free(swblk, *npages);
> > +		} else {
> > +			/*
> > +			 * for hard errors on swap-backed pageouts,
> > +			 * mark the swslots as bad. note that we do not
> > +			 * free swslots that we mark bad.
> > +			 */
> > +			/* XXX daddr_t -> int */
> > +			uvm_swap_markbad(swblk, *npages);
> > +		}
> >  	}
> >  }
> >
> > @@ -614,7 +599,6 @@ ReTry:
> >  	 * was one). give up! the caller only has one page ("pg")
> >  	 * to worry about.
> >  	 */
> > -
> >  	return result;
> >  }
> >
> > Index: uvm/uvm_pdaemon.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> > diff -u -p -r1.129 uvm_pdaemon.c
> > --- uvm/uvm_pdaemon.c	7 Nov 2024 10:46:52 -0000	1.129
> > +++ uvm/uvm_pdaemon.c	8 Nov 2024 09:34:31 -0000
> > @@ -859,8 +859,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> >  			if (result != VM_PAGER_AGAIN)
> >  				uvm_pageactivate(p);
> >  			pmap_clear_reference(p);
> > -			/* XXXCDC: if (swap_backed) FREE p's
> > -			 * swap block? */
> >  		} else {
> >  			/* pageout was a success... */
> >  			pmap_clear_reference(p);
> >
>
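To make the dead-code argument concrete, here is a simplified userland
sketch of the write path. It is NOT the kernel code: every "model_" name
and constant is invented for illustration, only the control flow is meant
to mirror the 3 allocations in uvm_swap_io() listed above.

/*
 * Illustrative sketch only; "model_" names are made up.
 */
#include <stdio.h>
#include <stdlib.h>

#define MODEL_OK	0	/* stands in for VM_PAGER_OK */
#define MODEL_AGAIN	1	/* stands in for VM_PAGER_AGAIN */

static int
model_swap_put(size_t npages)
{
	void *bp, *kva;

	/* 1. buffer descriptor for the VFS: fixed size. */
	if ((bp = malloc(64)) == NULL)
		return MODEL_AGAIN;

	/*
	 * 2. kva mapping for the pages: address-space pressure which,
	 * per the argument above, is independent of the cluster size.
	 */
	if ((kva = malloc(npages * 4096)) == NULL) {
		free(bp);
		return MODEL_AGAIN;
	}

	/*
	 * 3. (optional) pages to bounce through: the page daemon uses,
	 * or sleeps for, the single pre-allocated 64K cluster from
	 * uvm_swap_allocpages(), so this step blocks instead of
	 * failing; modelled here as always succeeding.
	 */

	free(kva);
	free(bp);
	return MODEL_OK;
}

int
main(void)
{
	/*
	 * The single-page retry re-runs exactly the same two fallible
	 * allocations as the 16-page attempt, so it is not more likely
	 * to succeed.
	 */
	printf("16 pages -> %d\n", model_swap_put(16));
	printf(" 1 page  -> %d\n", model_swap_put(1));
	return 0;
}

Steps 1 and 3 plainly don't change when the cluster shrinks, and for the
page daemon step 3 sleeps on the pre-allocated cluster rather than
failing, which is why the ReTry path removed by the diff can never turn
a VM_PAGER_AGAIN into a success.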