Swap retries & OOM
On 09/11/24(Sat) 11:37, Martin Pieuchot wrote:
> On 08/11/24(Fri) 11:27, Martin Pieuchot wrote:
> > The page daemon always tries to build a 64K cluster of pages to
> > write to the swap partition. When one of the allocations in
> > uvm_swap_io() fails, VM_PAGER_AGAIN is returned and the pager
> > layer retries with a single page.
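> >
> > For reference, the retry path removed below boils down to this
> > (a simplified sketch of the code in uvm_pager_put(), not a
> > verbatim copy):
> >
> > 	ReTry:
> > 		if (uobj)
> > 			result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
> > 		else
> > 			result = uvm_swap_put(swblk, ppsp, *npages, flags);
> >
> > 		if (result == VM_PAGER_AGAIN && pg != NULL) {
> > 			/* drop the cluster and retry with a single page */
> > 			ppsp[0] = pg;
> > 			*npages = 1;
> > 			goto ReTry;
> > 		}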
> >
> > Since r2k22 there has been a pre-allocated 64K cluster of pages in
> > the swap layer. If it is already in use, the page daemon waits for
> > it to be released by the aiodoned thread.
> >
> > In fact, this pre-allocated cluster and the sleep associated with
> > it are what make the page daemon release the KERNEL_LOCK() and let
> > the aiodoned thread release pages written to disk. Without this,
> > the page daemon enters an infinite loop holding the KERNEL_LOCK()
> > and never allows aiodoned to release pages.
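> >
> > To illustrate why the sleep matters: in OpenBSD, tsleep(9)-style
> > functions drop the KERNEL_LOCK() while the thread is asleep. A
> > minimal sketch of the wait (the identifier and wait message are
> > illustrative, not the exact uvm_swap.c code):
> >
> > 	/* wait until the pre-allocated 64K cluster is free */
> > 	while (swap_cluster_in_use) {
> > 		/*
> > 		 * Sleeping releases the KERNEL_LOCK(), so aiodoned
> > 		 * can take it and release the pages whose I/O has
> > 		 * completed.
> > 		 */
> > 		tsleep_nsec(&swap_cluster_in_use, PVM, "swclust", INFSLP);
> > 	}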
> >
> > So I'd like to kill this dead code. This is a step towards cleaning
> > up the uvm_pager_* layer, a requirement for cleaning the uvnode
> > layer.
>
> I've been asked off-list why this is dead code. Here is why:
>
> VM_PAGER_AGAIN is returned by uvm_swap_put() when an allocation fails.
> There are at most 3 allocations in uvm_swap_io():
>
> - buffer descriptor for the VFS
> - kva mapping for the pages
> - (optionally) cluster-size pages for bouncing
>
> When bouncing is necessary, the page daemon uses, or waits for, the
> pre-allocated 64K buffer in uvm_swap_allocpages(). So the remaining
> allocations are independent of the size of the cluster being written
> to disk. That's why retrying with a smaller cluster isn't any more
> likely to succeed.
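>
> In pseudocode (uvm_pagermapin() is the real mapping function;
> swap_getbuf() is a hypothetical stand-in for the actual buffer
> descriptor allocation):
>
> 	/* kva mapping for the pages */
> 	if ((kva = uvm_pagermapin(pps, npages, flags)) == 0)
> 		return VM_PAGER_AGAIN;
>
> 	/* buffer descriptor for the VFS */
> 	if ((bp = swap_getbuf()) == NULL)
> 		return VM_PAGER_AGAIN;
>
> 	/*
> 	 * (optionally) bounce pages: the page daemon uses, or sleeps
> 	 * on, the pre-allocated 64K cluster from uvm_swap_allocpages(),
> 	 * so this allocation does not fail for it.
> 	 */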
Anyone?
>
> > Index: uvm/uvm_pager.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
> > diff -u -p -r1.92 uvm_pager.c
> > --- uvm/uvm_pager.c 24 Jul 2024 12:18:10 -0000 1.92
> > +++ uvm/uvm_pager.c 8 Nov 2024 10:11:25 -0000
> > @@ -520,7 +520,6 @@ uvm_pager_put(struct uvm_object *uobj, s
> > * now attempt the I/O. if we have a failure and we are
> > * clustered, we will drop the cluster and try again.
> > */
> > -ReTry:
> > if (uobj) {
> > result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
> > } else {
> > @@ -564,48 +563,34 @@ ReTry:
> > * "swblk" (for transient errors, so we can retry),
> > * or 0 (for hard errors).
> > */
> > - if (uobj == NULL && pg != NULL) {
> > - /* XXX daddr_t -> int */
> > - int nswblk = (result == VM_PAGER_AGAIN) ? swblk : 0;
> > - if (pg->pg_flags & PQ_ANON) {
> > - rw_enter(pg->uanon->an_lock, RW_WRITE);
> > - pg->uanon->an_swslot = nswblk;
> > - rw_exit(pg->uanon->an_lock);
> > - } else {
> > - rw_enter(pg->uobject->vmobjlock, RW_WRITE);
> > - uao_set_swslot(pg->uobject,
> > - pg->offset >> PAGE_SHIFT,
> > - nswblk);
> > - rw_exit(pg->uobject->vmobjlock);
> > - }
> > - }
> > - if (result == VM_PAGER_AGAIN) {
> > - /*
> > - * for transient failures, free all the swslots that
> > - * we're not going to retry with.
> > - */
> > - if (uobj == NULL) {
> > - if (pg) {
> > - /* XXX daddr_t -> int */
> > - uvm_swap_free(swblk + 1, *npages - 1);
> > + if (uobj == NULL) {
> > + if (pg != NULL) {
> > + if (pg->pg_flags & PQ_ANON) {
> > + rw_enter(pg->uanon->an_lock, RW_WRITE);
> > + pg->uanon->an_swslot = 0;
> > + rw_exit(pg->uanon->an_lock);
> > } else {
> > - /* XXX daddr_t -> int */
> > - uvm_swap_free(swblk, *npages);
> > + rw_enter(pg->uobject->vmobjlock, RW_WRITE);
> > + uao_set_swslot(pg->uobject,
> > + pg->offset >> PAGE_SHIFT, 0);
> > + rw_exit(pg->uobject->vmobjlock);
> > }
> > }
> > - if (pg) {
> > - ppsp[0] = pg;
> > - *npages = 1;
> > - goto ReTry;
> > - }
> > - } else if (uobj == NULL) {
> > /*
> > - * for hard errors on swap-backed pageouts,
> > - * mark the swslots as bad. note that we do not
> > - * free swslots that we mark bad.
> > + * for transient failures, free all the swslots
> > */
> > - /* XXX daddr_t -> int */
> > - uvm_swap_markbad(swblk, *npages);
> > + if (result == VM_PAGER_AGAIN) {
> > + /* XXX daddr_t -> int */
> > + uvm_swap_free(swblk, *npages);
> > + } else {
> > + /*
> > + * for hard errors on swap-backed pageouts,
> > + * mark the swslots as bad. note that we do not
> > + * free swslots that we mark bad.
> > + */
> > + /* XXX daddr_t -> int */
> > + uvm_swap_markbad(swblk, *npages);
> > + }
> > }
> > }
> >
> > @@ -614,7 +599,6 @@ ReTry:
> > * was one). give up! the caller only has one page ("pg")
> > * to worry about.
> > */
> > -
> > return result;
> > }
> >
> > Index: uvm/uvm_pdaemon.c
> > ===================================================================
> > RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> > diff -u -p -r1.129 uvm_pdaemon.c
> > --- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129
> > +++ uvm/uvm_pdaemon.c 8 Nov 2024 09:34:31 -0000
> > @@ -859,8 +859,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> > if (result != VM_PAGER_AGAIN)
> > uvm_pageactivate(p);
> > pmap_clear_reference(p);
> > - /* XXXCDC: if (swap_backed) FREE p's
> > - * swap block? */
> > } else {
> > /* pageout was a success... */
> > pmap_clear_reference(p);
> >
>