Download raw body.
Swap retries & OOM
On 08/11/24(Fri) 11:27, Martin Pieuchot wrote:
> The page daemon always tries to build a 64K cluster of pages to
> write to the swap partition. When one of the many uvm_swap_io()
> allocations fails and returns VM_PAGER_AGAIN, the pager layer,
> re-try with a single page.
>
> Since r2k22 there is a pre-allocated 64K cluster of pages in swap
> layer. If it is already used, the page daemon will wait for it
> to be released by the aiodoned thread.
>
> In fact this pre-allocated cluster and the sleep associated to it,
> is what makes the page daemon release the KERNEL_LOCK() and allows
> the aiodoned thread to release pages written to disk. Without this
> the page daemon enters an infinite loop holding the KERNEL_LOCK()
> and never allows the aiodoned to release pages.
>
> So I'd like to kill this dead code. This is a step towards cleanup
> the uvm_pager_* layer a requirement to clean the uvnode layer.
I've been asked off-list why this is dead code. Here is why:
VM_PAGER_AGAIN is returned by uvm_swap_put() when an allocation failed.
There is at most 3 allocations in uvm_swap_io():
- buffer descriptor for the VFS
- kva mapping for the pages
- (optionally) cluster-size pages for bouncing
In case bouncing is necessary, the page daemon is using, or waiting for,
a pre-allocated 64K buffer in uvm_swap_allocpages(). So the remaining
allocations are independent from the size of the cluster being written
to disk. That's why retrying with a smaller cluster isn't more likely
to succeed.
> Index: uvm/uvm_pager.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
> diff -u -p -r1.92 uvm_pager.c
> --- uvm/uvm_pager.c 24 Jul 2024 12:18:10 -0000 1.92
> +++ uvm/uvm_pager.c 8 Nov 2024 10:11:25 -0000
> @@ -520,7 +520,6 @@ uvm_pager_put(struct uvm_object *uobj, s
> * now attempt the I/O. if we have a failure and we are
> * clustered, we will drop the cluster and try again.
> */
> -ReTry:
> if (uobj) {
> result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
> } else {
> @@ -564,48 +563,34 @@ ReTry:
> * "swblk" (for transient errors, so we can retry),
> * or 0 (for hard errors).
> */
> - if (uobj == NULL && pg != NULL) {
> - /* XXX daddr_t -> int */
> - int nswblk = (result == VM_PAGER_AGAIN) ? swblk : 0;
> - if (pg->pg_flags & PQ_ANON) {
> - rw_enter(pg->uanon->an_lock, RW_WRITE);
> - pg->uanon->an_swslot = nswblk;
> - rw_exit(pg->uanon->an_lock);
> - } else {
> - rw_enter(pg->uobject->vmobjlock, RW_WRITE);
> - uao_set_swslot(pg->uobject,
> - pg->offset >> PAGE_SHIFT,
> - nswblk);
> - rw_exit(pg->uobject->vmobjlock);
> - }
> - }
> - if (result == VM_PAGER_AGAIN) {
> - /*
> - * for transient failures, free all the swslots that
> - * we're not going to retry with.
> - */
> - if (uobj == NULL) {
> - if (pg) {
> - /* XXX daddr_t -> int */
> - uvm_swap_free(swblk + 1, *npages - 1);
> + if (uobj == NULL) {
> + if (pg != NULL) {
> + if (pg->pg_flags & PQ_ANON) {
> + rw_enter(pg->uanon->an_lock, RW_WRITE);
> + pg->uanon->an_swslot = 0;
> + rw_exit(pg->uanon->an_lock);
> } else {
> - /* XXX daddr_t -> int */
> - uvm_swap_free(swblk, *npages);
> + rw_enter(pg->uobject->vmobjlock, RW_WRITE);
> + uao_set_swslot(pg->uobject,
> + pg->offset >> PAGE_SHIFT, 0);
> + rw_exit(pg->uobject->vmobjlock);
> }
> }
> - if (pg) {
> - ppsp[0] = pg;
> - *npages = 1;
> - goto ReTry;
> - }
> - } else if (uobj == NULL) {
> /*
> - * for hard errors on swap-backed pageouts,
> - * mark the swslots as bad. note that we do not
> - * free swslots that we mark bad.
> + * for transient failures, free all the swslots
> */
> - /* XXX daddr_t -> int */
> - uvm_swap_markbad(swblk, *npages);
> + if (result == VM_PAGER_AGAIN) {
> + /* XXX daddr_t -> int */
> + uvm_swap_free(swblk, *npages);
> + } else {
> + /*
> + * for hard errors on swap-backed pageouts,
> + * mark the swslots as bad. note that we do not
> + * free swslots that we mark bad.
> + */
> + /* XXX daddr_t -> int */
> + uvm_swap_markbad(swblk, *npages);
> + }
> }
> }
>
> @@ -614,7 +599,6 @@ ReTry:
> * was one). give up! the caller only has one page ("pg")
> * to worry about.
> */
> -
> return result;
> }
>
> Index: uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> diff -u -p -r1.129 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129
> +++ uvm/uvm_pdaemon.c 8 Nov 2024 09:34:31 -0000
> @@ -859,8 +859,6 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> if (result != VM_PAGER_AGAIN)
> uvm_pageactivate(p);
> pmap_clear_reference(p);
> - /* XXXCDC: if (swap_backed) FREE p's
> - * swap block? */
> } else {
> /* pageout was a success... */
> pmap_clear_reference(p);
>
Swap retries & OOM