From: Martin Pieuchot Subject: Re: Fix swapping for MP archs. To: tech@openbsd.org Date: Wed, 20 Nov 2024 10:17:48 +0100 On 09/11/24(Sat) 13:23, Martin Pieuchot wrote: > The current swapping algorithm is broken for MP architectures. However > the bouncing logic, and it sleeping point, in uvm_swap_io() hides it. > > In order to see the degenerative behavior you need: > > - A MP machine with more than 2 CPUs > - No high/low memory split and a lot of memory > - Disable swap encryption (sysctl vm.swapencrypt.enable=0) > > When all these criteria are met the kernel ends up doing ping-pong for > every buffer between the page daemon, the aiodone daemon and the disk > interrupt handler, all currently needing the KERNEL_LOCK(). > > To fix this degenerative behavior, where the page daemon is starving the > aiodone daemon and interrupt handler, we need to account for in-flight > pages. Diff below does that by accounting for pages being currently > written to disk in the "shortage" calculation. > > This diff makes swapping works as expected, no degenerative behavior, no > over-swapping on MP machine with the above mentioned specification. I > believe it finally fixes swapping for 64 bits archs with a ton of memory. > > Note that I deliberately did not use atomic_load_int() for reading > `uvmexp.paging'. I do not want to introduce too many differences with > NetBSD for the moment and only a single thread (the page daemon) is > reading the counter. > > This diff is a requirement for in-place encryption which, obviously, > does not use bouncing. > > While here, release the KERNEL_LOCK() in the aiodone daemon when not > needed. ok? > Index: uvm/uvm_pdaemon.c > =================================================================== > RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v > diff -u -p -r1.129 uvm_pdaemon.c > --- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129 > +++ uvm/uvm_pdaemon.c 9 Nov 2024 11:17:29 -0000 > @@ -232,7 +232,7 @@ uvm_pageout(void *arg) > long size; > > uvm_lock_fpageq(); > - if (TAILQ_EMPTY(&uvm.pmr_control.allocs)) { > + if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) { > msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM, > "pgdaemon", INFSLP); > uvmexp.pdwoke++; > @@ -245,7 +245,8 @@ uvm_pageout(void *arg) > constraint = no_constraint; > } > /* How many pages do we need to free during this round? */ > - shortage = uvmexp.freetarg - uvmexp.free + BUFPAGES_DEFICIT; > + shortage = uvmexp.freetarg - > + (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT; > uvm_unlock_fpageq(); > > /* > @@ -302,8 +303,7 @@ uvm_pageout(void *arg) > * wake up any waiters. > */ > uvm_lock_fpageq(); > - if (uvmexp.free > uvmexp.reserve_kernel || > - uvmexp.paging == 0) { > + if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) { > wakeup(&uvmexp.free); > } > > @@ -339,10 +339,11 @@ uvm_pageout(void *arg) > void > uvm_aiodone_daemon(void *arg) > { > - int s, free; > + int s, npages; > struct buf *bp, *nbp; > > uvm.aiodoned_proc = curproc; > + KERNEL_UNLOCK(); > > for (;;) { > /* > @@ -357,11 +358,13 @@ uvm_aiodone_daemon(void *arg) > TAILQ_INIT(&uvm.aio_done); > mtx_leave(&uvm.aiodoned_lock); > > + > /* process each i/o that's done. */ > - free = uvmexp.free; > + npages = 0; > + KERNEL_LOCK(); > while (bp != NULL) { > if (bp->b_flags & B_PDAEMON) { > - uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT; > + npages += bp->b_bufsize >> PAGE_SHIFT; > } > nbp = TAILQ_NEXT(bp, b_freelist); > s = splbio(); /* b_iodone must by called at splbio */ > @@ -371,8 +374,11 @@ uvm_aiodone_daemon(void *arg) > > sched_pause(yield); > } > + KERNEL_UNLOCK(); > + > uvm_lock_fpageq(); > - wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon : > + atomic_add_int(&uvmexp.paging, -npages); > + wakeup(uvmexp.free <= uvmexp.reserve_kernel ? &uvm.pagedaemon : > &uvmexp.free); > uvm_unlock_fpageq(); > } > @@ -776,7 +782,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc * > */ > > if (result == VM_PAGER_PEND) { > - uvmexp.paging += npages; > + atomic_add_int(&uvmexp.paging, npages); > uvm_lock_pageq(); > uvmexp.pdpending++; > if (p) { > Index: uvm/uvmexp.h > =================================================================== > RCS file: /cvs/src/sys/uvm/uvmexp.h,v > diff -u -p -r1.15 uvmexp.h > --- uvm/uvmexp.h 1 May 2024 12:54:27 -0000 1.15 > +++ uvm/uvmexp.h 9 Nov 2024 11:11:44 -0000 > @@ -60,7 +60,7 @@ struct uvmexp { > int free; /* [F] number of free pages */ > int active; /* [L] # of active pages */ > int inactive; /* [L] # of pages that we free'd but may want back */ > - int paging; /* number of pages in the process of being paged out */ > + int paging; /* [a] # of pages in the process of being paged out */ > int wired; /* number of wired pages */ > > int zeropages; /* [F] number of zero'd pages */ > >