From: Claudio Jeker Subject: Re: Fix swapping for MP archs. To: tech@openbsd.org Date: Wed, 20 Nov 2024 11:33:31 +0100 On Wed, Nov 20, 2024 at 10:17:48AM +0100, Martin Pieuchot wrote: > On 09/11/24(Sat) 13:23, Martin Pieuchot wrote: > > The current swapping algorithm is broken for MP architectures. However > > the bouncing logic, and its sleeping point, in uvm_swap_io() hides it. > > > > In order to see the degenerative behavior you need: > > > > - An MP machine with more than 2 CPUs > > - No high/low memory split and a lot of memory > > - Disable swap encryption (sysctl vm.swapencrypt.enable=0) > > > > When all these criteria are met the kernel ends up doing ping-pong for > > every buffer between the page daemon, the aiodone daemon and the disk > > interrupt handler, all currently needing the KERNEL_LOCK(). > > > > To fix this degenerative behavior, where the page daemon is starving the > > aiodone daemon and interrupt handler, we need to account for in-flight > > pages. Diff below does that by accounting for pages being currently > > written to disk in the "shortage" calculation. > > > > This diff makes swapping work as expected, no degenerative behavior, no > > over-swapping on MP machines with the above-mentioned specification. I > > believe it finally fixes swapping for 64-bit archs with a ton of memory. > > > > Note that I deliberately did not use atomic_load_int() for reading > > `uvmexp.paging'. I do not want to introduce too many differences with > > NetBSD for the moment and only a single thread (the page daemon) is > > reading the counter. > > > > This diff is a requirement for in-place encryption which, obviously, > > does not use bouncing. > > > > While here, release the KERNEL_LOCK() in the aiodone daemon when not > > needed. > > ok? 
Some bikeshed comments below: > > Index: uvm/uvm_pdaemon.c > > =================================================================== > > RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v > > diff -u -p -r1.129 uvm_pdaemon.c > > --- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129 > > +++ uvm/uvm_pdaemon.c 9 Nov 2024 11:17:29 -0000 > > @@ -232,7 +232,7 @@ uvm_pageout(void *arg) > > long size; > > > > uvm_lock_fpageq(); > > - if (TAILQ_EMPTY(&uvm.pmr_control.allocs)) { > > + if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) { > > msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM, > > "pgdaemon", INFSLP); > > uvmexp.pdwoke++; > > @@ -245,7 +245,8 @@ uvm_pageout(void *arg) > > constraint = no_constraint; > > } > > /* How many pages do we need to free during this round? */ > > - shortage = uvmexp.freetarg - uvmexp.free + BUFPAGES_DEFICIT; > > + shortage = uvmexp.freetarg - > > + (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT; > > uvm_unlock_fpageq(); > > > > /* > > @@ -302,8 +303,7 @@ uvm_pageout(void *arg) > > * wake up any waiters. > > */ > > uvm_lock_fpageq(); > > - if (uvmexp.free > uvmexp.reserve_kernel || > > - uvmexp.paging == 0) { > > + if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) { > > wakeup(&uvmexp.free); > > } > > > > @@ -339,10 +339,11 @@ uvm_pageout(void *arg) > > void > > uvm_aiodone_daemon(void *arg) > > { > > - int s, free; > > + int s, npages; > > struct buf *bp, *nbp; > > > > uvm.aiodoned_proc = curproc; > > + KERNEL_UNLOCK(); > > > > for (;;) { > > /* > > @@ -357,11 +358,13 @@ uvm_aiodone_daemon(void *arg) > > TAILQ_INIT(&uvm.aio_done); > > mtx_leave(&uvm.aiodoned_lock); > > > > + Extra new line. > > /* process each i/o that's done. 
*/ > > - free = uvmexp.free; > > + npages = 0; > > + KERNEL_LOCK(); > > while (bp != NULL) { > > if (bp->b_flags & B_PDAEMON) { > > - uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT; > > + npages += bp->b_bufsize >> PAGE_SHIFT; > > } > > nbp = TAILQ_NEXT(bp, b_freelist); > > s = splbio(); /* b_iodone must by called at splbio */ > > @@ -371,8 +374,11 @@ uvm_aiodone_daemon(void *arg) > > > > sched_pause(yield); > > } > > + KERNEL_UNLOCK(); > > + > > uvm_lock_fpageq(); > > - wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon : > > + atomic_add_int(&uvmexp.paging, -npages); Why not use atomic_sub_int here? > > + wakeup(uvmexp.free <= uvmexp.reserve_kernel ? &uvm.pagedaemon : > > &uvmexp.free); Ugh. Not your issue but wakeup with a ternary operator looks crazy. > > uvm_unlock_fpageq(); > > } > > @@ -776,7 +782,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc * > > */ > > > > if (result == VM_PAGER_PEND) { > > - uvmexp.paging += npages; > > + atomic_add_int(&uvmexp.paging, npages); > > uvm_lock_pageq(); > > uvmexp.pdpending++; > > if (p) { > > Index: uvm/uvmexp.h > > =================================================================== > > RCS file: /cvs/src/sys/uvm/uvmexp.h,v > > diff -u -p -r1.15 uvmexp.h > > --- uvm/uvmexp.h 1 May 2024 12:54:27 -0000 1.15 > > +++ uvm/uvmexp.h 9 Nov 2024 11:11:44 -0000 > > @@ -60,7 +60,7 @@ struct uvmexp { > > int free; /* [F] number of free pages */ > > int active; /* [L] # of active pages */ > > int inactive; /* [L] # of pages that we free'd but may want back */ > > - int paging; /* number of pages in the process of being paged out */ > > + int paging; /* [a] # of pages in the process of being paged out */ > > int wired; /* number of wired pages */ > > > > int zeropages; /* [F] number of zero'd pages */ > > > > > > Diff looks good to me but I have no idea of UVM internals. -- :wq Claudio