From: "Martin Pieuchot" Subject: Fix swapping for MP archs. To: tech@openbsd.org Date: Sat, 09 Nov 2024 13:23:45 +0100 The current swapping algorithm is broken for MP architectures. However the bouncing logic, and it sleeping point, in uvm_swap_io() hides it. In order to see the degenerative behavior you need: - A MP machine with more than 2 CPUs - No high/low memory split and a lot of memory - Disable swap encryption (sysctl vm.swapencrypt.enable=0) When all these criteria are met the kernel ends up doing ping-pong for every buffer between the page daemon, the aiodone daemon and the disk interrupt handler, all currently needing the KERNEL_LOCK(). To fix this degenerative behavior, where the page daemon is starving the aiodone daemon and interrupt handler, we need to account for in-flight pages. Diff below does that by accounting for pages being currently written to disk in the "shortage" calculation. This diff makes swapping works as expected, no degenerative behavior, no over-swapping on MP machine with the above mentioned specification. I believe it finally fixes swapping for 64 bits archs with a ton of memory. Note that I deliberately did not use atomic_load_int() for reading `uvmexp.paging'. I do not want to introduce too many differences with NetBSD for the moment and only a single thread (the page daemon) is reading the counter. This diff is a requirement for in-place encryption which, obviously, does not use bouncing. While here, release the KERNEL_LOCK() in the aiodone daemon when not needed. ok? Index: uvm/uvm_pdaemon.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v diff -u -p -r1.129 uvm_pdaemon.c --- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129 +++ uvm/uvm_pdaemon.c 9 Nov 2024 11:17:29 -0000 @@ -232,7 +232,7 @@ uvm_pageout(void *arg) long size; uvm_lock_fpageq(); - if (TAILQ_EMPTY(&uvm.pmr_control.allocs)) { + if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) { msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM, "pgdaemon", INFSLP); uvmexp.pdwoke++; @@ -245,7 +245,8 @@ uvm_pageout(void *arg) constraint = no_constraint; } /* How many pages do we need to free during this round? */ - shortage = uvmexp.freetarg - uvmexp.free + BUFPAGES_DEFICIT; + shortage = uvmexp.freetarg - + (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT; uvm_unlock_fpageq(); /* @@ -302,8 +303,7 @@ uvm_pageout(void *arg) * wake up any waiters. */ uvm_lock_fpageq(); - if (uvmexp.free > uvmexp.reserve_kernel || - uvmexp.paging == 0) { + if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) { wakeup(&uvmexp.free); } @@ -339,10 +339,11 @@ uvm_pageout(void *arg) void uvm_aiodone_daemon(void *arg) { - int s, free; + int s, npages; struct buf *bp, *nbp; uvm.aiodoned_proc = curproc; + KERNEL_UNLOCK(); for (;;) { /* @@ -357,11 +358,13 @@ uvm_aiodone_daemon(void *arg) TAILQ_INIT(&uvm.aio_done); mtx_leave(&uvm.aiodoned_lock); + /* process each i/o that's done. */ - free = uvmexp.free; + npages = 0; + KERNEL_LOCK(); while (bp != NULL) { if (bp->b_flags & B_PDAEMON) { - uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT; + npages += bp->b_bufsize >> PAGE_SHIFT; } nbp = TAILQ_NEXT(bp, b_freelist); s = splbio(); /* b_iodone must by called at splbio */ @@ -371,8 +374,11 @@ uvm_aiodone_daemon(void *arg) sched_pause(yield); } + KERNEL_UNLOCK(); + uvm_lock_fpageq(); - wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon : + atomic_add_int(&uvmexp.paging, -npages); + wakeup(uvmexp.free <= uvmexp.reserve_kernel ? 
&uvm.pagedaemon : &uvmexp.free); uvm_unlock_fpageq(); } @@ -776,7 +782,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc * */ if (result == VM_PAGER_PEND) { - uvmexp.paging += npages; + atomic_add_int(&uvmexp.paging, npages); uvm_lock_pageq(); uvmexp.pdpending++; if (p) { Index: uvm/uvmexp.h =================================================================== RCS file: /cvs/src/sys/uvm/uvmexp.h,v diff -u -p -r1.15 uvmexp.h --- uvm/uvmexp.h 1 May 2024 12:54:27 -0000 1.15 +++ uvm/uvmexp.h 9 Nov 2024 11:11:44 -0000 @@ -60,7 +60,7 @@ struct uvmexp { int free; /* [F] number of free pages */ int active; /* [L] # of active pages */ int inactive; /* [L] # of pages that we free'd but may want back */ - int paging; /* number of pages in the process of being paged out */ + int paging; /* [a] # of pages in the process of being paged out */ int wired; /* number of wired pages */ int zeropages; /* [F] number of zero'd pages */
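
For illustration only, a minimal standalone sketch of the changed
"shortage" calculation (userland C with made-up numbers, not part of
the diff): with pages already queued for pageout counted as in flight,
the page daemon no longer treats them as missing and does not push
additional pages to swap while they complete.

#include <stdio.h>

#define BUFPAGES_DEFICIT 0	/* assume no buffer cache deficit */

int
main(void)
{
	int freetarg = 2048;	/* free page target (made-up value) */
	int free = 512;		/* pages currently free (made-up value) */
	int paging = 1024;	/* pages already being written to swap */

	/* old: in-flight pages are ignored, so the shortage is inflated */
	int old_shortage = freetarg - free + BUFPAGES_DEFICIT;

	/* new: pages already queued for pageout count against the shortage */
	int new_shortage = freetarg - (free + paging) + BUFPAGES_DEFICIT;

	printf("old shortage: %d pages\n", old_shortage);	/* 1536 */
	printf("new shortage: %d pages\n", new_shortage);	/* 512 */
	return 0;
}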