Download raw body.
Fix swapping for MP archs.
On 09/11/24(Sat) 13:23, Martin Pieuchot wrote:
> The current swapping algorithm is broken for MP architectures. However
> the bouncing logic, and its sleeping point, in uvm_swap_io() hides it.
>
> In order to see the degenerative behavior you need:
>
> - A MP machine with more than 2 CPUs
> - No high/low memory split and a lot of memory
> - Disable swap encryption (sysctl vm.swapencrypt.enable=0)
>
> When all these criteria are met the kernel ends up doing ping-pong for
> every buffer between the page daemon, the aiodone daemon and the disk
> interrupt handler, all currently needing the KERNEL_LOCK().
>
> To fix this degenerative behavior, where the page daemon is starving the
> aiodone daemon and interrupt handler, we need to account for in-flight
> pages. Diff below does that by accounting for pages being currently
> written to disk in the "shortage" calculation.
>
> This diff makes swapping work as expected, no degenerative behavior, no
> over-swapping on MP machine with the above mentioned specification. I
> believe it finally fixes swapping for 64-bit archs with a ton of memory.
>
> Note that I deliberately did not use atomic_load_int() for reading
> `uvmexp.paging'. I do not want to introduce too many differences with
> NetBSD for the moment and only a single thread (the page daemon) is
> reading the counter.
>
> This diff is a requirement for in-place encryption which, obviously,
> does not use bouncing.
>
> While here, release the KERNEL_LOCK() in the aiodone daemon when not
> needed.
ok?
> Index: uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> diff -u -p -r1.129 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129
> +++ uvm/uvm_pdaemon.c 9 Nov 2024 11:17:29 -0000
> @@ -232,7 +232,7 @@ uvm_pageout(void *arg)
> long size;
>
> uvm_lock_fpageq();
> - if (TAILQ_EMPTY(&uvm.pmr_control.allocs)) {
> + if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) {
> msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM,
> "pgdaemon", INFSLP);
> uvmexp.pdwoke++;
> @@ -245,7 +245,8 @@ uvm_pageout(void *arg)
> constraint = no_constraint;
> }
> /* How many pages do we need to free during this round? */
> - shortage = uvmexp.freetarg - uvmexp.free + BUFPAGES_DEFICIT;
> + shortage = uvmexp.freetarg -
> + (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT;
> uvm_unlock_fpageq();
>
> /*
> @@ -302,8 +303,7 @@ uvm_pageout(void *arg)
> * wake up any waiters.
> */
> uvm_lock_fpageq();
> - if (uvmexp.free > uvmexp.reserve_kernel ||
> - uvmexp.paging == 0) {
> + if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) {
> wakeup(&uvmexp.free);
> }
>
> @@ -339,10 +339,11 @@ uvm_pageout(void *arg)
> void
> uvm_aiodone_daemon(void *arg)
> {
> - int s, free;
> + int s, npages;
> struct buf *bp, *nbp;
>
> uvm.aiodoned_proc = curproc;
> + KERNEL_UNLOCK();
>
> for (;;) {
> /*
> @@ -357,11 +358,13 @@ uvm_aiodone_daemon(void *arg)
> TAILQ_INIT(&uvm.aio_done);
> mtx_leave(&uvm.aiodoned_lock);
>
> +
> /* process each i/o that's done. */
> - free = uvmexp.free;
> + npages = 0;
> + KERNEL_LOCK();
> while (bp != NULL) {
> if (bp->b_flags & B_PDAEMON) {
> - uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT;
> + npages += bp->b_bufsize >> PAGE_SHIFT;
> }
> nbp = TAILQ_NEXT(bp, b_freelist);
> s = splbio(); /* b_iodone must by called at splbio */
> @@ -371,8 +374,11 @@ uvm_aiodone_daemon(void *arg)
>
> sched_pause(yield);
> }
> + KERNEL_UNLOCK();
> +
> uvm_lock_fpageq();
> - wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
> + atomic_add_int(&uvmexp.paging, -npages);
> + wakeup(uvmexp.free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
> &uvmexp.free);
> uvm_unlock_fpageq();
> }
> @@ -776,7 +782,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
> */
>
> if (result == VM_PAGER_PEND) {
> - uvmexp.paging += npages;
> + atomic_add_int(&uvmexp.paging, npages);
> uvm_lock_pageq();
> uvmexp.pdpending++;
> if (p) {
> Index: uvm/uvmexp.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> diff -u -p -r1.15 uvmexp.h
> --- uvm/uvmexp.h 1 May 2024 12:54:27 -0000 1.15
> +++ uvm/uvmexp.h 9 Nov 2024 11:11:44 -0000
> @@ -60,7 +60,7 @@ struct uvmexp {
> int free; /* [F] number of free pages */
> int active; /* [L] # of active pages */
> int inactive; /* [L] # of pages that we free'd but may want back */
> - int paging; /* number of pages in the process of being paged out */
> + int paging; /* [a] # of pages in the process of being paged out */
> int wired; /* number of wired pages */
>
> int zeropages; /* [F] number of zero'd pages */
>
>
Fix swapping for MP archs.