Fix swapping for MP archs.
The current swapping algorithm is broken for MP architectures. However
the bouncing logic in uvm_swap_io(), and its sleeping point, hide it.
In order to see the degenerative behavior you need:
- An MP machine with more than 2 CPUs
- No high/low memory split and a lot of memory
- Disable swap encryption (sysctl vm.swapencrypt.enable=0)
When all these criteria are met, the kernel ends up playing ping-pong
with every buffer between the page daemon, the aiodone daemon and the
disk interrupt handler, all of which currently need the KERNEL_LOCK().
To fix this degenerative behavior, where the page daemon starves the
aiodone daemon and the interrupt handler, we need to account for
in-flight pages. The diff below does that by including pages currently
being written to disk in the "shortage" calculation.
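
To make the arithmetic concrete, here is a minimal user-space sketch
of the new "shortage" calculation. The field names follow struct
uvmexp, but the values and the BUFPAGES_DEFICIT stand-in are made up;
only the formula mirrors the diff below.

#include <stdio.h>

struct uvmexp_model {
	int freetarg;	/* target number of free pages */
	int free;	/* pages currently free */
	int paging;	/* pages in flight to the disk */
};

int
main(void)
{
	struct uvmexp_model uvmexp = {
		.freetarg = 512, .free = 200, .paging = 250
	};
	int deficit = 0;	/* stands in for BUFPAGES_DEFICIT */

	/* Before: in-flight pages are invisible, so the page daemon
	 * keeps queueing pages that are already on their way out. */
	int before = uvmexp.freetarg - uvmexp.free + deficit;

	/* After: in-flight pages count against the shortage, so the
	 * daemon goes back to sleep instead of over-swapping. */
	int after = uvmexp.freetarg -
	    (uvmexp.free + uvmexp.paging) + deficit;

	printf("shortage before: %d\n", before);	/* 312 */
	printf("shortage after:  %d\n", after);		/* 62 */
	return 0;
}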
This diff makes swapping work as expected: no degenerative behavior, no
over-swapping on MP machines with the above-mentioned specification. I
believe it finally fixes swapping for 64-bit archs with a ton of memory.
Note that I deliberately did not use atomic_load_int() for reading
`uvmexp.paging'. I do not want to introduce too many differences with
NetBSD for the moment and only a single thread (the page daemon) is
reading the counter.
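
As a stand-alone illustration of that pattern, here is a user-space
C11 model (not the kernel's atomic_add_int(); names and counts are
hypothetical): several threads retire pages with atomic adds while a
single reader observes the counter.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int paging;

static void *
retire(void *arg)
{
	int n = *(int *)arg;

	atomic_fetch_add(&paging, n);	/* pages queued for pageout */
	atomic_fetch_add(&paging, -n);	/* same pages completed */
	return NULL;
}

int
main(void)
{
	pthread_t t[4];
	int npages = 8, i;

	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, retire, &npages);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);

	/* Only this thread reads the counter, which is why the kernel
	 * code can get away with a plain load of `uvmexp.paging'. */
	printf("paging = %d\n", atomic_load(&paging));
	return 0;
}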
This diff is a requirement for in-place encryption which, obviously,
does not use bouncing.
While here, release the KERNEL_LOCK() in the aiodone daemon when not
needed.
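
The resulting pattern, modeled in user space with a pthread mutex
standing in for the KERNEL_LOCK() (all names here are hypothetical;
only the lock placement mirrors the diff):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t kernel_lock = PTHREAD_MUTEX_INITIALIZER;

static int
process_done_buffers(void)
{
	/* stands in for walking uvm.aio_done and calling b_iodone */
	return 8;	/* pretend 8 pagedaemon pages completed */
}

int
main(void)
{
	int iter, npages;

	/* the daemon now runs unlocked by default ... */
	for (iter = 0; iter < 3; iter++) {
		/* ... takes the lock only while the completion
		 * callbacks still require it ... */
		pthread_mutex_lock(&kernel_lock);
		npages = process_done_buffers();
		pthread_mutex_unlock(&kernel_lock);

		/* ... and does the accounting and wakeup outside it */
		printf("retired %d in-flight pages\n", npages);
	}
	return 0;
}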
ok?
Index: uvm/uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
diff -u -p -r1.129 uvm_pdaemon.c
--- uvm/uvm_pdaemon.c 7 Nov 2024 10:46:52 -0000 1.129
+++ uvm/uvm_pdaemon.c 9 Nov 2024 11:17:29 -0000
@@ -232,7 +232,7 @@ uvm_pageout(void *arg)
long size;
uvm_lock_fpageq();
- if (TAILQ_EMPTY(&uvm.pmr_control.allocs)) {
+ if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) {
msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM,
"pgdaemon", INFSLP);
uvmexp.pdwoke++;
@@ -245,7 +245,8 @@ uvm_pageout(void *arg)
constraint = no_constraint;
}
/* How many pages do we need to free during this round? */
- shortage = uvmexp.freetarg - uvmexp.free + BUFPAGES_DEFICIT;
+ shortage = uvmexp.freetarg -
+ (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT;
uvm_unlock_fpageq();
/*
@@ -302,8 +303,7 @@ uvm_pageout(void *arg)
* wake up any waiters.
*/
uvm_lock_fpageq();
- if (uvmexp.free > uvmexp.reserve_kernel ||
- uvmexp.paging == 0) {
+ if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) {
wakeup(&uvmexp.free);
}
@@ -339,10 +339,11 @@ uvm_pageout(void *arg)
void
uvm_aiodone_daemon(void *arg)
{
- int s, free;
+ int s, npages;
struct buf *bp, *nbp;
uvm.aiodoned_proc = curproc;
+ KERNEL_UNLOCK();
for (;;) {
/*
@@ -357,11 +358,13 @@ uvm_aiodone_daemon(void *arg)
TAILQ_INIT(&uvm.aio_done);
mtx_leave(&uvm.aiodoned_lock);
+
/* process each i/o that's done. */
- free = uvmexp.free;
+ npages = 0;
+ KERNEL_LOCK();
while (bp != NULL) {
if (bp->b_flags & B_PDAEMON) {
- uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT;
+ npages += bp->b_bufsize >> PAGE_SHIFT;
}
nbp = TAILQ_NEXT(bp, b_freelist);
s = splbio(); /* b_iodone must by called at splbio */
@@ -371,8 +374,11 @@ uvm_aiodone_daemon(void *arg)
sched_pause(yield);
}
+ KERNEL_UNLOCK();
+
uvm_lock_fpageq();
- wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
+ atomic_add_int(&uvmexp.paging, -npages);
+ wakeup(uvmexp.free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
&uvmexp.free);
uvm_unlock_fpageq();
}
@@ -776,7 +782,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
*/
if (result == VM_PAGER_PEND) {
- uvmexp.paging += npages;
+ atomic_add_int(&uvmexp.paging, npages);
uvm_lock_pageq();
uvmexp.pdpending++;
if (p) {
Index: uvm/uvmexp.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvmexp.h,v
diff -u -p -r1.15 uvmexp.h
--- uvm/uvmexp.h 1 May 2024 12:54:27 -0000 1.15
+++ uvm/uvmexp.h 9 Nov 2024 11:11:44 -0000
@@ -60,7 +60,7 @@ struct uvmexp {
int free; /* [F] number of free pages */
int active; /* [L] # of active pages */
int inactive; /* [L] # of pages that we free'd but may want back */
- int paging; /* number of pages in the process of being paged out */
+ int paging; /* [a] # of pages in the process of being paged out */
int wired; /* number of wired pages */
int zeropages; /* [F] number of zero'd pages */