Index | Thread | Search

From:
Martin Pieuchot <mpi@grenadille.net>
Subject:
Re: Fix swapping for MP archs.
To:
tech@openbsd.org
Date:
Wed, 20 Nov 2024 10:17:48 +0100

Download raw body.

Thread
On 09/11/24(Sat) 13:23, Martin Pieuchot wrote:
> The current swapping algorithm is broken for MP architectures.  However
> the bouncing logic, and its sleeping point, in uvm_swap_io() hides it.
> 
> In order to see the degenerative behavior you need:
> 
> - An MP machine with more than 2 CPUs
> - No high/low memory split and a lot of memory
> - Disable swap encryption (sysctl vm.swapencrypt.enable=0)
> 
> When all these criteria are met the kernel ends up doing ping-pong for
> every buffer between the page daemon, the aiodone daemon and the disk
> interrupt handler, all currently needing the KERNEL_LOCK().
> 
> To fix this degenerative behavior, where the page daemon is starving the 
> aiodone daemon and interrupt handler, we need to account for in-flight
> pages.  Diff below does that by accounting for pages being currently
> written to disk in the "shortage" calculation.
> 
> This diff makes swapping work as expected, no degenerative behavior, no
> over-swapping on MP machines with the above mentioned specification.  I
> believe it finally fixes swapping for 64-bit archs with a ton of memory.
> 
> Note that I deliberately did not use atomic_load_int() for reading
> `uvmexp.paging'.  I do not want to introduce too many differences with
> NetBSD for the moment and only a single thread (the page daemon) is
> reading the counter.
> 
> This diff is a requirement for in-place encryption which, obviously,
> does not use bouncing.
> 
> While here, release the KERNEL_LOCK() in the aiodone daemon when not
> needed.

ok?

> Index: uvm/uvm_pdaemon.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
> diff -u -p -r1.129 uvm_pdaemon.c
> --- uvm/uvm_pdaemon.c	7 Nov 2024 10:46:52 -0000	1.129
> +++ uvm/uvm_pdaemon.c	9 Nov 2024 11:17:29 -0000
> @@ -232,7 +232,7 @@ uvm_pageout(void *arg)
>  		long size;
>  
>  		uvm_lock_fpageq();
> -		if (TAILQ_EMPTY(&uvm.pmr_control.allocs)) {
> +		if (TAILQ_EMPTY(&uvm.pmr_control.allocs) || uvmexp.paging > 0) {
>  			msleep_nsec(&uvm.pagedaemon, &uvm.fpageqlock, PVM,
>  			    "pgdaemon", INFSLP);
>  			uvmexp.pdwoke++;
> @@ -245,7 +245,8 @@ uvm_pageout(void *arg)
>  			constraint = no_constraint;
>  		}
>  		/* How many pages do we need to free during this round? */
> -		shortage = uvmexp.freetarg - uvmexp.free + BUFPAGES_DEFICIT;
> +		shortage = uvmexp.freetarg -
> +		    (uvmexp.free + uvmexp.paging) + BUFPAGES_DEFICIT;
>  		uvm_unlock_fpageq();
>  
>  		/*
> @@ -302,8 +303,7 @@ uvm_pageout(void *arg)
>  		 * wake up any waiters.
>  		 */
>  		uvm_lock_fpageq();
> -		if (uvmexp.free > uvmexp.reserve_kernel ||
> -		    uvmexp.paging == 0) {
> +		if (uvmexp.free > uvmexp.reserve_kernel || uvmexp.paging == 0) {
>  			wakeup(&uvmexp.free);
>  		}
>  
> @@ -339,10 +339,11 @@ uvm_pageout(void *arg)
>  void
>  uvm_aiodone_daemon(void *arg)
>  {
> -	int s, free;
> +	int s, npages;
>  	struct buf *bp, *nbp;
>  
>  	uvm.aiodoned_proc = curproc;
> +	KERNEL_UNLOCK();
>  
>  	for (;;) {
>  		/*
> @@ -357,11 +358,13 @@ uvm_aiodone_daemon(void *arg)
>  		TAILQ_INIT(&uvm.aio_done);
>  		mtx_leave(&uvm.aiodoned_lock);
>  
> +
>  		/* process each i/o that's done. */
> -		free = uvmexp.free;
> +		npages = 0;
> +		KERNEL_LOCK();
>  		while (bp != NULL) {
>  			if (bp->b_flags & B_PDAEMON) {
> -				uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT;
> +				npages += bp->b_bufsize >> PAGE_SHIFT;
>  			}
>  			nbp = TAILQ_NEXT(bp, b_freelist);
>  			s = splbio();	/* b_iodone must by called at splbio */
> @@ -371,8 +374,11 @@ uvm_aiodone_daemon(void *arg)
>  
>  			sched_pause(yield);
>  		}
> +		KERNEL_UNLOCK();
> +
>  		uvm_lock_fpageq();
> -		wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
> +		atomic_add_int(&uvmexp.paging, -npages);
> +		wakeup(uvmexp.free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
>  		    &uvmexp.free);
>  		uvm_unlock_fpageq();
>  	}
> @@ -776,7 +782,7 @@ uvmpd_scan_inactive(struct uvm_pmalloc *
>  		 */
>  
>  		if (result == VM_PAGER_PEND) {
> -			uvmexp.paging += npages;
> +			atomic_add_int(&uvmexp.paging, npages);
>  			uvm_lock_pageq();
>  			uvmexp.pdpending++;
>  			if (p) {
> Index: uvm/uvmexp.h
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvmexp.h,v
> diff -u -p -r1.15 uvmexp.h
> --- uvm/uvmexp.h	1 May 2024 12:54:27 -0000	1.15
> +++ uvm/uvmexp.h	9 Nov 2024 11:11:44 -0000
> @@ -60,7 +60,7 @@ struct uvmexp {
>  	int free;       /* [F] number of free pages */
>  	int active;     /* [L] # of active pages */
>  	int inactive;   /* [L] # of pages that we free'd but may want back */
> -	int paging;	/* number of pages in the process of being paged out */
> +	int paging;	/* [a] # of pages in the process of being paged out */
>  	int wired;      /* number of wired pages */
>  
>  	int zeropages;		/* [F] number of zero'd pages */
> 
>