From: Martin Pieuchot
Subject: mtx_enter() fix for huge systems
To: tech@openbsd.org
Date: Mon, 19 May 2025 16:28:31 +0200

Last May SCHED_LOCK() was converted to a mutex.  This change exposed,
on big arm64 machines like the Ampere Altra, a flaw in the MI mutex
(spinlock).  In July the weakness was partially fixed by using ARMv8.1
LSE instructions such as CAS.  Sadly the issue is still present and can
hang the machine as soon as a given mutex becomes too contended.

The diff below contains two changes to mitigate cache line contention;
with it my 80-CPU Ampere Altra is now stable:

- Always read the value just before issuing an atomic operation.  It
  might have changed during splraise() and we want to avoid locking the
  cache line.  A similar change was recently made to rwlocks.

- Change the busy loop used when waiting for a contended lock.  Instead
  of wasting a single "busy cycle" per iteration, increase the number
  of cycles exponentially (see the sketch after this list).  The min &
  max numbers for this exponential backoff have been chosen carefully
  so as not to waste more cycles than in -current.  I even observed a
  small reduction of %sys time on my amd64 with 24 CPUs.
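To make this concrete, here is a minimal userland sketch of the
resulting acquire path (illustration only, not part of the diff).  It
uses C11 atomics in place of atomic_cas_ptr() and an empty asm
statement in place of CPU_BUSY_CYCLE(); "struct lock", try_lock() and
spin_lock() are made-up names, the real code is in the diff below.

#include <stdatomic.h>
#include <stddef.h>

#define CPU_MIN_BUSY_CYCLES	1
#define CPU_MAX_BUSY_CYCLES	64

struct lock {
	_Atomic(void *) owner;
};

static int
try_lock(struct lock *l, void *self)
{
	void *expected = NULL;

	/* Read first: only issue the CAS when the lock looks free. */
	if (atomic_load_explicit(&l->owner, memory_order_relaxed) != NULL)
		return 0;
	return atomic_compare_exchange_strong(&l->owner, &expected, self);
}

static void
spin_lock(struct lock *l, void *self)
{
	unsigned int i, ncycle = CPU_MIN_BUSY_CYCLES;

	while (!try_lock(l, self)) {
		do {
			/* Busy loop with exponential backoff. */
			for (i = ncycle; i > 0; i--)
				__asm volatile("" ::: "memory"); /* stand-in for CPU_BUSY_CYCLE() */
			if (ncycle < CPU_MAX_BUSY_CYCLES)
				ncycle += ncycle;
		} while (atomic_load_explicit(&l->owner,
		    memory_order_relaxed) != NULL);
	}
}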
This diff also removes the MP_LOCKDEBUG goo for mutexes.  I believe
that nowadays it creates more false positives than useful reports, so I
didn't want to adapt it to the new busy loop.  I'd argue that we should
get rid of the option completely and build something else like
lockstat(8).

I agree with Claudio's earlier comment: an MCS lock implementation for
kernel mutexes would be welcome.  That said, the change below should be
enough to unstick my uvm_purge() diff and has benefits on its own.

ok?

Index: kern/kern_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_lock.c,v
diff -u -p -r1.75 kern_lock.c
--- kern/kern_lock.c	3 Jul 2024 01:36:50 -0000	1.75
+++ kern/kern_lock.c	19 May 2025 14:19:02 -0000
@@ -42,6 +42,21 @@ int __mp_lock_spinout = INT_MAX;
 struct __mp_lock kernel_lock;
 
 /*
+ * Min & max numbers of "busy cycles" to waste before trying again to
+ * acquire a contended lock using an atomic operation.
+ *
+ * The min number must be as small as possible to not introduce extra
+ * latency.  It also doesn't matter if the first steps of an exponential
+ * backoff are small.
+ *
+ * The max number is used to cap the exponential backoff.  It should
+ * be small enough to not waste too many cycles in %sys time and big
+ * enough to reduce (ideally avoid) cache line contention.
+ */
+#define CPU_MIN_BUSY_CYCLES	1
+#define CPU_MAX_BUSY_CYCLES	64
+
+/*
  * Functions for manipulating the kernel_lock.  We put them here
  * so that they show up in profiles.
  */
@@ -228,9 +243,7 @@ void
 mtx_enter(struct mutex *mtx)
 {
 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
-#ifdef MP_LOCKDEBUG
-	int nticks = __mp_lock_spinout;
-#endif
+	unsigned int i, ncycle = CPU_MIN_BUSY_CYCLES;
 
 	WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx),
 	    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
@@ -238,15 +251,11 @@ mtx_enter(struct mutex *mtx)
 	spc->spc_spinning++;
 	while (mtx_enter_try(mtx) == 0) {
 		do {
-			CPU_BUSY_CYCLE();
-#ifdef MP_LOCKDEBUG
-			if (--nticks == 0) {
-				db_printf("%s: %p lock spun out\n",
-				    __func__, mtx);
-				db_enter();
-				nticks = __mp_lock_spinout;
-			}
-#endif
+			/* Busy loop with exponential backoff. */
+			for (i = ncycle; i > 0; i--)
+				CPU_BUSY_CYCLE();
+			if (ncycle < CPU_MAX_BUSY_CYCLES)
+				ncycle += ncycle;
 		} while (mtx->mtx_owner != NULL);
 	}
 	spc->spc_spinning--;
@@ -265,20 +274,27 @@ mtx_enter_try(struct mutex *mtx)
 	if (mtx->mtx_wantipl != IPL_NONE)
 		s = splraise(mtx->mtx_wantipl);
 
-	owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
+	/*
+	 * Avoid unconditional atomic operation to prevent cache line
+	 * contention.
+	 */
+	owner = mtx->mtx_owner;
 #ifdef DIAGNOSTIC
 	if (__predict_false(owner == ci))
 		panic("mtx %p: locking against myself", mtx);
 #endif
 	if (owner == NULL) {
-		membar_enter_after_atomic();
-		if (mtx->mtx_wantipl != IPL_NONE)
-			mtx->mtx_oldipl = s;
+		owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
+		if (owner == NULL) {
+			membar_enter_after_atomic();
+			if (mtx->mtx_wantipl != IPL_NONE)
+				mtx->mtx_oldipl = s;
 #ifdef DIAGNOSTIC
-		ci->ci_mutex_level++;
+			ci->ci_mutex_level++;
 #endif
-		WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
-		return (1);
+			WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
+			return (1);
+		}
 	}
 
 	if (mtx->mtx_wantipl != IPL_NONE)
@@ -353,6 +369,7 @@ void
 db_mtx_enter(struct db_mutex *mtx)
 {
 	struct cpu_info *ci = curcpu(), *owner;
+	unsigned int i, ncycle = CPU_MIN_BUSY_CYCLES;
 	unsigned long s;
 
 #ifdef DIAGNOSTIC
@@ -361,12 +378,22 @@ db_mtx_enter(struct db_mutex *mtx)
 #endif
 
 	s = intr_disable();
 
 	for (;;) {
-		owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
-		if (owner == NULL)
-			break;
-		CPU_BUSY_CYCLE();
+		/*
+		 * Avoid unconditional atomic operation to prevent cache
+		 * line contention.
+		 */
+		owner = mtx->mtx_owner;
+		if (owner == NULL) {
+			owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
+			if (owner == NULL)
+				break;
+		}
+		/* Busy loop with exponential backoff. */
+		for (i = ncycle; i > 0; i--)
+			CPU_BUSY_CYCLE();
+		if (ncycle < CPU_MAX_BUSY_CYCLES)
+			ncycle += ncycle;
 	}
 
 	membar_enter_after_atomic();