From: Mateusz Guzik <mjguzik@gmail.com>
Subject: [PATCH] convert mpl ticket lock to anderson's lock
To: mpi@grenadille.net
Cc: tech@openbsd.org, Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 27 Dec 2025 06:24:36 +0100

Example microbenchmark calling fstat(2) from 8 threads (ops/s):
before: 2305501
after: 5891611 (+155%)
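
For reference, a userspace harness along these lines reproduces the workload.
This is an illustrative sketch only: the file being stat'ed, the 5 second
window and the bare per-thread counters (which would want cache-line padding
in a serious run) are arbitrary choices, not necessarily what produced the
numbers above.

#include <sys/stat.h>

#include <err.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NTHREADS	8
#define SECONDS		5

static atomic_bool done;

static void *
worker(void *arg)
{
	unsigned long long *ops = arg;
	struct stat sb;
	int fd;

	if ((fd = open("/etc/passwd", O_RDONLY)) == -1)
		err(1, "open");
	/* Hammer fstat(2) on our own descriptor until told to stop. */
	while (!atomic_load_explicit(&done, memory_order_relaxed)) {
		if (fstat(fd, &sb) == -1)
			err(1, "fstat");
		(*ops)++;
	}
	close(fd);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[NTHREADS];
	unsigned long long ops[NTHREADS] = { 0 }, total = 0;
	int i;

	for (i = 0; i < NTHREADS; i++)
		if (pthread_create(&tid[i], NULL, worker, &ops[i]) != 0)
			errx(1, "pthread_create");
	sleep(SECONDS);
	atomic_store(&done, true);
	for (i = 0; i < NTHREADS; i++) {
		pthread_join(tid[i], NULL);
		total += ops[i];
	}
	printf("%llu ops/s\n", total / SECONDS);
	return (0);
}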

Booted with WITNESS et al.; survived several kernel builds without problems.

Needs more testing, of course.
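
For anyone who wants the shape of the primitive before reading the diff, here
is a minimal userspace sketch of an Anderson (array) lock. The names and the
slot count are illustrative, not the kernel's, and the real code below also
has to deal with recursion (mplc_depth) and per-CPU bookkeeping. The
power-of-two CTASSERT matters because the dispenser is a plain wrapping
32-bit u_int: the handed-out slot sequence stays contiguous across the 2^32
wrap only if the slot count divides 2^32.

#include <stdatomic.h>
#include <stdbool.h>

#define NSLOTS	64			/* must be a power of two */

struct alock_slot {
	atomic_bool	taken;
} __attribute__((aligned(64)));		/* one flag per cache line */

struct alock {
	struct alock_slot	slots[NSLOTS];
	atomic_uint		next;	/* wrapping slot dispenser */
};

static void
alock_init(struct alock *l)
{
	unsigned i;

	/* Slot 0 starts free, every other slot starts taken. */
	atomic_store(&l->slots[0].taken, false);
	for (i = 1; i < NSLOTS; i++)
		atomic_store(&l->slots[i].taken, true);
	atomic_store(&l->next, 0);
}

/* Returns the slot index; the caller hands it back to alock_unlock(). */
static unsigned
alock_lock(struct alock *l)
{
	unsigned idx;

	/*
	 * fetch_add wraps modulo 2^32; because NSLOTS divides 2^32,
	 * the modulo below still yields a contiguous FIFO sequence of
	 * slots across the wrap.
	 */
	idx = atomic_fetch_add(&l->next, 1) % NSLOTS;

	/* Spin on our own slot only: no shared line to ping-pong on. */
	while (atomic_load_explicit(&l->slots[idx].taken, memory_order_acquire))
		;
	return (idx);
}

static void
alock_unlock(struct alock *l, unsigned idx)
{
	/* Re-arm our slot for its next user, then pass the baton. */
	atomic_store_explicit(&l->slots[idx].taken, true, memory_order_relaxed);
	atomic_store_explicit(&l->slots[(idx + 1) % NSLOTS].taken, false,
	    memory_order_release);
}

The kernel version avoids passing the slot index back to the unlock side by
stashing it in the per-CPU state (mplc_index).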

---
 sys/kern/kern_lock.c | 60 ++++++++++++++++++++++++++++++++++++--------
 sys/sys/mplock.h     | 17 +++++++++----
 2 files changed, 61 insertions(+), 16 deletions(-)

diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
index fe75b7ae895..81b8b950c05 100644
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@@ -109,16 +109,42 @@ _kernel_lock_held(void)
 
 #ifdef __USE_MI_MPLOCK
 
-/* Ticket lock implementation */
+/*
+ * Anderson's lock implementation
+ *
+ * This is a stop-gap primitive providing FIFO ordering while damage-controlling
+ * performance loss under contention.
+ *
+ * BUGS:
+ * - there is no awareness of virtualized environments, where lock owner or one
+ *   of the waiters can be on a vcpu preempted by the host, leading to time
+ *   waste spinning instead of yielding
+ * - poor NUMA performance, to be addressed after it becomes a factor -- at the
+ *   moment the kernel does not have NUMA-awareness in the first place
+ *
+ * Incomplete list of potential alternatives:
+ * - ticket lock -- centralized spinning, bad scalability
+ * - CLH -- viable other choice, not picked since Anderson's lock was good enough.
+ * - MCS -- less problematic on NUMA as CPUs retain their nodes, but this comes at
+ *          a cost of more expensive unlock
+ */
 
 #include <machine/cpu.h>
 
+/*
+ * Making sure MAXCPUS is a power of 2 allows mplc_index to wrap around to 0
+ * and still have the expected value.
+ */
+CTASSERT(powerof2(MAXCPUS));
+
 void
 ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
 {
 	memset(mpl->mpl_cpus, 0, sizeof(mpl->mpl_cpus));
-	mpl->mpl_users = 0;
-	mpl->mpl_ticket = 1;
+	mpl->mpl_locks[0].taken = false;
+	for (int i = 1; i < MAXCPUS; i++)
+		mpl->mpl_locks[i].taken = true;
+	mpl->mpl_index = 0;
 
 #ifdef WITNESS
 	mpl->mpl_lock_obj.lo_name = type->lt_name;
@@ -131,15 +157,21 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
 }
 
 static __inline void
-__mp_lock_spin(struct __mp_lock *mpl, u_int me)
+__mp_lock_spin(struct __mp_lock *mpl, u_int index)
 {
+	struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()];
 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
 #ifdef MP_LOCKDEBUG
 	long nticks = __mp_lock_spinout;
 #endif
 
+	if (!READ_ONCE(mpl->mpl_locks[index].taken)) {
+		cpu->mplc_owned = true;
+		return;
+	}
+
 	spc->spc_spinning++;
-	while (mpl->mpl_ticket != me) {
+	do {
 		CPU_BUSY_CYCLE();
 
 #ifdef MP_LOCKDEBUG
@@ -149,8 +181,10 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
 			nticks = __mp_lock_spinout;
 		}
 #endif
-	}
+	} while (READ_ONCE(mpl->mpl_locks[index].taken));
+
 	spc->spc_spinning--;
+	cpu->mplc_owned = true;
 }
 
 void
@@ -167,10 +201,10 @@ __mp_lock(struct __mp_lock *mpl)
 
 	s = intr_disable();
 	if (cpu->mplc_depth++ == 0)
-		cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users);
+		cpu->mplc_index = (atomic_inc_int_nv(&mpl->mpl_index) - 1) % MAXCPUS;
 	intr_restore(s);
 
-	__mp_lock_spin(mpl, cpu->mplc_ticket);
+	__mp_lock_spin(mpl, cpu->mplc_index);
 	membar_enter_after_atomic();
 
 	WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE);
@@ -193,8 +227,10 @@ __mp_unlock(struct __mp_lock *mpl)
 
 	s = intr_disable();
 	if (--cpu->mplc_depth == 0) {
+		WRITE_ONCE(mpl->mpl_locks[cpu->mplc_index].taken, true);
 		membar_exit();
-		mpl->mpl_ticket++;
+		WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
+		cpu->mplc_owned = false;
 	}
 	intr_restore(s);
 }
@@ -216,8 +252,10 @@ __mp_release_all(struct __mp_lock *mpl)
 		WITNESS_UNLOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE);
 #endif
 	cpu->mplc_depth = 0;
+	WRITE_ONCE(mpl->mpl_locks[cpu->mplc_index].taken, true);
 	membar_exit();
-	mpl->mpl_ticket++;
+	WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
+	cpu->mplc_owned = false;
 	intr_restore(s);
 
 	return (rv);
@@ -235,7 +273,7 @@ __mp_lock_held(struct __mp_lock *mpl, struct cpu_info *ci)
 {
 	struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[CPU_INFO_UNIT(ci)];
 
-	return (cpu->mplc_ticket == mpl->mpl_ticket && cpu->mplc_depth > 0);
+	return (cpu->mplc_owned);
 }
 
 #endif /* __USE_MI_MPLOCK */
diff --git a/sys/sys/mplock.h b/sys/sys/mplock.h
index b28ac240d94..ccbc257bf14 100644
--- a/sys/sys/mplock.h
+++ b/sys/sys/mplock.h
@@ -33,19 +33,26 @@
 
 #include <sys/_lock.h>
 
+#define CACHELINESIZE 64 /* FIXME */
+
 struct __mp_lock_cpu {
-	u_int			mplc_ticket;
+	u_int			mplc_index;
 	u_int			mplc_depth;
-};
+	bool			mplc_owned;
+} __aligned(CACHELINESIZE);
+
+struct __mp_lock_state {
+	bool			taken;
+} __aligned(CACHELINESIZE);
 
 struct __mp_lock {
 	struct __mp_lock_cpu	mpl_cpus[MAXCPUS];
-	volatile u_int		mpl_ticket;
-	u_int			mpl_users;
+	struct __mp_lock_state	mpl_locks[MAXCPUS];
+	volatile u_int		mpl_index __aligned(CACHELINESIZE);
 #ifdef WITNESS
 	struct lock_object	mpl_lock_obj;
 #endif
-};
+} __aligned(CACHELINESIZE);
 
 void	___mp_lock_init(struct __mp_lock *, const struct lock_type *);
 void	__mp_lock(struct __mp_lock *);
-- 
2.44.0