[POC] performance loss due to inefficiency of kernel lock implementation + kernel build profile a
On Sun, Dec 21, 2025 at 9:15 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> I. kernel lock implementation
>
> Of course the fact that the lock exists in the first place and the
> extent to which it is used are a problem in their own right.
>
> But even then, performance is below what it can be in that setting.
>
> There are 2 angles to it:
>
> 1. it is implemented as a ticket lock, and ticket locks don't scale
> because every waiter constantly polls the same centralized location
> 2. even with that in mind, performance is further hurt by false
> sharing of the state:
>
> struct __mp_lock_cpu {
> u_int mplc_ticket;
> u_int mplc_depth;
> };
>
> struct __mp_lock {
> struct __mp_lock_cpu mpl_cpus[MAXCPUS];
> volatile u_int mpl_ticket;
> u_int mpl_users;
> #ifdef WITNESS
> struct lock_object mpl_lock_obj;
> #endif
> };
>
> Assuming a given __mp_lock is 64-byte aligned, and with each struct
> __mp_lock_cpu taking 8 bytes, this means 8 cpus have their state in the
> same cacheline.
>
> Out of curiosity I implemented it on top of Anderson's lock instead and
> managed to get a nice scalability speedup in microbenchmarks for little
> effort. The patch is a POC and is included for reference; I have no
> interest in productizing it and I'm not claiming this is the best
> choice -- perhaps CLH would be a better pick. Note that MCS is more
> expensive to unlock. This is very much NUMA-oblivious, but that is
> probably not a real factor for the time being in this context.
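
For illustration, here is a minimal userland sketch of the array-based
(Anderson) lock idea, separate from the kernel patch further down -- the
names (alock, NSLOTS) are made up, it uses C11 atomics, and it assumes a
power-of-2 slot count and 64-byte cachelines:

/*
 * Minimal sketch of an Anderson (array-based queue) lock.  Each waiter
 * spins on its own cacheline-sized slot instead of one shared counter.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NSLOTS		8	/* max number of competing cpus/threads */
#define CACHELINESIZE	64

struct alock_slot {
	atomic_bool taken;
} __attribute__((aligned(CACHELINESIZE)));

struct alock {
	struct alock_slot slots[NSLOTS];
	atomic_uint next;	/* ever-increasing slot dispenser */
};

static void
alock_init(struct alock *l)
{
	/* Slot 0 starts free, everything else starts taken. */
	atomic_store(&l->slots[0].taken, false);
	for (int i = 1; i < NSLOTS; i++)
		atomic_store(&l->slots[i].taken, true);
	atomic_store(&l->next, 0);
}

static unsigned int
alock_lock(struct alock *l)
{
	/* Grab a private slot and spin only on that slot. */
	unsigned int idx = atomic_fetch_add(&l->next, 1) % NSLOTS;

	while (atomic_load_explicit(&l->slots[idx].taken, memory_order_acquire))
		;	/* a real lock would pause/backoff here */
	return (idx);
}

static void
alock_unlock(struct alock *l, unsigned int idx)
{
	/* Re-arm our slot for future wraparound users, then hand off. */
	atomic_store(&l->slots[idx].taken, true);
	atomic_store_explicit(&l->slots[(idx + 1) % NSLOTS].taken, false,
	    memory_order_release);
}

alock_lock() returns the slot index and the caller passes it back to
alock_unlock(); the kernel patch below stashes that index in the per-cpu
state instead.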
>
> In order to bench this I ran fstat() in a loop over different files,
> which globally serializes on the kernel lock, with 8 different processes
> doing the work. This is on amd64.
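
The loop was roughly of this shape (a sketch, not the exact harness; the
paths are made up and the timing/reporting part is omitted):

/* 8 forked processes, each fstat()ing its own open file in a tight loop. */
#include <sys/stat.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char path[64];
	struct stat sb;
	int fd;

	for (int i = 0; i < 8; i++) {
		if (fork() != 0)
			continue;
		/* child: each process operates on a different file */
		snprintf(path, sizeof(path), "/tmp/fstat-bench.%d", i);
		if ((fd = open(path, O_CREAT | O_RDONLY, 0644)) == -1)
			err(1, "open");
		for (;;) {
			if (fstat(fd, &sb) == -1)
				err(1, "fstat");
		}
	}
	/* the real harness times the children and reports ops/s */
	pause();
	return (0);
}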
>
> Results are (ops/s):
> before: 2305501
> after: 5891611 (+155%)
>
> As a side note, sticking to the ticket lock but guaranteeing separate
> state locations bumps the rate to 3161182 ops/s.
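
By separate state locations I mean roughly the following, i.e. padding
each cpu's slot to a full line so 8 cpus stop sharing one (same __aligned
trick as in the patch below, cacheline size assumed to be 64):

/* Ticket lock kept as-is, but each cpu's state gets its own cacheline. */
struct __mp_lock_cpu {
	u_int mplc_ticket;
	u_int mplc_depth;
} __aligned(64);	/* 64 == assumed CACHELINESIZE */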
>
>
Now that I look at it, I think Anderson's lock is a perfectly sane choice to
replace the ticket lock for the foreseeable future.
The implementation below needs some cosmetic cleanups plus a bit of
commentary.
It depends on MAXCPUS being a power of 2, which can be checked at
compilation time and which, per my grep, is not a real problem.
The dependency stems from the behavior of the mpl_index counter on
wraparound -- after the u_int overflows it has to keep yielding the same
values modulo MAXCPUS as a non-wrapped counter would. However, worst case
this can just get a branch to avoid the wraparound in the first place.
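
Concretely, something along these lines would do -- the compile-time check,
plus (should the power-of-2 requirement ever become a problem) a CAS loop
that wraps the counter by hand. mpl_next_index is a made-up name and none
of this is tested:

/* Make the power-of-2 assumption explicit at compile time. */
CTASSERT(MAXCPUS > 0 && (MAXCPUS & (MAXCPUS - 1)) == 0);

/*
 * Branch-based alternative: wrap mpl_index manually instead of relying on
 * u_int overflow being congruent modulo MAXCPUS.
 */
static __inline u_int
mpl_next_index(struct __mp_lock *mpl)
{
	u_int old, new;

	do {
		old = mpl->mpl_index;
		new = (old + 1 == MAXCPUS) ? 0 : old + 1;
	} while (atomic_cas_uint(&mpl->mpl_index, old, new) != old);

	return (old);
}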
The real question is what to do about CACHELINESIZE. Currently it is
defined in percpu.h, but that's clearly the wrong place. Perhaps cleaning
that up can be considered beyond the scope, and this code can also just
define it to 64 for the time being; percpu.h itself can't be included here
due to immediate build failures.
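That is, something like this in mplock.h, pending a proper MI home for the
define:

/* Stopgap; percpu.h currently carries its own CACHELINESIZE. */
#ifndef CACHELINESIZE
#define CACHELINESIZE	64
#endif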
I can do the tidy-ups if there is interest in getting this in.
> POC Anderson's lock (warning: assumes power-of-2 MAXCPUS and 64-byte
> cachelines):
>
> diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
> index fe75b7ae895..2a491271509 100644
> --- a/sys/kern/kern_lock.c
> +++ b/sys/kern/kern_lock.c
> @@ -117,8 +117,10 @@ void
> ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
> {
> memset(mpl->mpl_cpus, 0, sizeof(mpl->mpl_cpus));
> - mpl->mpl_users = 0;
> - mpl->mpl_ticket = 1;
> + mpl->mpl_locks[0].taken = false;
> + for (int i = 1; i < MAXCPUS; i++)
> + mpl->mpl_locks[i].taken = true;
> + mpl->mpl_index = 0;
>
> #ifdef WITNESS
> mpl->mpl_lock_obj.lo_name = type->lt_name;
> @@ -131,7 +133,7 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
> }
>
> static __inline void
> -__mp_lock_spin(struct __mp_lock *mpl, u_int me)
> +__mp_lock_spin(struct __mp_lock *mpl, u_int index)
> {
> struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> #ifdef MP_LOCKDEBUG
> @@ -139,7 +141,7 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> #endif
>
> spc->spc_spinning++;
> - while (mpl->mpl_ticket != me) {
> + while (READ_ONCE(mpl->mpl_locks[index].taken)) {
> CPU_BUSY_CYCLE();
>
> #ifdef MP_LOCKDEBUG
> @@ -151,6 +153,8 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> #endif
> }
> spc->spc_spinning--;
> + struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()];
> + cpu->mplc_owned = true;
> }
>
> void
> @@ -167,11 +171,12 @@ __mp_lock(struct __mp_lock *mpl)
>
> s = intr_disable();
> if (cpu->mplc_depth++ == 0)
> - cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users);
> + cpu->mplc_index = (atomic_inc_int_nv(&mpl->mpl_index) - 1) % MAXCPUS;
> intr_restore(s);
>
> - __mp_lock_spin(mpl, cpu->mplc_ticket);
> + __mp_lock_spin(mpl, cpu->mplc_index);
> membar_enter_after_atomic();
> +// WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
>
> WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE);
> }
> @@ -194,7 +199,9 @@ __mp_unlock(struct __mp_lock *mpl)
> s = intr_disable();
> if (--cpu->mplc_depth == 0) {
> membar_exit();
> - mpl->mpl_ticket++;
> + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
> + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> + cpu->mplc_owned = false;
> }
> intr_restore(s);
> }
> @@ -217,7 +224,9 @@ __mp_release_all(struct __mp_lock *mpl)
> #endif
> cpu->mplc_depth = 0;
> membar_exit();
> - mpl->mpl_ticket++;
> + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
> + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> + cpu->mplc_owned = false;
> intr_restore(s);
>
> return (rv);
> @@ -235,7 +244,7 @@ __mp_lock_held(struct __mp_lock *mpl, struct cpu_info *ci)
> {
> struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[CPU_INFO_UNIT(ci)];
>
> - return (cpu->mplc_ticket == mpl->mpl_ticket && cpu->mplc_depth > 0);
> + return (cpu->mplc_owned);
> }
>
> #endif /* __USE_MI_MPLOCK */
> diff --git a/sys/sys/mplock.h b/sys/sys/mplock.h
> index b28ac240d94..9a6beb65658 100644
> --- a/sys/sys/mplock.h
> +++ b/sys/sys/mplock.h
> @@ -33,19 +33,26 @@
>
> #include <sys/_lock.h>
>
> +#define CACHELINESIZE 64 /* FIXME */
> +
> struct __mp_lock_cpu {
> - u_int mplc_ticket;
> + u_int mplc_index;
> u_int mplc_depth;
> -};
> + bool mplc_owned;
> +} __aligned(CACHELINESIZE);
> +
> +struct __mp_lock_l {
> + bool taken;
> +} __aligned(CACHELINESIZE);
>
> struct __mp_lock {
> struct __mp_lock_cpu mpl_cpus[MAXCPUS];
> - volatile u_int mpl_ticket;
> - u_int mpl_users;
> + struct __mp_lock_l mpl_locks[MAXCPUS];
> + volatile u_int mpl_index __aligned(CACHELINESIZE);
> #ifdef WITNESS
> struct lock_object mpl_lock_obj;
> #endif
> -};
> +} __aligned(CACHELINESIZE);
>
> void ___mp_lock_init(struct __mp_lock *, const struct lock_type *);
> void __mp_lock(struct __mp_lock *);
>
>