[POC] performance loss due to inefficiency of kernel lock implementation + kernel build profile a
On 24/12/25(Wed) 01:26, Mateusz Guzik wrote:
> On Sun, Dec 21, 2025 at 9:15 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
> >
> > I. kernel lock implementation
> >
> > Of course the fact that the lock exists in the first place and the
> > extent to which it is used are a problem in their own right.
> >
> > But even then, performance is below what it can be in that setting.
> >
> > There are 2 angles to it:
> >
> > 1. it is implemented as a ticket lock, and ticket locks don't scale, as
> > every waiter constantly polls a single centralized location
> > 2. even with that in mind, performance is further hurt by false sharing
> > of the per-CPU state:
> >
> > struct __mp_lock_cpu {
> >         u_int mplc_ticket;
> >         u_int mplc_depth;
> > };
> >
> > struct __mp_lock {
> >         struct __mp_lock_cpu mpl_cpus[MAXCPUS];
> >         volatile u_int mpl_ticket;
> >         u_int mpl_users;
> > #ifdef WITNESS
> >         struct lock_object mpl_lock_obj;
> > #endif
> > };
> >
> > Assuming a given __mp_lock is 64-byte aligned, this means 8 CPUs (8 bytes
> > of state each) have their state packed into the same cacheline.
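For illustration, a minimal sketch of what removing that false sharing could
look like: each per-CPU slot gets padded out to its own cacheline, so a
spinning CPU no longer bounces the line used by its neighbours. The 64 and
the attribute spelling are assumptions made for a standalone example; in-tree
the __aligned() macro would be the natural spelling.

/* hypothetical sketch: one cacheline per CPU slot instead of 8 slots per line */
#define SKETCH_CACHELINESIZE    64      /* assumed cacheline size */

struct __mp_lock_cpu_padded {
        u_int   mplc_ticket;
        u_int   mplc_depth;
} __attribute__((aligned(SKETCH_CACHELINESIZE)));
/*
 * sizeof(struct __mp_lock_cpu_padded) is now 64 rather than 8, so each
 * entry of mpl_cpus[] occupies its own cacheline.
 */

This change alone is what the side note further down measures: keeping the
ticket lock but giving each CPU a separate state location.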
> >
> > Out of curiosity I implemented it on top of Anderson's lock instead and
> > managed to get a nice scalability speedup in microbenchmarks for little
> > effort. The patch is a POC and is included for reference; I have no
> > interest in productizing it and I'm not claiming this is the best
> > choice -- perhaps CLH would be a better pick. Note that MCS is more
> > expensive to unlock. This is very much NUMA-oblivious, but that is
> > probably not a real factor for the time being in this context.
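For reference, a minimal self-contained sketch of an Anderson (array-based
queue) lock, written with C11 atomics rather than the kernel's primitives.
The slot count, the names, and the memory-ordering choices are assumptions
for illustration, not what the POC below does verbatim.

#include <stdatomic.h>
#include <stdbool.h>

#define NSLOTS  8       /* power of 2, >= the max number of contenders */

struct alock_slot {
        atomic_bool     taken;
        char            pad[64 - sizeof(atomic_bool)]; /* one slot per cacheline */
};

struct alock {
        struct alock_slot       slots[NSLOTS];
        atomic_uint             next;   /* next slot to hand out */
};

static void
alock_init(struct alock *l)
{
        int i;

        atomic_store(&l->slots[0].taken, false);        /* first arrival may proceed */
        for (i = 1; i < NSLOTS; i++)
                atomic_store(&l->slots[i].taken, true);
        atomic_store(&l->next, 0);
}

/* Returns the slot index; the caller hands it back to alock_unlock(). */
static unsigned int
alock_lock(struct alock *l)
{
        unsigned int idx = atomic_fetch_add(&l->next, 1) % NSLOTS;

        /* each waiter spins on its own cacheline, unlike with a ticket lock */
        while (atomic_load_explicit(&l->slots[idx].taken, memory_order_acquire))
                ;
        return (idx);
}

static void
alock_unlock(struct alock *l, unsigned int idx)
{
        /* re-arm our own slot for a future round, then wake the next waiter */
        atomic_store(&l->slots[idx].taken, true);
        atomic_store_explicit(&l->slots[(idx + 1) % NSLOTS].taken, false,
            memory_order_release);
}

Each waiter spins on a private flag, so a release only invalidates the next
waiter's cacheline instead of the line every spinner is polling.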
> >
> > In order to bench this I ran fstat in a loop over different files (which
> > globally serializes on the kernel lock), with 8 different processes
> > doing the work. This is on amd64.
> >
> > Results are (ops/s):
> > before: 2305501
> > after: 5891611 (+155%)
> >
> > As a side note, sticking to the ticket lock but guaranteeing separate
> > state locations bumps the rate to 3161182 ops/s.
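The exact harness was not posted; a minimal sketch of this kind of
microbenchmark, with the process count, file names, and timing scheme being
assumptions, might look like:

/*
 * Hypothetical sketch: NPROC processes each hammer fstat(2) on their own
 * file for a fixed duration and report their individual rates.  Not the
 * harness behind the numbers above, just the shape of the test.
 */
#include <sys/stat.h>
#include <sys/wait.h>
#include <err.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#define NPROC           8
#define DURATION        5       /* seconds */

static volatile sig_atomic_t stop;

static void
alarmhandler(int sig)
{
        stop = 1;
}

static void
child(int i)
{
        char path[64];
        struct stat sb;
        unsigned long long ops = 0;
        int fd;

        snprintf(path, sizeof(path), "/tmp/fstatbench.%d", i);
        if ((fd = open(path, O_RDWR | O_CREAT, 0600)) == -1)
                err(1, "open");
        signal(SIGALRM, alarmhandler);
        alarm(DURATION);
        while (!stop) {
                fstat(fd, &sb);
                ops++;
        }
        printf("%llu fstat calls/s\n", ops / DURATION);
        _exit(0);
}

int
main(void)
{
        int i;

        for (i = 0; i < NPROC; i++)
                if (fork() == 0)
                        child(i);
        for (i = 0; i < NPROC; i++)
                wait(NULL);
        return (0);
}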
> >
> >
>
> Now that I look at it, I think Anderson's lock is a perfectly sane choice
> to replace the ticket lock for the foreseeable future.
>
> The implementation below needs some cosmetic cleanups plus a bit of
> commentary.
>
> It depends on MAXCPUS being a power of 2, which can be checked for at
> compile time and which, judging by a quick grep, is not a real problem.
>
> The dependency stems from the behavior of mplc_index on wraparound -- it
> has to match the same value modulo MAXCPUS as the non-wrapped counter
> would. However, worst case this can just get a branch to avoid the
> wraparound in the first place.
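A minimal sketch of the compile-time check, assuming the kernel's CTASSERT()
macro is usable at the chosen spot:

/*
 * The unsigned counter wraps at 2^32.  2^32 is a multiple of MAXCPUS only
 * when MAXCPUS is a power of 2, so only then does the value modulo MAXCPUS
 * keep matching what a non-wrapping counter would give.
 */
CTASSERT(MAXCPUS > 0 && (MAXCPUS & (MAXCPUS - 1)) == 0);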
>
> The real question is what to do about CACHELINESIZE. Currently it is
> defined in percpu.h, but that's clearly wrong. Perhaps cleaning that up
> can be considered beyond the scope and this thing here can also just
> define it to 64.
>
> percpu.h can't be included due to immediate build failures.
>
> I can do the tidy ups if there is interest in getting this in.
I'd appreciate such a diff. Thanks.
> > POC Anderson's lock (warning: assumes power-of-2 MAXCPUS and 64-byte
> > cachelines):
> >
> > diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
> > index fe75b7ae895..2a491271509 100644
> > --- a/sys/kern/kern_lock.c
> > +++ b/sys/kern/kern_lock.c
> > @@ -117,8 +117,10 @@ void
> > ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
> > {
> > memset(mpl->mpl_cpus, 0, sizeof(mpl->mpl_cpus));
> > - mpl->mpl_users = 0;
> > - mpl->mpl_ticket = 1;
> > + mpl->mpl_locks[0].taken = false;
> > + for (int i = 1; i < MAXCPUS; i++)
> > + mpl->mpl_locks[i].taken = true;
> > + mpl->mpl_index = 0;
> >
> > #ifdef WITNESS
> > mpl->mpl_lock_obj.lo_name = type->lt_name;
> > @@ -131,7 +133,7 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
> > }
> >
> > static __inline void
> > -__mp_lock_spin(struct __mp_lock *mpl, u_int me)
> > +__mp_lock_spin(struct __mp_lock *mpl, u_int index)
> > {
> > struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> > #ifdef MP_LOCKDEBUG
> > @@ -139,7 +141,7 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> > #endif
> >
> > spc->spc_spinning++;
> > - while (mpl->mpl_ticket != me) {
> > + while (READ_ONCE(mpl->mpl_locks[index].taken)) {
> > CPU_BUSY_CYCLE();
> >
> > #ifdef MP_LOCKDEBUG
> > @@ -151,6 +153,8 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> > #endif
> > }
> > spc->spc_spinning--;
> > + struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()];
> > + cpu->mplc_owned = true;
> > }
> >
> > void
> > @@ -167,11 +171,12 @@ __mp_lock(struct __mp_lock *mpl)
> >
> > s = intr_disable();
> > if (cpu->mplc_depth++ == 0)
> > - cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users);
> > + cpu->mplc_index = (atomic_inc_int_nv(&mpl->mpl_index) - 1) % MAXCPUS;
> > intr_restore(s);
> >
> > - __mp_lock_spin(mpl, cpu->mplc_ticket);
> > + __mp_lock_spin(mpl, cpu->mplc_index);
> > membar_enter_after_atomic();
> > +// WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> >
> > WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE);
> > }
> > @@ -194,7 +199,9 @@ __mp_unlock(struct __mp_lock *mpl)
> > s = intr_disable();
> > if (--cpu->mplc_depth == 0) {
> > membar_exit();
> > - mpl->mpl_ticket++;
> > + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
> > + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> > + cpu->mplc_owned = false;
> > }
> > intr_restore(s);
> > }
> > @@ -217,7 +224,9 @@ __mp_release_all(struct __mp_lock *mpl)
> > #endif
> > cpu->mplc_depth = 0;
> > membar_exit();
> > - mpl->mpl_ticket++;
> > + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
> > + WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> > + cpu->mplc_owned = false;
> > intr_restore(s);
> >
> > return (rv);
> > @@ -235,7 +244,7 @@ __mp_lock_held(struct __mp_lock *mpl, struct cpu_info *ci)
> > {
> > struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[CPU_INFO_UNIT(ci)];
> >
> > - return (cpu->mplc_ticket == mpl->mpl_ticket && cpu->mplc_depth > 0);
> > + return (cpu->mplc_owned);
> > }
> >
> > #endif /* __USE_MI_MPLOCK */
> > diff --git a/sys/sys/mplock.h b/sys/sys/mplock.h
> > index b28ac240d94..9a6beb65658 100644
> > --- a/sys/sys/mplock.h
> > +++ b/sys/sys/mplock.h
> > @@ -33,19 +33,26 @@
> >
> > #include <sys/_lock.h>
> >
> > +#define CACHELINESIZE 64 /* FIXME */
> > +
> > struct __mp_lock_cpu {
> > - u_int mplc_ticket;
> > + u_int mplc_index;
> > u_int mplc_depth;
> > -};
> > + bool mplc_owned;
> > +} __aligned(CACHELINESIZE);
> > +
> > +struct __mp_lock_l {
> > + bool taken;
> > +} __aligned(CACHELINESIZE);
> >
> > struct __mp_lock {
> > struct __mp_lock_cpu mpl_cpus[MAXCPUS];
> > - volatile u_int mpl_ticket;
> > - u_int mpl_users;
> > + struct __mp_lock_l mpl_locks[MAXCPUS];
> > + volatile u_int mpl_index __aligned(CACHELINESIZE);
> > #ifdef WITNESS
> > struct lock_object mpl_lock_obj;
> > #endif
> > -};
> > +} __aligned(CACHELINESIZE);
> >
> > void ___mp_lock_init(struct __mp_lock *, const struct lock_type *);
> > void __mp_lock(struct __mp_lock *);
> >
> >
>