From: Martin Pieuchot
Subject: Re: [POC] performance loss due to inefficiency of kernel lock implementation + kernel build profile analysis
To: Mateusz Guzik
Cc: tech@openbsd.org
Date: Wed, 24 Dec 2025 11:30:59 +0100

On 24/12/25(Wed) 01:26, Mateusz Guzik wrote:
> On Sun, Dec 21, 2025 at 9:15 AM Mateusz Guzik wrote:
> >
> > I. kernel lock implementation
> >
> > Of course the fact that the lock exists in the first place and the
> > extent to which it is used are a problem in their own right.
> >
> > But even then, performance is below what it could be in that setting.
> >
> > There are 2 angles to it:
> >
> > 1. it is implemented as a ticket lock, but these don't scale as there
> >    is a centralized location which is constantly read
> > 2. even with that in mind, performance is further affected by false
> >    sharing of the state:
> >
> > struct __mp_lock_cpu {
> > 	u_int mplc_ticket;
> > 	u_int mplc_depth;
> > };
> >
> > struct __mp_lock {
> > 	struct __mp_lock_cpu mpl_cpus[MAXCPUS];
> > 	volatile u_int mpl_ticket;
> > 	u_int mpl_users;
> > #ifdef WITNESS
> > 	struct lock_object mpl_lock_obj;
> > #endif
> > };
> >
> > Assuming a given __mp_lock is 64-byte aligned, this means 8 CPUs have
> > their state in the same cacheline.
> >
> > Out of curiosity I implemented it on top of Anderson's lock instead
> > and managed to get a nice scalability speed-up in microbenchmarks for
> > little effort. The patch is a POC and is included for reference; I
> > have no interest in productizing it and I'm not claiming this is the
> > best choice -- perhaps CLH would be a better pick. Note MCS is more
> > expensive to unlock. This is very much NUMA-oblivious, but that is
> > probably not a real factor for the time being in this context.
> >
> > In order to bench this I ran fstat in a loop over different files,
> > which globally serializes on the lock, with 8 different processes
> > doing the work. This is on amd64.
> >
> > Results are (ops/s):
> > before: 2305501
> > after:  5891611 (+155%)
> >
> > As a side note, sticking to the ticket lock but guaranteeing separate
> > state locations bumps the rate to 3161182 ops/s.
>
> Now that I look at it, I think Anderson's lock is a perfectly sane
> choice to replace the ticket lock for the foreseeable future.
>
> The implementation as below needs some cosmetic clean-ups + a bit of
> commentary.
>
> It depends on MAXCPUS being a power of 2, which can be checked for at
> compilation time and which, to my grep, is not a real problem.
>
> The dependency stems from the behavior of mplc_index on wraparound --
> it has to match the same value modulo MAXCPUS as the non-wrapped
> counter. However, worst case this can just get a branch to avoid the
> wraparound in the first place.
>
> The real question is what to do about CACHELINESIZE. Currently it is
> defined in percpu.h, but that's clearly wrong. Perhaps cleaning that up
> can be considered beyond the scope and this thing here can also just
> define it to 64.
>
> percpu.h can't be included due to immediate build failures.
>
> I can do the tidy-ups if there is interest in getting this in.

I'd appreciate such a diff. Thanks.
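[Editor's note] To make the wraparound dependency concrete: mplc_index
is derived from a free-running u_int counter, so the slot computed
modulo MAXCPUS stays in sequence across the 2^32 wrap only if MAXCPUS
divides 2^32, i.e. is a power of 2. A small standalone userspace
demonstration (not kernel code, values chosen for illustration):

#include <stdio.h>

int
main(void)
{
	unsigned int before = 0xffffffffU;	/* counter just before the wrap */
	unsigned int after = before + 1;	/* wraps to 0 */

	/* power of 2: slots stay consecutive across the wrap (7 -> 0) */
	printf("mod 8: %u -> %u\n", before % 8, after % 8);

	/* not a power of 2: 3 is followed by 0 instead of 4 */
	printf("mod 6: %u -> %u\n", before % 6, after % 6);
	return (0);
}

With a non-power-of-2 MAXCPUS the wrap would skip slots, handing a
waiter a slot that is still armed -- hence the suggested compile-time
check, or a branch avoiding the wrap altogether.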
> > POC Anderson's lock (warning: assumes power-of-2 MAXCPUS and 64-byte
> > cachelines):
> >
> > diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
> > index fe75b7ae895..2a491271509 100644
> > --- a/sys/kern/kern_lock.c
> > +++ b/sys/kern/kern_lock.c
> > @@ -117,8 +117,10 @@ void
> >  ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
> >  {
> >  	memset(mpl->mpl_cpus, 0, sizeof(mpl->mpl_cpus));
> > -	mpl->mpl_users = 0;
> > -	mpl->mpl_ticket = 1;
> > +	mpl->mpl_locks[0].taken = false;
> > +	for (int i = 1; i < MAXCPUS; i++)
> > +		mpl->mpl_locks[i].taken = true;
> > +	mpl->mpl_index = 0;
> >
> >  #ifdef WITNESS
> >  	mpl->mpl_lock_obj.lo_name = type->lt_name;
> > @@ -131,7 +133,7 @@ ___mp_lock_init(struct __mp_lock *mpl, const struct lock_type *type)
> >  }
> >
> >  static __inline void
> > -__mp_lock_spin(struct __mp_lock *mpl, u_int me)
> > +__mp_lock_spin(struct __mp_lock *mpl, u_int index)
> >  {
> >  	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> >  #ifdef MP_LOCKDEBUG
> > @@ -139,7 +141,7 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> >  #endif
> >
> >  	spc->spc_spinning++;
> > -	while (mpl->mpl_ticket != me) {
> > +	while (READ_ONCE(mpl->mpl_locks[index].taken)) {
> >  		CPU_BUSY_CYCLE();
> >
> >  #ifdef MP_LOCKDEBUG
> > @@ -151,6 +153,8 @@ __mp_lock_spin(struct __mp_lock *mpl, u_int me)
> >  #endif
> >  	}
> >  	spc->spc_spinning--;
> > +	struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()];
> > +	cpu->mplc_owned = true;
> >  }
> >
> >  void
> > @@ -167,11 +171,12 @@ __mp_lock(struct __mp_lock *mpl)
> >
> >  	s = intr_disable();
> >  	if (cpu->mplc_depth++ == 0)
> > -		cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users);
> > +		cpu->mplc_index = (atomic_inc_int_nv(&mpl->mpl_index) - 1) % MAXCPUS;
> >  	intr_restore(s);
> >
> > -	__mp_lock_spin(mpl, cpu->mplc_ticket);
> > +	__mp_lock_spin(mpl, cpu->mplc_index);
> >  	membar_enter_after_atomic();
> > +//	WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> >
> >  	WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE);
> >  }
> > @@ -194,7 +199,9 @@ __mp_unlock(struct __mp_lock *mpl)
> >  	s = intr_disable();
> >  	if (--cpu->mplc_depth == 0) {
> >  		membar_exit();
> > -		mpl->mpl_ticket++;
> > +		WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
> > +		WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> > +		cpu->mplc_owned = false;
> >  	}
> >  	intr_restore(s);
> >  }
> > @@ -217,7 +224,9 @@ __mp_release_all(struct __mp_lock *mpl)
> >  #endif
> >  	cpu->mplc_depth = 0;
> >  	membar_exit();
> > -	mpl->mpl_ticket++;
> > +	WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index + 1) % MAXCPUS].taken, false);
> > +	WRITE_ONCE(mpl->mpl_locks[(cpu->mplc_index)].taken, true);
> > +	cpu->mplc_owned = false;
> >  	intr_restore(s);
> >
> >  	return (rv);
> > @@ -235,7 +244,7 @@ __mp_lock_held(struct __mp_lock *mpl, struct cpu_info *ci)
> >  {
> >  	struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[CPU_INFO_UNIT(ci)];
> >
> > -	return (cpu->mplc_ticket == mpl->mpl_ticket && cpu->mplc_depth > 0);
> > +	return (cpu->mplc_owned);
> >  }
> >
> >  #endif /* __USE_MI_MPLOCK */
> > diff --git a/sys/sys/mplock.h b/sys/sys/mplock.h
> > index b28ac240d94..9a6beb65658 100644
> > --- a/sys/sys/mplock.h
> > +++ b/sys/sys/mplock.h
> > @@ -33,19 +33,26 @@
> >
> >  #include
> >
> > +#define CACHELINESIZE	64 /* FIXME */
> > +
> >  struct __mp_lock_cpu {
> > -	u_int mplc_ticket;
> > +	u_int mplc_index;
> >  	u_int mplc_depth;
> > -};
> > +	bool mplc_owned;
> > +} __aligned(CACHELINESIZE);
> > +
> > +struct __mp_lock_l {
> > +	bool taken;
> > +} __aligned(CACHELINESIZE);
> >
> >  struct __mp_lock {
> >  	struct __mp_lock_cpu mpl_cpus[MAXCPUS];
> > -	volatile u_int mpl_ticket;
> > -	u_int mpl_users;
> > +	struct __mp_lock_l mpl_locks[MAXCPUS];
> > +	volatile u_int mpl_index __aligned(CACHELINESIZE);
> >  #ifdef WITNESS
> >  	struct lock_object mpl_lock_obj;
> >  #endif
> > -};
> > +} __aligned(CACHELINESIZE);
> >
> >  void	___mp_lock_init(struct __mp_lock *, const struct lock_type *);
> >  void	__mp_lock(struct __mp_lock *);
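
[Editor's note] For readers unfamiliar with the scheme: here is the
technique the POC implements, reduced to a standalone userspace sketch
with C11 atomics. The names (alock, NSLOTS, CLSIZE) are made up for
illustration; the kernel version above additionally handles recursion
(mplc_depth), interrupt masking and WITNESS.

#include <stdatomic.h>
#include <stdbool.h>

#define NSLOTS	8	/* must be a power of 2, like MAXCPUS */
#define CLSIZE	64

struct alock_slot {
	atomic_bool taken;
} __attribute__((aligned(CLSIZE)));

struct alock {
	struct alock_slot slots[NSLOTS];
	atomic_uint next __attribute__((aligned(CLSIZE)));
};

void
alock_init(struct alock *l)
{
	/* slot 0 starts free, all other slots start armed */
	atomic_store(&l->slots[0].taken, false);
	for (int i = 1; i < NSLOTS; i++)
		atomic_store(&l->slots[i].taken, true);
	atomic_store(&l->next, 0);
}

unsigned int
alock_lock(struct alock *l)
{
	/* take a slot; each waiter then spins on its own cacheline */
	unsigned int idx = atomic_fetch_add(&l->next, 1) % NSLOTS;

	while (atomic_load_explicit(&l->slots[idx].taken,
	    memory_order_acquire))
		;	/* spin */
	return (idx);	/* caller hands this back to alock_unlock() */
}

void
alock_unlock(struct alock *l, unsigned int idx)
{
	/* re-arm our slot, then pass the lock to the next slot */
	atomic_store(&l->slots[idx].taken, true);
	atomic_store_explicit(&l->slots[(idx + 1) % NSLOTS].taken, false,
	    memory_order_release);
}

The per-slot cacheline alignment is where the speed-up over the ticket
lock comes from: an unlock dirties only the next waiter's line, instead
of a single ticket word that every spinning CPU keeps re-reading.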
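Similarly, the benchmark harness is not included in the mail; a minimal
sketch of the described setup (8 processes, each calling fstat on its
own file in a loop) could look like the following -- the paths,
iteration count and timing approach are assumptions:

#include <sys/stat.h>
#include <sys/wait.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define NPROC	8
#define ITERS	10000000UL

int
main(void)
{
	for (int i = 0; i < NPROC; i++) {
		if (fork() == 0) {
			char path[64];
			struct stat sb;
			struct timespec t0, t1;
			int fd;

			/* every worker uses a distinct file */
			snprintf(path, sizeof(path), "/tmp/fstat.%d", i);
			if ((fd = open(path, O_RDONLY | O_CREAT, 0644)) == -1)
				err(1, "open %s", path);

			clock_gettime(CLOCK_MONOTONIC, &t0);
			for (unsigned long n = 0; n < ITERS; n++)
				fstat(fd, &sb);
			clock_gettime(CLOCK_MONOTONIC, &t1);

			double secs = (t1.tv_sec - t0.tv_sec) +
			    (t1.tv_nsec - t0.tv_nsec) / 1e9;
			printf("pid %d: %.0f ops/s\n", getpid(),
			    ITERS / secs);
			_exit(0);
		}
	}
	for (int i = 0; i < NPROC; i++)
		wait(NULL);
	return (0);
}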