Download raw body.
avoid contention in thrsleep/thrwakeup
On Mon, Nov 25, 2024 at 4:18 PM David Gwynne <david@gwynne.id.au> wrote:
>
> turns out the __thrsleep and __thrwakeup syscalls largely coordinate
> using a single lock per process. if you have heavily threaded code
> building locks in userland out of thrsleep, this kernel lock gets
> hammered. this is true even if userland thinks it's operating on
> separate locks, it all ends up serialised in the kernel. this reduces
> the throughput of these heavily threaded programs, and combined
> with the pressure that rwlocks place on the scheduler, produces a lot of
> time spinning on the sched lock in the kernel.
>
> "but david", you say, "we don't use thrsleep these days, pthreads use
> futexes". that is true, but it turns out go uses __thrsleep and
> __thrwakeup as the backend for its locks. go programs also tend to use
> threads more than the average program, so they can see a big benefit
> from this diff. the reason i looked at this was someone was asking why
> go builds had so much system time against them.
>
> looking at thrsleep/thrwakeup, they remind me of futexes, so i
> applied most of the tweaks i've proposed to futexes here. i now
> believe that futexes are the product of linux NIH syndrome, but i
> also believe futexes are simpler. long term i'd lean toward deprecating
> thrsleep. until then we can make thrsleep go faster.
>
> the big change is hashing thrsleep waiters into different
> locks/lists to try and avoid all locks in a process contending on
> a single lock. the hash is shared by all processes though.
>
> the diff also avoids having a waiter re-take the lock to avoid
> contention on the thrwakeup code which is currently holding the lock.
>
> tests are welcome. apart from go i don't know what really uses these
> syscalls. i've seen some impressive improvements to the build times for
> go itself, but the more tests the merrier.
tested on my ryzen 7950x, building go itself, both on the host
directly and in a vm. in a 16 core linux qemu/kvm vm, 'real' time as
measured by time in ksh improves greatly, with a huge reduction in
system time.
before:
mischief@beast.home.arpa:~/src/go/src $ time ./make.bash
Building Go cmd/dist using /usr/local/go. (go1.23.1 openbsd/amd64)
Building Go toolchain1 using /usr/local/go.
Building Go bootstrap cmd/go (go_bootstrap) using Go toolchain1.
Building Go toolchain2 using go_bootstrap and Go toolchain1.
Building Go toolchain3 using go_bootstrap and Go toolchain2.
Building packages and commands for openbsd/amd64.
---
Installed Go for openbsd/amd64 in /home/mischief/src/go
Installed commands in /home/mischief/src/go/bin
4m01.30s real 2m45.09s user 39m18.46s system
with this change:
beast$ time ./make.bash
Building Go cmd/dist using /usr/local/go. (go1.23.1 openbsd/amd64)
Building Go toolchain1 using /usr/local/go.
Building Go bootstrap cmd/go (go_bootstrap) using Go toolchain1.
Building Go toolchain2 using go_bootstrap and Go toolchain1.
Building Go toolchain3 using go_bootstrap and Go toolchain2.
Building packages and commands for openbsd/amd64.
---
Installed Go for openbsd/amd64 in /home/mischief/src/go
Installed commands in /home/mischief/src/go/bin
*** You need to add /home/mischief/src/go/bin to your PATH.
1m00.09s real 2m42.33s user 3m16.70s system
>
> Index: kern_synch.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_synch.c,v
> diff -u -p -r1.210 kern_synch.c
> --- kern_synch.c 4 Nov 2024 22:41:50 -0000 1.210
> +++ kern_synch.c 25 Nov 2024 01:49:33 -0000
> @@ -63,8 +63,6 @@
> #endif
>
> int sleep_signal_check(struct proc *);
> -int thrsleep(struct proc *, struct sys___thrsleep_args *);
> -int thrsleep_unlock(void *);
>
> extern void proc_stop(struct proc *p, int);
>
> @@ -611,28 +609,66 @@ sys_sched_yield(struct proc *p, void *v,
> return (0);
> }
>
> -int
> -thrsleep_unlock(void *lock)
> +static inline int
> +thrsleep_unlock(_atomic_lock_t *atomiclock)
> {
> static _atomic_lock_t unlocked = _ATOMIC_LOCK_UNLOCKED;
> - _atomic_lock_t *atomiclock = lock;
>
> - if (!lock)
> + if (atomiclock == NULL)
> return 0;
>
> return copyout(&unlocked, atomiclock, sizeof(unlocked));
> }
>
> struct tslpentry {
> - TAILQ_ENTRY(tslpentry) tslp_link;
> - long tslp_ident;
> + TAILQ_ENTRY(tslpentry) tslp_link;
> + struct process *tslp_ps;
> + long tslp_ident;
> + struct proc *volatile tslp_p;
> };
>
> +struct tslp_bucket {
> + struct tslpqueue tsb_list;
> + struct mutex tsb_lock;
> +} __aligned(64);
> +
> /* thrsleep queue shared between processes */
> -static struct tslpqueue thrsleep_queue = TAILQ_HEAD_INITIALIZER(thrsleep_queue);
> -static struct rwlock thrsleep_lock = RWLOCK_INITIALIZER("thrsleeplk");
> +static struct tslp_bucket tsb_shared;
>
> -int
> +#define TSLP_BUCKET_BITS 6
> +#define TSLP_BUCKET_SIZE (1UL << TSLP_BUCKET_BITS)
> +#define TSLP_BUCKET_MASK (TSLP_BUCKET_SIZE - 1)
> +
> +static struct tslp_bucket tsb_buckets[TSLP_BUCKET_SIZE];
> +
> +void
> +tslp_init(void)
> +{
> + struct tslp_bucket *tsb;
> + size_t i;
> +
> + TAILQ_INIT(&tsb_shared.tsb_list);
> + mtx_init(&tsb_shared.tsb_lock, IPL_MPFLOOR);
> +
> + for (i = 0; i < nitems(tsb_buckets); i++) {
> + tsb = &tsb_buckets[i];
> +
> + TAILQ_INIT(&tsb->tsb_list);
> + mtx_init(&tsb->tsb_lock, IPL_MPFLOOR);
> + }
> +}
> +
> +static struct tslp_bucket *
> +thrsleep_bucket(long ident)
> +{
> + ident >>= 3;
> + ident ^= ident >> TSLP_BUCKET_BITS;
> + ident &= TSLP_BUCKET_MASK;
> +
> + return &tsb_buckets[ident];
> +}
> +
> +static int
> thrsleep(struct proc *p, struct sys___thrsleep_args *v)
> {
> struct sys___thrsleep_args /* {
> @@ -644,18 +680,19 @@ thrsleep(struct proc *p, struct sys___th
> } */ *uap = v;
> long ident = (long)SCARG(uap, ident);
> struct tslpentry entry;
> - struct tslpqueue *queue;
> - struct rwlock *qlock;
> + struct tslp_bucket *tsb;
> struct timespec *tsp = (struct timespec *)SCARG(uap, tp);
> void *lock = SCARG(uap, lock);
> - uint64_t nsecs = INFSLP;
> - int abort = 0, error;
> + const uint32_t *abortp = SCARG(uap, abort);
> clockid_t clock_id = SCARG(uap, clock_id);
> + uint64_t to_ticks = 0;
> + int error = 0;
>
> if (ident == 0)
> return (EINVAL);
> if (tsp != NULL) {
> struct timespec now;
> + uint64_t nsecs;
>
> if ((error = clock_gettime(p, clock_id, &now)))
> return (error);
> @@ -673,49 +710,62 @@ thrsleep(struct proc *p, struct sys___th
>
> timespecsub(tsp, &now, tsp);
> nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
> + to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
> + if (to_ticks > INT_MAX)
> + to_ticks = INT_MAX;
> }
>
> - if (ident == -1) {
> - queue = &thrsleep_queue;
> - qlock = &thrsleep_lock;
> - } else {
> - queue = &p->p_p->ps_tslpqueue;
> - qlock = &p->p_p->ps_lock;
> - }
> + tsb = (ident == -1) ? &tsb_shared : thrsleep_bucket(ident);
>
> /* Interlock with wakeup. */
> + entry.tslp_ps = p->p_p;
> entry.tslp_ident = ident;
> - rw_enter_write(qlock);
> - TAILQ_INSERT_TAIL(queue, &entry, tslp_link);
> - rw_exit_write(qlock);
> + entry.tslp_p = p;
> +
> + mtx_enter(&tsb->tsb_lock);
> + TAILQ_INSERT_TAIL(&tsb->tsb_list, &entry, tslp_link);
> + mtx_leave(&tsb->tsb_lock);
>
> error = thrsleep_unlock(lock);
> + if (error != 0)
> + goto leave;
>
> - if (error == 0 && SCARG(uap, abort) != NULL)
> - error = copyin(SCARG(uap, abort), &abort, sizeof(abort));
> + if (abortp != NULL) {
> + uint32_t abort;
> + error = copyin32(abortp, &abort);
> + if (error != 0)
> + goto leave;
> + if (abort) {
> + error = EINTR;
> + goto leave;
> + }
> + }
>
> - rw_enter_write(qlock);
> - if (error != 0)
> - goto out;
> - if (abort != 0) {
> - error = EINTR;
> - goto out;
> - }
> - if (entry.tslp_ident != 0) {
> - error = rwsleep_nsec(&entry, qlock, PWAIT|PCATCH, "thrsleep",
> - nsecs);
> - }
> -
> -out:
> - if (entry.tslp_ident != 0)
> - TAILQ_REMOVE(queue, &entry, tslp_link);
> - rw_exit_write(qlock);
> + sleep_setup(&entry, PWAIT|PCATCH, "thrsleep");
> + error = sleep_finish(to_ticks, entry.tslp_p != NULL);
> + if (error != 0 || entry.tslp_p != NULL) {
> + mtx_enter(&tsb->tsb_lock);
> + if (entry.tslp_p != NULL)
> + TAILQ_REMOVE(&tsb->tsb_list, &entry, tslp_link);
> + else
> + error = 0;
> + mtx_leave(&tsb->tsb_lock);
>
> - if (error == ERESTART)
> - error = ECANCELED;
> + if (error == ERESTART)
> + error = ECANCELED;
> + }
>
> return (error);
>
> +leave:
> + if (entry.tslp_p != NULL) {
> + mtx_enter(&tsb->tsb_lock);
> + if (entry.tslp_p != NULL)
> + TAILQ_REMOVE(&tsb->tsb_list, &entry, tslp_link);
> + mtx_leave(&tsb->tsb_lock);
> + }
> +
> + return (error);
> }
>
> int
> @@ -747,6 +797,21 @@ sys___thrsleep(struct proc *p, void *v,
> return 0;
> }
>
> +static void
> +tslp_wakeups(struct tslpqueue *tslpq)
> +{
> + struct tslpentry *entry, *nentry;
> + struct proc *p;
> +
> + SCHED_LOCK();
> + TAILQ_FOREACH_SAFE(entry, tslpq, tslp_link, nentry) {
> + p = entry->tslp_p;
> + entry->tslp_p = NULL;
> + wakeup_proc(p, 0);
> + }
> + SCHED_UNLOCK();
> +}
> +
> int
> sys___thrwakeup(struct proc *p, void *v, register_t *retval)
> {
> @@ -754,50 +819,53 @@ sys___thrwakeup(struct proc *p, void *v,
> syscallarg(const volatile void *) ident;
> syscallarg(int) n;
> } */ *uap = v;
> - struct tslpentry *entry, *tmp;
> - struct tslpqueue *queue;
> - struct rwlock *qlock;
> + struct tslpentry *entry, *nentry;
> + struct tslp_bucket *tsb;
> long ident = (long)SCARG(uap, ident);
> int n = SCARG(uap, n);
> int found = 0;
> + struct tslpqueue wq = TAILQ_HEAD_INITIALIZER(wq);
>
> - if (ident == 0)
> + if (ident == 0) {
> *retval = EINVAL;
> - else {
> - if (ident == -1) {
> - queue = &thrsleep_queue;
> - qlock = &thrsleep_lock;
> - /*
> - * Wake up all waiters with ident -1. This is needed
> - * because ident -1 can be shared by multiple userspace
> - * lock state machines concurrently. The implementation
> - * has no way to direct the wakeup to a particular
> - * state machine.
> - */
> - n = 0;
> - } else {
> - queue = &p->p_p->ps_tslpqueue;
> - qlock = &p->p_p->ps_lock;
> - }
> + return (0);
> + }
>
> - rw_enter_write(qlock);
> - TAILQ_FOREACH_SAFE(entry, queue, tslp_link, tmp) {
> - if (entry->tslp_ident == ident) {
> - TAILQ_REMOVE(queue, entry, tslp_link);
> - entry->tslp_ident = 0;
> - wakeup_one(entry);
> - if (++found == n)
> - break;
> - }
> - }
> - rw_exit_write(qlock);
> + if (ident == -1) {
> + /*
> + * Wake up all waiters with ident -1. This is needed
> + * because ident -1 can be shared by multiple userspace
> + * lock state machines concurrently. The implementation
> + * has no way to direct the wakeup to a particular
> + * state machine.
> + */
> + mtx_enter(&tsb_shared.tsb_lock);
> + tslp_wakeups(&tsb_shared.tsb_list);
> + TAILQ_INIT(&tsb_shared.tsb_list);
> + mtx_leave(&tsb_shared.tsb_lock);
>
> - if (ident == -1)
> - *retval = 0;
> - else
> - *retval = found ? 0 : ESRCH;
> + *retval = 0;
> + return (0);
> }
>
> + tsb = thrsleep_bucket(ident);
> +
> + mtx_enter(&tsb->tsb_lock);
> + TAILQ_FOREACH_SAFE(entry, &tsb->tsb_list, tslp_link, nentry) {
> + if (entry->tslp_ident == ident && entry->tslp_ps == p->p_p) {
> + TAILQ_REMOVE(&tsb->tsb_list, entry, tslp_link);
> + TAILQ_INSERT_TAIL(&wq, entry, tslp_link);
> +
> + if (++found == n)
> + break;
> + }
> + }
> +
> + if (found)
> + tslp_wakeups(&wq);
> + mtx_leave(&tsb->tsb_lock);
> +
> + *retval = found ? 0 : ESRCH;
> return (0);
> }
>
> Index: init_main.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/init_main.c,v
> diff -u -p -r1.326 init_main.c
> --- init_main.c 2 Apr 2024 08:39:16 -0000 1.326
> +++ init_main.c 25 Nov 2024 01:49:33 -0000
> @@ -140,6 +140,7 @@ void db_ctf_init(void);
> void prof_init(void);
> void init_exec(void);
> void futex_init(void);
> +void tslp_init(void);
> void taskq_init(void);
> void timeout_proc_init(void);
> void pool_gc_pages(void *);
> @@ -250,6 +251,7 @@ main(void *framep)
> * Initialize futexes.
> */
> futex_init();
> + tslp_init();
>
> /* Create credentials. */
> p->p_ucred = crget();
>
avoid contention in thrsleep/thrwakeup