From: David Gwynne Subject: avoid contention in thrsleep/thrwakeup To: tech@openbsd.org Date: Tue, 26 Nov 2024 10:11:33 +1000 turns out the __thrsleep and __thrwakeup syscalls largely coordinate using a single lock per process. if you have heavily threaded code building locks in userland out of thrsleep, this kernel lock gets hammered. this is true even if userland thinks it's operating on separate locks, it all ends up serialised in the kernel. this reduces the throughput of these heavily threaded programs, and combined with the pressure that rwlocks place on the scheduler, produces a lot of time spinning on the sched lock in the kernel. "but david", you say, "we don't use thrsleep these days, pthreads use futexes". that is true, but it turns out go uses __thrsleep and __thrwakeup as the backend for its locks. go programs also tend to use threads more than the average program, so they can see a big benefit from this diff. the reason i looked at this was someone was asking why go builds had so much system time against them. looking at thrsleep/thrwakeup, they remind me of futexes, so i applied most of the tweaks i've proposed to futexes here. i now believe that futexes are the product of linux NIH syndrome, but i also believe futexes are simpler. long term i'd lean toward deprecating thrsleep. until then we can make thrsleep go faster. the big change is hashing thrsleep waiters into different locks/lists to try and avoid all locks in a process contending on a single lock. the hash is shared by all processes though. the diff also avoids having a waiter re-take the lock to avoid contention on the thrwakeup code which is currently holding the lock. tests are welcome. apart from go i don't know what really uses these syscalls. i've seen some impressive improvements to the build times for go itself, but the more tests the merrier. 
Index: kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v diff -u -p -r1.210 kern_synch.c --- kern_synch.c 4 Nov 2024 22:41:50 -0000 1.210 +++ kern_synch.c 25 Nov 2024 01:49:33 -0000 @@ -63,8 +63,6 @@ #endif int sleep_signal_check(struct proc *); -int thrsleep(struct proc *, struct sys___thrsleep_args *); -int thrsleep_unlock(void *); extern void proc_stop(struct proc *p, int); @@ -611,28 +609,66 @@ sys_sched_yield(struct proc *p, void *v, return (0); } -int -thrsleep_unlock(void *lock) +static inline int +thrsleep_unlock(_atomic_lock_t *atomiclock) { static _atomic_lock_t unlocked = _ATOMIC_LOCK_UNLOCKED; - _atomic_lock_t *atomiclock = lock; - if (!lock) + if (atomiclock == NULL) return 0; return copyout(&unlocked, atomiclock, sizeof(unlocked)); } struct tslpentry { - TAILQ_ENTRY(tslpentry) tslp_link; - long tslp_ident; + TAILQ_ENTRY(tslpentry) tslp_link; + struct process *tslp_ps; + long tslp_ident; + struct proc *volatile tslp_p; }; +struct tslp_bucket { + struct tslpqueue tsb_list; + struct mutex tsb_lock; +} __aligned(64); + /* thrsleep queue shared between processes */ -static struct tslpqueue thrsleep_queue = TAILQ_HEAD_INITIALIZER(thrsleep_queue); -static struct rwlock thrsleep_lock = RWLOCK_INITIALIZER("thrsleeplk"); +static struct tslp_bucket tsb_shared; -int +#define TSLP_BUCKET_BITS 6 +#define TSLP_BUCKET_SIZE (1UL << TSLP_BUCKET_BITS) +#define TSLP_BUCKET_MASK (TSLP_BUCKET_SIZE - 1) + +static struct tslp_bucket tsb_buckets[TSLP_BUCKET_SIZE]; + +void +tslp_init(void) +{ + struct tslp_bucket *tsb; + size_t i; + + TAILQ_INIT(&tsb_shared.tsb_list); + mtx_init(&tsb_shared.tsb_lock, IPL_MPFLOOR); + + for (i = 0; i < nitems(tsb_buckets); i++) { + tsb = &tsb_buckets[i]; + + TAILQ_INIT(&tsb->tsb_list); + mtx_init(&tsb->tsb_lock, IPL_MPFLOOR); + } +} + +static struct tslp_bucket * +thrsleep_bucket(long ident) +{ + ident >>= 3; + ident ^= ident >> TSLP_BUCKET_BITS; + ident &= 
TSLP_BUCKET_MASK; + + return &tsb_buckets[ident]; +} + +static int thrsleep(struct proc *p, struct sys___thrsleep_args *v) { struct sys___thrsleep_args /* { @@ -644,18 +680,19 @@ thrsleep(struct proc *p, struct sys___th } */ *uap = v; long ident = (long)SCARG(uap, ident); struct tslpentry entry; - struct tslpqueue *queue; - struct rwlock *qlock; + struct tslp_bucket *tsb; struct timespec *tsp = (struct timespec *)SCARG(uap, tp); void *lock = SCARG(uap, lock); - uint64_t nsecs = INFSLP; - int abort = 0, error; + const uint32_t *abortp = SCARG(uap, abort); clockid_t clock_id = SCARG(uap, clock_id); + uint64_t to_ticks = 0; + int error = 0; if (ident == 0) return (EINVAL); if (tsp != NULL) { struct timespec now; + uint64_t nsecs; if ((error = clock_gettime(p, clock_id, &now))) return (error); @@ -673,49 +710,62 @@ thrsleep(struct proc *p, struct sys___th timespecsub(tsp, &now, tsp); nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP); + to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1; + if (to_ticks > INT_MAX) + to_ticks = INT_MAX; } - if (ident == -1) { - queue = &thrsleep_queue; - qlock = &thrsleep_lock; - } else { - queue = &p->p_p->ps_tslpqueue; - qlock = &p->p_p->ps_lock; - } + tsb = (ident == -1) ? &tsb_shared : thrsleep_bucket(ident); /* Interlock with wakeup. 
*/ + entry.tslp_ps = p->p_p; entry.tslp_ident = ident; - rw_enter_write(qlock); - TAILQ_INSERT_TAIL(queue, &entry, tslp_link); - rw_exit_write(qlock); + entry.tslp_p = p; + + mtx_enter(&tsb->tsb_lock); + TAILQ_INSERT_TAIL(&tsb->tsb_list, &entry, tslp_link); + mtx_leave(&tsb->tsb_lock); error = thrsleep_unlock(lock); + if (error != 0) + goto leave; - if (error == 0 && SCARG(uap, abort) != NULL) - error = copyin(SCARG(uap, abort), &abort, sizeof(abort)); + if (abortp != NULL) { + uint32_t abort; + error = copyin32(abortp, &abort); + if (error != 0) + goto leave; + if (abort) { + error = EINTR; + goto leave; + } + } - rw_enter_write(qlock); - if (error != 0) - goto out; - if (abort != 0) { - error = EINTR; - goto out; - } - if (entry.tslp_ident != 0) { - error = rwsleep_nsec(&entry, qlock, PWAIT|PCATCH, "thrsleep", - nsecs); - } - -out: - if (entry.tslp_ident != 0) - TAILQ_REMOVE(queue, &entry, tslp_link); - rw_exit_write(qlock); + sleep_setup(&entry, PWAIT|PCATCH, "thrsleep"); + error = sleep_finish(to_ticks, entry.tslp_p != NULL); + if (error != 0 || entry.tslp_p != NULL) { + mtx_enter(&tsb->tsb_lock); + if (entry.tslp_p != NULL) + TAILQ_REMOVE(&tsb->tsb_list, &entry, tslp_link); + else + error = 0; + mtx_leave(&tsb->tsb_lock); - if (error == ERESTART) - error = ECANCELED; + if (error == ERESTART) + error = ECANCELED; + } return (error); +leave: + if (entry.tslp_p != NULL) { + mtx_enter(&tsb->tsb_lock); + if (entry.tslp_p != NULL) + TAILQ_REMOVE(&tsb->tsb_list, &entry, tslp_link); + mtx_leave(&tsb->tsb_lock); + } + + return (error); } int @@ -747,6 +797,21 @@ sys___thrsleep(struct proc *p, void *v, return 0; } +static void +tslp_wakeups(struct tslpqueue *tslpq) +{ + struct tslpentry *entry, *nentry; + struct proc *p; + + SCHED_LOCK(); + TAILQ_FOREACH_SAFE(entry, tslpq, tslp_link, nentry) { + p = entry->tslp_p; + entry->tslp_p = NULL; + wakeup_proc(p, 0); + } + SCHED_UNLOCK(); +} + int sys___thrwakeup(struct proc *p, void *v, register_t *retval) { @@ -754,50 +819,53 
@@ sys___thrwakeup(struct proc *p, void *v, syscallarg(const volatile void *) ident; syscallarg(int) n; } */ *uap = v; - struct tslpentry *entry, *tmp; - struct tslpqueue *queue; - struct rwlock *qlock; + struct tslpentry *entry, *nentry; + struct tslp_bucket *tsb; long ident = (long)SCARG(uap, ident); int n = SCARG(uap, n); int found = 0; + struct tslpqueue wq = TAILQ_HEAD_INITIALIZER(wq); - if (ident == 0) + if (ident == 0) { *retval = EINVAL; - else { - if (ident == -1) { - queue = &thrsleep_queue; - qlock = &thrsleep_lock; - /* - * Wake up all waiters with ident -1. This is needed - * because ident -1 can be shared by multiple userspace - * lock state machines concurrently. The implementation - * has no way to direct the wakeup to a particular - * state machine. - */ - n = 0; - } else { - queue = &p->p_p->ps_tslpqueue; - qlock = &p->p_p->ps_lock; - } + return (0); + } - rw_enter_write(qlock); - TAILQ_FOREACH_SAFE(entry, queue, tslp_link, tmp) { - if (entry->tslp_ident == ident) { - TAILQ_REMOVE(queue, entry, tslp_link); - entry->tslp_ident = 0; - wakeup_one(entry); - if (++found == n) - break; - } - } - rw_exit_write(qlock); + if (ident == -1) { + /* + * Wake up all waiters with ident -1. This is needed + * because ident -1 can be shared by multiple userspace + * lock state machines concurrently. The implementation + * has no way to direct the wakeup to a particular + * state machine. + */ + mtx_enter(&tsb_shared.tsb_lock); + tslp_wakeups(&tsb_shared.tsb_list); + TAILQ_INIT(&tsb_shared.tsb_list); + mtx_leave(&tsb_shared.tsb_lock); - if (ident == -1) - *retval = 0; - else - *retval = found ? 
0 : ESRCH; + *retval = 0; + return (0); } + tsb = thrsleep_bucket(ident); + + mtx_enter(&tsb->tsb_lock); + TAILQ_FOREACH_SAFE(entry, &tsb->tsb_list, tslp_link, nentry) { + if (entry->tslp_ident == ident && entry->tslp_ps == p->p_p) { + TAILQ_REMOVE(&tsb->tsb_list, entry, tslp_link); + TAILQ_INSERT_TAIL(&wq, entry, tslp_link); + + if (++found == n) + break; + } + } + + if (found) + tslp_wakeups(&wq); + mtx_leave(&tsb->tsb_lock); + + *retval = found ? 0 : ESRCH; return (0); } Index: init_main.c =================================================================== RCS file: /cvs/src/sys/kern/init_main.c,v diff -u -p -r1.326 init_main.c --- init_main.c 2 Apr 2024 08:39:16 -0000 1.326 +++ init_main.c 25 Nov 2024 01:49:33 -0000 @@ -140,6 +140,7 @@ void db_ctf_init(void); void prof_init(void); void init_exec(void); void futex_init(void); +void tslp_init(void); void taskq_init(void); void timeout_proc_init(void); void pool_gc_pages(void *); @@ -250,6 +251,7 @@ main(void *framep) * Initialize futexes. */ futex_init(); + tslp_init(); /* Create credentials. */ p->p_ucred = crget();