Download raw body.
avoid contention in thrsleep/thrwakeup
turns out the __thrsleep and __thrwakeup syscalls largely coordinate
using a single lock per process. if you have heavily threaded code
building locks in userland out of thrsleep, this kernel lock gets
hammered. this is true even if userland thinks it's operating on
separate locks; it all ends up serialised in the kernel. this reduces
the throughput of these heavily threaded programs, and combined
with the pressure that rwlocks place on the scheduler, produces a lot of
time spinning on the sched lock in the kernel.
"but david", you say, "we don't use thrsleep these days, pthreads use
futexes". that is true, but it turns out go uses __thrsleep and
__thrwakeup as the backend for its locks. go programs also tend to use
threads more than the average program, so they can see a big benefit
from this diff. the reason i looked at this was someone was asking why
go builds had so much system time against them.
looking at thrsleep/thrwakeup, they remind me of futexes, so i
applied most of the tweaks i've proposed for futexes here. i now
believe that futexes are the product of linux NIH syndrome, but i
also believe futexes are simpler. long term i'd lean toward deprecating
thrsleep. until then we can make thrsleep go faster.
the big change is hashing thrsleep waiters into different
locks/lists to try and avoid all locks in a process contending on
a single lock. the hash is shared by all processes though.
the diff also avoids having a waiter re-take the lock to avoid
contention on the thrwakeup code which is currently holding the lock.
tests are welcome. apart from go i don't know what really uses these
syscalls. i've seen some impressive improvements to the build times for
go itself, but the more tests the merrier.
Index: kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
diff -u -p -r1.210 kern_synch.c
--- kern_synch.c 4 Nov 2024 22:41:50 -0000 1.210
+++ kern_synch.c 25 Nov 2024 01:49:33 -0000
@@ -63,8 +63,6 @@
#endif
int sleep_signal_check(struct proc *);
-int thrsleep(struct proc *, struct sys___thrsleep_args *);
-int thrsleep_unlock(void *);
extern void proc_stop(struct proc *p, int);
@@ -611,28 +609,66 @@ sys_sched_yield(struct proc *p, void *v,
return (0);
}
-int
-thrsleep_unlock(void *lock)
+static inline int
+thrsleep_unlock(_atomic_lock_t *atomiclock)
{
static _atomic_lock_t unlocked = _ATOMIC_LOCK_UNLOCKED;
- _atomic_lock_t *atomiclock = lock;
- if (!lock)
+ if (atomiclock == NULL)
return 0;
return copyout(&unlocked, atomiclock, sizeof(unlocked));
}
struct tslpentry {
- TAILQ_ENTRY(tslpentry) tslp_link;
- long tslp_ident;
+ TAILQ_ENTRY(tslpentry) tslp_link;
+ struct process *tslp_ps;
+ long tslp_ident;
+ struct proc *volatile tslp_p;
};
+struct tslp_bucket {
+ struct tslpqueue tsb_list;
+ struct mutex tsb_lock;
+} __aligned(64);
+
/* thrsleep queue shared between processes */
-static struct tslpqueue thrsleep_queue = TAILQ_HEAD_INITIALIZER(thrsleep_queue);
-static struct rwlock thrsleep_lock = RWLOCK_INITIALIZER("thrsleeplk");
+static struct tslp_bucket tsb_shared;
-int
+#define TSLP_BUCKET_BITS 6
+#define TSLP_BUCKET_SIZE (1UL << TSLP_BUCKET_BITS)
+#define TSLP_BUCKET_MASK (TSLP_BUCKET_SIZE - 1)
+
+static struct tslp_bucket tsb_buckets[TSLP_BUCKET_SIZE];
+
+void
+tslp_init(void)
+{
+ struct tslp_bucket *tsb;
+ size_t i;
+
+ TAILQ_INIT(&tsb_shared.tsb_list);
+ mtx_init(&tsb_shared.tsb_lock, IPL_MPFLOOR);
+
+ for (i = 0; i < nitems(tsb_buckets); i++) {
+ tsb = &tsb_buckets[i];
+
+ TAILQ_INIT(&tsb->tsb_list);
+ mtx_init(&tsb->tsb_lock, IPL_MPFLOOR);
+ }
+}
+
+static struct tslp_bucket *
+thrsleep_bucket(long ident)
+{
+ ident >>= 3;
+ ident ^= ident >> TSLP_BUCKET_BITS;
+ ident &= TSLP_BUCKET_MASK;
+
+ return &tsb_buckets[ident];
+}
+
+static int
thrsleep(struct proc *p, struct sys___thrsleep_args *v)
{
struct sys___thrsleep_args /* {
@@ -644,18 +680,19 @@ thrsleep(struct proc *p, struct sys___th
} */ *uap = v;
long ident = (long)SCARG(uap, ident);
struct tslpentry entry;
- struct tslpqueue *queue;
- struct rwlock *qlock;
+ struct tslp_bucket *tsb;
struct timespec *tsp = (struct timespec *)SCARG(uap, tp);
void *lock = SCARG(uap, lock);
- uint64_t nsecs = INFSLP;
- int abort = 0, error;
+ const uint32_t *abortp = SCARG(uap, abort);
clockid_t clock_id = SCARG(uap, clock_id);
+ uint64_t to_ticks = 0;
+ int error = 0;
if (ident == 0)
return (EINVAL);
if (tsp != NULL) {
struct timespec now;
+ uint64_t nsecs;
if ((error = clock_gettime(p, clock_id, &now)))
return (error);
@@ -673,49 +710,62 @@ thrsleep(struct proc *p, struct sys___th
timespecsub(tsp, &now, tsp);
nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
+ to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
+ if (to_ticks > INT_MAX)
+ to_ticks = INT_MAX;
}
- if (ident == -1) {
- queue = &thrsleep_queue;
- qlock = &thrsleep_lock;
- } else {
- queue = &p->p_p->ps_tslpqueue;
- qlock = &p->p_p->ps_lock;
- }
+ tsb = (ident == -1) ? &tsb_shared : thrsleep_bucket(ident);
/* Interlock with wakeup. */
+ entry.tslp_ps = p->p_p;
entry.tslp_ident = ident;
- rw_enter_write(qlock);
- TAILQ_INSERT_TAIL(queue, &entry, tslp_link);
- rw_exit_write(qlock);
+ entry.tslp_p = p;
+
+ mtx_enter(&tsb->tsb_lock);
+ TAILQ_INSERT_TAIL(&tsb->tsb_list, &entry, tslp_link);
+ mtx_leave(&tsb->tsb_lock);
error = thrsleep_unlock(lock);
+ if (error != 0)
+ goto leave;
- if (error == 0 && SCARG(uap, abort) != NULL)
- error = copyin(SCARG(uap, abort), &abort, sizeof(abort));
+ if (abortp != NULL) {
+ uint32_t abort;
+ error = copyin32(abortp, &abort);
+ if (error != 0)
+ goto leave;
+ if (abort) {
+ error = EINTR;
+ goto leave;
+ }
+ }
- rw_enter_write(qlock);
- if (error != 0)
- goto out;
- if (abort != 0) {
- error = EINTR;
- goto out;
- }
- if (entry.tslp_ident != 0) {
- error = rwsleep_nsec(&entry, qlock, PWAIT|PCATCH, "thrsleep",
- nsecs);
- }
-
-out:
- if (entry.tslp_ident != 0)
- TAILQ_REMOVE(queue, &entry, tslp_link);
- rw_exit_write(qlock);
+ sleep_setup(&entry, PWAIT|PCATCH, "thrsleep");
+ error = sleep_finish(to_ticks, entry.tslp_p != NULL);
+ if (error != 0 || entry.tslp_p != NULL) {
+ mtx_enter(&tsb->tsb_lock);
+ if (entry.tslp_p != NULL)
+ TAILQ_REMOVE(&tsb->tsb_list, &entry, tslp_link);
+ else
+ error = 0;
+ mtx_leave(&tsb->tsb_lock);
- if (error == ERESTART)
- error = ECANCELED;
+ if (error == ERESTART)
+ error = ECANCELED;
+ }
return (error);
+leave:
+ if (entry.tslp_p != NULL) {
+ mtx_enter(&tsb->tsb_lock);
+ if (entry.tslp_p != NULL)
+ TAILQ_REMOVE(&tsb->tsb_list, &entry, tslp_link);
+ mtx_leave(&tsb->tsb_lock);
+ }
+
+ return (error);
}
int
@@ -747,6 +797,21 @@ sys___thrsleep(struct proc *p, void *v,
return 0;
}
+static void
+tslp_wakeups(struct tslpqueue *tslpq)
+{
+ struct tslpentry *entry, *nentry;
+ struct proc *p;
+
+ SCHED_LOCK();
+ TAILQ_FOREACH_SAFE(entry, tslpq, tslp_link, nentry) {
+ p = entry->tslp_p;
+ entry->tslp_p = NULL;
+ wakeup_proc(p, 0);
+ }
+ SCHED_UNLOCK();
+}
+
int
sys___thrwakeup(struct proc *p, void *v, register_t *retval)
{
@@ -754,50 +819,53 @@ sys___thrwakeup(struct proc *p, void *v,
syscallarg(const volatile void *) ident;
syscallarg(int) n;
} */ *uap = v;
- struct tslpentry *entry, *tmp;
- struct tslpqueue *queue;
- struct rwlock *qlock;
+ struct tslpentry *entry, *nentry;
+ struct tslp_bucket *tsb;
long ident = (long)SCARG(uap, ident);
int n = SCARG(uap, n);
int found = 0;
+ struct tslpqueue wq = TAILQ_HEAD_INITIALIZER(wq);
- if (ident == 0)
+ if (ident == 0) {
*retval = EINVAL;
- else {
- if (ident == -1) {
- queue = &thrsleep_queue;
- qlock = &thrsleep_lock;
- /*
- * Wake up all waiters with ident -1. This is needed
- * because ident -1 can be shared by multiple userspace
- * lock state machines concurrently. The implementation
- * has no way to direct the wakeup to a particular
- * state machine.
- */
- n = 0;
- } else {
- queue = &p->p_p->ps_tslpqueue;
- qlock = &p->p_p->ps_lock;
- }
+ return (0);
+ }
- rw_enter_write(qlock);
- TAILQ_FOREACH_SAFE(entry, queue, tslp_link, tmp) {
- if (entry->tslp_ident == ident) {
- TAILQ_REMOVE(queue, entry, tslp_link);
- entry->tslp_ident = 0;
- wakeup_one(entry);
- if (++found == n)
- break;
- }
- }
- rw_exit_write(qlock);
+ if (ident == -1) {
+ /*
+ * Wake up all waiters with ident -1. This is needed
+ * because ident -1 can be shared by multiple userspace
+ * lock state machines concurrently. The implementation
+ * has no way to direct the wakeup to a particular
+ * state machine.
+ */
+ mtx_enter(&tsb_shared.tsb_lock);
+ tslp_wakeups(&tsb_shared.tsb_list);
+ TAILQ_INIT(&tsb_shared.tsb_list);
+ mtx_leave(&tsb_shared.tsb_lock);
- if (ident == -1)
- *retval = 0;
- else
- *retval = found ? 0 : ESRCH;
+ *retval = 0;
+ return (0);
}
+ tsb = thrsleep_bucket(ident);
+
+ mtx_enter(&tsb->tsb_lock);
+ TAILQ_FOREACH_SAFE(entry, &tsb->tsb_list, tslp_link, nentry) {
+ if (entry->tslp_ident == ident && entry->tslp_ps == p->p_p) {
+ TAILQ_REMOVE(&tsb->tsb_list, entry, tslp_link);
+ TAILQ_INSERT_TAIL(&wq, entry, tslp_link);
+
+ if (++found == n)
+ break;
+ }
+ }
+
+ if (found)
+ tslp_wakeups(&wq);
+ mtx_leave(&tsb->tsb_lock);
+
+ *retval = found ? 0 : ESRCH;
return (0);
}
Index: init_main.c
===================================================================
RCS file: /cvs/src/sys/kern/init_main.c,v
diff -u -p -r1.326 init_main.c
--- init_main.c 2 Apr 2024 08:39:16 -0000 1.326
+++ init_main.c 25 Nov 2024 01:49:33 -0000
@@ -140,6 +140,7 @@ void db_ctf_init(void);
void prof_init(void);
void init_exec(void);
void futex_init(void);
+void tslp_init(void);
void taskq_init(void);
void timeout_proc_init(void);
void pool_gc_pages(void *);
@@ -250,6 +251,7 @@ main(void *framep)
* Initialize futexes.
*/
futex_init();
+ tslp_init();
/* Create credentials. */
p->p_ucred = crget();
avoid contention in thrsleep/thrwakeup