From: David Gwynne
Subject: Re: timing lld --threads for fun and profit
To: Landry Breuil, robert@openbsd.org, tech@openbsd.org
Date: Fri, 15 Nov 2024 19:40:47 +1000

On Thu, Nov 14, 2024 at 12:05:16PM +0100, Martin Pieuchot wrote:
> On 08/11/24(Fri) 13:01, Stuart Henderson wrote:
> > On 2024/11/08 13:26, Martin Pieuchot wrote:
> > > On 08/11/24(Fri) 12:22, Landry Breuil wrote:
> > > > [...]
> > > > someone(tm) should look into patching lld to avoid using more than
> > > > MAX(ncpu,5) threads ? In the meantime, i'll probably fix the firefox
> > > > ports to avoid using MAKE_JOBS for lld and cap it at 5.
> > >
> > > Recent lld(1) includes the following commits, which limit the value
> > > to 16 by default instead of the number of available CPUs.
> > >
> > > See the following commits:
> > >
> > > https://github.com/llvm/llvm-project/commit/a8788de1c3f3c8c3a591bd3aae2acee1b43b229a
> > > https://github.com/llvm/llvm-project/commit/da68d2164efcc1f5e57f090e2ae2219056b120a0
> > >
> > > Robert, do you see the same with chromium? Would it make sense to
> > > backport these diffs with a smaller value for OpenBSD?
> >
> > Certainly helps for reorder_kernel.
>
> It also helps for any configure script.
>
> The one from wget I'm currently playing with is 2 times slower on a
> 24 CPU machine with GENERIC.MP than with GENERIC.
>
> The FlameGraph attached corresponds to non-idle time spent in the
> kernel during the execution of the wget configure script. ~20% of sys
> time is spent in sys_futex/rw_lock contention, which corresponds to
> lld(1) threads spinning.

i have a futex diff that might help here.

standing back from it a bit, it largely reimplements the kernel sleep
queues, but for threads sleeping on userland addresses.

there are 3 main changes in the diff:

1. the list of waiters is split up into a hash with individual locks
for each bucket.

threads are hashed into a bucket based on the userland address they're
dealing with. this mitigates futex syscalls contending on the single
global futex lock. i even padded the list heads (and their locks) so
they should sit on different cache lines.

2. link the threads into the bucket directly.

currently a futex struct is allocated to represent the address in
userland that threads are waiting on, and threads are then linked off
this futex struct. this means wake/requeue just has to find the futex
struct, and then iterate over the threads hanging off it calling
wakeup. however, allocating this struct isn't free. the change avoids
calling pool_get and pool_put all the time by putting the futex struct
on the waiting thread's stack and linking it into the bucket directly.

the only real downside is that requeue has to do more work, because
now it has to find the threads and update the address they're sleeping
on. requeue is rare compared to wait/wake though, so i leant toward
speeding wait/wake up as much as possible.

the amount of iteration is limited. having the hash of lists goes some
way toward restricting the length of the lists, and i moved to using
wakeup_proc instead of wakeup to avoid having to iterate over lists of
procs in the sleep queue too.

(there's a stripped down userland sketch of changes 1 and 2 below,
just before the diff.)

3. have threads use PNORELOCK when they sleep.

it's very frustrating watching a thread wake up and then immediately
go back to sleep in the rw_enter on the way out of rwsleep because the
thread that woke it up is still holding the rwlock. the new code skips
this by having the wake/requeue code do as much work as it can on
behalf of the sleeping thread before waking it up. if everything goes
well, then there's nothing for the waiting thread to do when it comes
out of sleep except return to userland.
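to make the shape of changes 1 and 2 concrete, here's a stripped down
userland model of the bucket scheme. this is just an illustration, not
the diff: the bucket count, the shift in the hash, and the 64 byte
alignment mirror the kernel code below, but the pthread mutex and the
waiter struct are stand-ins for the kernel's per-bucket rwlock and the
on-stack struct futex.

/* userland sketch only; compare futex_get_bucket() in the diff below */

#include <sys/queue.h>

#include <stdint.h>
#include <pthread.h>

#define WB_BITS         6
#define WB_SIZE         (1U << WB_BITS)
#define WB_MASK         (WB_SIZE - 1)

struct waiter {
        TAILQ_ENTRY(waiter)      w_entry; /* lives on the waiter's stack */
        volatile uint32_t       *w_uaddr; /* address being slept on */
};

TAILQ_HEAD(waiters, waiter);

/* pad each bucket to a cache line so the bucket locks don't false share */
struct wbucket {
        struct waiters           wb_list;
        pthread_mutex_t          wb_lock;
} __attribute__((aligned(64)));

static struct wbucket whash[WB_SIZE];

static struct wbucket *
wbucket(volatile uint32_t *uaddr)
{
        /* fold the address down to a bucket index */
        uintptr_t key = (uintptr_t)uaddr >> 3;

        key ^= key >> WB_BITS;
        return (&whash[key & WB_MASK]);
}

with this shape, threads futexing on unrelated addresses only contend
when the addresses hash to the same bucket, instead of always meeting
on one global lock, and a wait costs no allocation because the wait
record is borrowed from the sleeping thread's stack.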
Index: sys/futex.h
===================================================================
RCS file: /cvs/src/sys/sys/futex.h,v
diff -u -p -r1.2 futex.h
--- sys/futex.h	3 Jun 2018 15:09:26 -0000	1.2
+++ sys/futex.h	15 Nov 2024 09:16:38 -0000
@@ -28,11 +28,15 @@ int	futex(volatile uint32_t *, int, int,
 __END_DECLS
 #endif /* ! _KERNEL */
 
+#define FUTEX_OP_MASK		0x007f
+
 #define FUTEX_WAIT		1
 #define FUTEX_WAKE		2
 #define FUTEX_REQUEUE		3
 
-#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_FLAG_MASK		0xff80
+
+#define FUTEX_PRIVATE_FLAG	0x0080
 
 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
Index: kern/sys_futex.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_futex.c,v
diff -u -p -r1.22 sys_futex.c
--- kern/sys_futex.c	14 Aug 2023 07:42:34 -0000	1.22
+++ kern/sys_futex.c	15 Nov 2024 09:16:38 -0000
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include	/* tick_nsec */
 #include
 
 #ifdef KTRACE
@@ -36,41 +37,65 @@
  * Kernel representation of a futex.
  */
 struct futex {
-	LIST_ENTRY(futex)	 ft_list;	/* list of all futexes */
-	TAILQ_HEAD(, proc)	 ft_threads;	/* sleeping queue */
+	TAILQ_ENTRY(futex)	 ft_entry;	/* list of all futexes */
+	struct process		*ft_ps;
 	struct uvm_object	*ft_obj;	/* UVM object */
 	struct vm_amap		*ft_amap;	/* UVM amap */
 	voff_t			 ft_off;	/* UVM offset */
-	unsigned int		 ft_refcnt;	/* # of references */
+
+	struct proc * volatile	 ft_proc;
 };
 
-/* Syscall helpers. */
-int	 futex_wait(uint32_t *, uint32_t, const struct timespec *, int);
-int	 futex_wake(uint32_t *, uint32_t, int);
-int	 futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int);
-
-/* Flags for futex_get(). */
-#define FT_CREATE	0x1	/* Create a futex if it doesn't exist. */
-#define FT_PRIVATE	0x2	/* Futex is process-private. */
+static int
+futex_is_eq(const struct futex *a, const struct futex *b)
+{
+	return (a->ft_off == b->ft_off &&
+	    a->ft_ps == b->ft_ps &&
+	    a->ft_obj == b->ft_obj &&
+	    a->ft_amap == b->ft_amap);
+}
 
-struct futex *futex_get(uint32_t *, int);
-void	 futex_put(struct futex *);
+TAILQ_HEAD(futexen, futex);
 
-/*
- * The global futex lock serializes futex(2) calls so that no wakeup
- * event is lost, and protects all futex lists and futex states.
- */
-struct rwlock ftlock = RWLOCK_INITIALIZER("futex");
-static struct futex_list ftlist_shared =
-	LIST_HEAD_INITIALIZER(ftlist_shared);
-struct pool ftpool;
+struct futex_bucket {
+	struct futexen	fb_list;
+	struct rwlock	fb_lock;
+	uint32_t	fb_id;		/* for lock ordering */
+} __aligned(64);
 
+/* Syscall helpers. */
+static int	 futex_wait(struct proc *, uint32_t *, uint32_t,
+		     const struct timespec *, int);
+static int	 futex_wake(struct proc *, uint32_t *, uint32_t, int,
+		     register_t *);
+static int	 futex_requeue(struct proc *, uint32_t *, uint32_t,
+		     uint32_t *, uint32_t, int, register_t *);
+
+/* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */
+#define FT_PRIVATE	FUTEX_PRIVATE_FLAG	/* Futex is process-private. */
+
+#define FUTEX_BUCKET_BITS	6
+#define FUTEX_BUCKET_SIZE	(1U << FUTEX_BUCKET_BITS)
+#define FUTEX_BUCKET_MASK	(FUTEX_BUCKET_SIZE - 1)
+
+static struct futex_bucket futex_hash[FUTEX_BUCKET_SIZE];
 
 void
 futex_init(void)
 {
-	pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE,
-	    PR_WAITOK | PR_RWLOCK, "futexpl", NULL);
+	struct futex_bucket *fb;
+	unsigned int i;
+
+	for (i = 0; i < nitems(futex_hash); i++) {
+		fb = &futex_hash[i];
+
+		TAILQ_INIT(&fb->fb_list);
+		rw_init(&fb->fb_lock, "futexlk");
+
+		fb->fb_id = arc4random();
+		fb->fb_id &= ~FUTEX_BUCKET_MASK;
+		fb->fb_id |= i;
+	}
 }
 
 int
@@ -88,65 +113,51 @@ sys_futex(struct proc *p, void *v, regis
 	uint32_t val = SCARG(uap, val);
 	const struct timespec *timeout = SCARG(uap, timeout);
 	void *g = SCARG(uap, g);
-	int flags = 0;
+	int flags = op & FUTEX_FLAG_MASK;
 	int error = 0;
 
-	if (op & FUTEX_PRIVATE_FLAG)
-		flags |= FT_PRIVATE;
-
-	rw_enter_write(&ftlock);
-	switch (op) {
+	switch (op & FUTEX_OP_MASK) {
 	case FUTEX_WAIT:
-	case FUTEX_WAIT_PRIVATE:
-		error = futex_wait(uaddr, val, timeout, flags);
+		error = futex_wait(p, uaddr, val, timeout, flags);
 		break;
 	case FUTEX_WAKE:
-	case FUTEX_WAKE_PRIVATE:
-		*retval = futex_wake(uaddr, val, flags);
+		error = futex_wake(p, uaddr, val, flags, retval);
 		break;
 	case FUTEX_REQUEUE:
-	case FUTEX_REQUEUE_PRIVATE:
-		*retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags);
+		error = futex_requeue(p, uaddr, val, g,
+		    (u_long)timeout, flags, retval);
 		break;
 	default:
 		error = ENOSYS;
 		break;
 	}
-	rw_exit_write(&ftlock);
 
 	return error;
 }
 
-/*
- * Return an existing futex matching userspace address ``uaddr''.
- *
- * If such futex does not exist and FT_CREATE is given, create it.
- */
-struct futex *
-futex_get(uint32_t *uaddr, int flags)
+static void
+futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags)
 {
-	struct proc *p = curproc;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry;
 	struct uvm_object *obj = NULL;
 	struct vm_amap *amap = NULL;
 	voff_t off = (vaddr_t)uaddr;
-	struct futex *f;
-	struct futex_list *ftlist = &p->p_p->ps_ftlist;
+	struct process *ps;
 
-	rw_assert_wrlock(&ftlock);
+	if (ISSET(flags, FT_PRIVATE))
+		ps = p->p_p;
+	else {
+		ps = NULL;
 
-	if (!(flags & FT_PRIVATE)) {
 		vm_map_lock_read(map);
 		if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
 		    entry->inheritance == MAP_INHERIT_SHARE) {
 			if (UVM_ET_ISOBJ(entry)) {
-				ftlist = &ftlist_shared;
 				obj = entry->object.uvm_obj;
 				off = entry->offset +
 				    ((vaddr_t)uaddr - entry->start);
 			} else if (entry->aref.ar_amap) {
-				ftlist = &ftlist_shared;
 				amap = entry->aref.ar_amap;
 				off = ptoa(entry->aref.ar_pageoff) +
 				    ((vaddr_t)uaddr - entry->start);
@@ -155,47 +166,19 @@ futex_get(uint32_t *uaddr, int flags)
 		vm_map_unlock_read(map);
 	}
 
-	LIST_FOREACH(f, ftlist, ft_list) {
-		if (f->ft_obj == obj && f->ft_amap == amap &&
-		    f->ft_off == off) {
-			f->ft_refcnt++;
-			break;
-		}
-	}
-
-	if ((f == NULL) && (flags & FT_CREATE)) {
-		/*
-		 * We rely on the rwlock to ensure that no other thread
-		 * create the same futex.
-		 */
-		f = pool_get(&ftpool, PR_WAITOK);
-		TAILQ_INIT(&f->ft_threads);
-		f->ft_obj = obj;
-		f->ft_amap = amap;
-		f->ft_off = off;
-		f->ft_refcnt = 1;
-		LIST_INSERT_HEAD(ftlist, f, ft_list);
-	}
-
-	return f;
+	f->ft_ps = ps;
+	f->ft_obj = obj;
+	f->ft_amap = amap;
+	f->ft_off = off;
 }
 
-/*
- * Release a given futex.
- */
-void
-futex_put(struct futex *f)
+static inline struct futex_bucket *
+futex_get_bucket(struct futex *f)
 {
-	rw_assert_wrlock(&ftlock);
-
-	KASSERT(f->ft_refcnt > 0);
+	uint32_t key = f->ft_off >> 3; /* watevs */
+	key ^= key >> FUTEX_BUCKET_BITS;
 
-	--f->ft_refcnt;
-	if (f->ft_refcnt == 0) {
-		KASSERT(TAILQ_EMPTY(&f->ft_threads));
-		LIST_REMOVE(f, ft_list);
-		pool_put(&ftpool, f);
-	}
+	return (&futex_hash[key & FUTEX_BUCKET_MASK]);
 }
 
 /*
@@ -203,32 +186,16 @@ futex_put(struct futex *f)
  * ``uaddr''.  Let it sleep for the specified ``timeout'' time, or
  * indefinitely if the argument is NULL.
  */
-int
-futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout,
-    int flags)
+static int
+futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val,
+    const struct timespec *timeout, int flags)
 {
-	struct proc *p = curproc;
-	struct futex *f;
+	struct futex f;
+	struct futex_bucket *fb;
 	uint64_t nsecs = INFSLP;
 	uint32_t cval;
 	int error;
 
-	/*
-	 * After reading the value a race is still possible but
-	 * we deal with it by serializing all futex syscalls.
-	 */
-	rw_assert_wrlock(&ftlock);
-
-	/*
-	 * Read user space futex value
-	 */
-	if ((error = copyin32(uaddr, &cval)))
-		return error;
-
-	/* If the value changed, stop here. */
-	if (cval != val)
-		return EAGAIN;
-
 	if (timeout != NULL) {
 		struct timespec ts;
 
@@ -243,29 +210,79 @@ futex_wait(uint32_t *uaddr, uint32_t val
 		nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
 	}
 
-	f = futex_get(uaddr, flags | FT_CREATE);
-	TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
-	p->p_futex = f;
-
-	error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs);
-	if (error == ERESTART)
-		error = ECANCELED;
-	else if (error == EWOULDBLOCK) {
-		/* A race occurred between a wakeup and a timeout. */
-		if (p->p_futex == NULL)
-			error = 0;
-		else
-			error = ETIMEDOUT;
+	futex_addrs(p, &f, uaddr, flags);
+	fb = futex_get_bucket(&f);
+
+	/*
+	 * After reading the value a race is still possible but
+	 * we deal with it by serializing futex syscalls.
+	 */
+	error = rw_enter(&fb->fb_lock, RW_WRITE|RW_INTR);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Read user space futex value
+	 */
+	if ((error = copyin32(uaddr, &cval)) != 0)
+		goto exit;
+
+	/* If the value changed, stop here. */
+	if (cval != val) {
+		error = EAGAIN;
+		goto exit;
 	}
 
+	f.ft_proc = p;
+	TAILQ_INSERT_TAIL(&fb->fb_list, &f, ft_entry);
+
+	error = rwsleep_nsec(&f, &fb->fb_lock,
+	    PWAIT|PCATCH|PNORELOCK, "fsleep", nsecs);
+
 	/* Remove ourself if we haven't been awaken. */
-	if ((f = p->p_futex) != NULL) {
-		p->p_futex = NULL;
-		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
-		futex_put(f);
+	if (error != 0 || f.ft_proc != NULL) {
+		rw_enter_write(&fb->fb_lock);
+		if (f.ft_proc != NULL)
+			TAILQ_REMOVE(&fb->fb_list, &f, ft_entry);
+		else
+			error = 0;
+		rw_exit_write(&fb->fb_lock);
+
+		switch (error) {
+		case ERESTART:
+			error = ECANCELED;
+			break;
+		case EWOULDBLOCK:
+			error = ETIMEDOUT;
+			break;
+		}
 	}
 
 	return error;
+exit:
+	rw_exit_write(&fb->fb_lock);
+	return error;
+}
+
+static void
+futexen_wakeup(struct futexen *fl)
+{
+	struct futex *f, *nf;
+	struct proc *p;
+
+	/*
+	 * take care to avoid referencing f after we set ft_proc
+	 * to NULL (and wake the associated thread up). f is on the
+	 * stack of the thread we're trying let out of the kernel,
+	 * so it can go away.
+	 */
+
+	SCHED_LOCK();
+	TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) {
+		p = f->ft_proc;
+		f->ft_proc = NULL;
+		wakeup_proc(p, 0);
+	}
+	SCHED_UNLOCK();
 }
 
 /*
@@ -273,46 +290,135 @@ futex_wait(uint32_t *uaddr, uint32_t val
 * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
 * address ``uaddr2''.
 */
-int
-futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m,
-    int flags)
+static int
+futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n,
+    uint32_t *uaddr2, uint32_t m, int flags, register_t *retval)
 {
-	struct futex *f, *g;
-	struct proc *p;
+	struct futexen fl = TAILQ_HEAD_INITIALIZER(fl);
+	struct futex okey, nkey;
+	struct futex *f, *nf, *mf = NULL;
+	struct futex_bucket *ofb, *nfb;
 	uint32_t count = 0;
 
-	rw_assert_wrlock(&ftlock);
+	if (m == 0)
+		return futex_wake(p, uaddr, n, flags, retval);
 
-	f = futex_get(uaddr, flags);
-	if (f == NULL)
-		return 0;
+	futex_addrs(p, &okey, uaddr, flags);
+	ofb = futex_get_bucket(&okey);
+	futex_addrs(p, &nkey, uaddr2, flags);
+	nfb = futex_get_bucket(&nkey);
+
+	if (ofb->fb_id < nfb->fb_id) {
+		rw_enter_write(&ofb->fb_lock);
+		rw_enter_write(&nfb->fb_lock);
+	} else if (ofb->fb_id > nfb->fb_id) {
+		rw_enter_write(&nfb->fb_lock);
+		rw_enter_write(&ofb->fb_lock);
+	} else
+		rw_enter_write(&ofb->fb_lock);
+
+	TAILQ_FOREACH_SAFE(f, &ofb->fb_list, ft_entry, nf) {
+		/* __builtin_prefetch(nf, 1); */
+		KASSERT(f->ft_proc != NULL);
+
+		if (!futex_is_eq(f, &okey))
+			continue;
 
-	while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
-		p->p_futex = NULL;
-		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
-		futex_put(f);
-
-		if (count < n) {
-			wakeup_one(p);
-		} else if (uaddr2 != NULL) {
-			g = futex_get(uaddr2, FT_CREATE);
-			TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
-			p->p_futex = g;
+		TAILQ_REMOVE(&ofb->fb_list, f, ft_entry);
+		TAILQ_INSERT_TAIL(&fl, f, ft_entry);
+
+		if (++count == n) {
+			mf = nf;
+			break;
		}
-		count++;
 	}
 
-	futex_put(f);
+	if (!TAILQ_EMPTY(&fl))
+		futexen_wakeup(&fl);
+
+	/* update matching futexes */
+	if (mf != NULL) {
+		/*
+		 * only iterate from the current entry to the tail
+		 * of the list as it is now in case we're requeueing
+		 * on the end of the same list.
+		 */
+		nf = TAILQ_LAST(&ofb->fb_list, futexen);
+		do {
+			f = mf;
+			mf = TAILQ_NEXT(f, ft_entry);
+			/* __builtin_prefetch(mf, 1); */
+
+			if (!futex_is_eq(f, &okey))
+				continue;
+
+			TAILQ_REMOVE(&ofb->fb_list, f, ft_entry);
+			/* it should only be ft_off that changes, but eh */
+			f->ft_ps = nkey.ft_ps;
+			f->ft_obj = nkey.ft_obj;
+			f->ft_amap = nkey.ft_amap;
+			f->ft_off = nkey.ft_off;
+			TAILQ_INSERT_TAIL(&nfb->fb_list, f, ft_entry);
+
+			if (--m == 0)
+				break;
+		} while (f != nf);
+	}
+
+	if (ofb->fb_id != nfb->fb_id)
+		rw_exit_write(&nfb->fb_lock);
+	rw_exit_write(&ofb->fb_lock);
 
-	return count;
+	*retval = count;
+	return 0;
 }
 
 /*
 * Wakeup at most ``n'' sibling threads sleeping on a futex at address
 * ``uaddr''.
 */
-int
-futex_wake(uint32_t *uaddr, uint32_t n, int flags)
+static int
+futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags,
+    register_t *retval)
 {
-	return futex_requeue(uaddr, n, NULL, 0, flags);
+	struct futexen fl = TAILQ_HEAD_INITIALIZER(fl);
+	struct futex key;
+	struct futex *f, *nf;
+	struct futex_bucket *fb;
+	int count = 0;
+	int error;
+
+	if (n == 0) {
+		*retval = 0;
+		return 0;
+	}
+
+	futex_addrs(p, &key, uaddr, flags);
+	fb = futex_get_bucket(&key);
+
+	error = rw_enter(&fb->fb_lock, RW_WRITE|RW_INTR);
+	if (error != 0)
+		return error;
+
+	TAILQ_FOREACH_SAFE(f, &fb->fb_list, ft_entry, nf) {
+		/* __builtin_prefetch(nf, 1); */
+		KASSERT(f->ft_proc != NULL);
+
+		if (!futex_is_eq(f, &key))
+			continue;
+
+		TAILQ_REMOVE(&fb->fb_list, f, ft_entry);
+		TAILQ_INSERT_TAIL(&fl, f, ft_entry);
+
+		if (++count == n)
+			break;
+	}
+
+	if (!TAILQ_EMPTY(&fl))
+		futexen_wakeup(&fl);
+
+	rw_exit_write(&fb->fb_lock);
+
+	*retval = count;
+	return 0;
 }
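
if anyone wants to poke at the wait/wake path without building
firefox, here's a minimal smoke test against the futex(2) wrapper
declared in sys/futex.h. it's an illustration, not part of the diff:
one thread parks on a word, the main thread publishes a new value and
wakes it. build with -lpthread.

#include <sys/futex.h>

#include <err.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static volatile uint32_t word = 0;

static void *
waiter(void *arg)
{
        /* sleep while word is still 0; EAGAIN means the wake beat us in */
        while (word == 0) {
                if (futex(&word, FUTEX_WAIT_PRIVATE, 0, NULL, NULL) == -1 &&
                    errno != EAGAIN)
                        err(1, "futex wait");
        }

        return (NULL);
}

int
main(void)
{
        pthread_t t;

        if (pthread_create(&t, NULL, waiter, NULL) != 0)
                errx(1, "pthread_create");

        word = 1;       /* publish the new value before waking */
        if (futex(&word, FUTEX_WAKE_PRIVATE, 1, NULL, NULL) == -1)
                err(1, "futex wake");

        if (pthread_join(t, NULL) != 0)
                errx(1, "pthread_join");
        printf("woken\n");

        return (0);
}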