From: Martin Pieuchot Subject: Re: avoid lock contention in futex syscalls To: David Gwynne Cc: tech@openbsd.org Date: Fri, 2 May 2025 12:33:51 +0200 On 02/05/25(Fri) 12:24, David Gwynne wrote: > On Fri, May 02, 2025 at 10:10:42AM +1000, David Gwynne wrote: > > > > if i get time at work i'll try and update the code today. > > i think this addresses most of the stuff mpi@ asked about. > > renaming struct futexen to futex_list collided with the futex bits in > proc.h, so i removed them. they're unecessary now anyway. One tweak, we try to keep capital letter [F] in locking comments for global locks. Since here it is a per-sleepqueue lock, I'd suggest using [f]. Ok with me. > Index: sys/futex.h > =================================================================== > RCS file: /cvs/src/sys/sys/futex.h,v > diff -u -p -r1.2 futex.h > --- sys/futex.h 3 Jun 2018 15:09:26 -0000 1.2 > +++ sys/futex.h 2 May 2025 02:19:46 -0000 > @@ -28,11 +28,15 @@ int futex(volatile uint32_t *, int, int, > __END_DECLS > #endif /* ! _KERNEL */ > > +#define FUTEX_OP_MASK 0x007f > + > #define FUTEX_WAIT 1 > #define FUTEX_WAKE 2 > #define FUTEX_REQUEUE 3 > > -#define FUTEX_PRIVATE_FLAG 128 > +#define FUTEX_FLAG_MASK 0xff80 > + > +#define FUTEX_PRIVATE_FLAG 0x0080 > > #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) > #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) > Index: sys/proc.h > =================================================================== > RCS file: /cvs/src/sys/sys/proc.h,v > diff -u -p -r1.386 proc.h > --- sys/proc.h 1 May 2025 01:16:42 -0000 1.386 > +++ sys/proc.h 2 May 2025 02:19:47 -0000 > @@ -116,8 +125,6 @@ struct tusage { > * run-time information needed by threads. > */ > #ifdef __need_process > -struct futex; > -LIST_HEAD(futex_list, futex); > struct proc; > struct tslpentry; > TAILQ_HEAD(tslpqueue, tslpentry); > @@ -178,7 +185,6 @@ struct process { > struct vmspace *ps_vmspace; /* Address space */ > pid_t ps_pid; /* [I] Process identifier. */ > > - struct futex_list ps_ftlist; /* futexes attached to this process */ > struct tslpqueue ps_tslpqueue; /* [p] queue of threads in thrsleep */ > struct rwlock ps_lock; /* per-process rwlock */ > struct mutex ps_mtx; /* per-process mutex */ > @@ -343,9 +349,6 @@ struct proc { > > struct process *p_p; /* [I] The process of this thread. */ > TAILQ_ENTRY(proc) p_thr_link; /* [K|m] Threads in a process linkage. */ > - > - TAILQ_ENTRY(proc) p_fut_link; /* Threads in a futex linkage. */ > - struct futex *p_futex; /* Current sleeping futex. */ > > /* substructures: */ > struct filedesc *p_fd; /* copy of p_p->ps_fd */ > Index: kern/kern_fork.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_fork.c,v > diff -u -p -r1.270 kern_fork.c > --- kern/kern_fork.c 14 Apr 2025 09:15:24 -0000 1.270 > +++ kern/kern_fork.c 2 May 2025 02:19:47 -0000 > @@ -196,7 +196,6 @@ process_initialize(struct process *pr, s > > LIST_INIT(&pr->ps_children); > LIST_INIT(&pr->ps_orphans); > - LIST_INIT(&pr->ps_ftlist); > LIST_INIT(&pr->ps_sigiolst); > TAILQ_INIT(&pr->ps_tslpqueue); > > Index: kern/sys_futex.c > =================================================================== > RCS file: /cvs/src/sys/kern/sys_futex.c,v > diff -u -p -r1.22 sys_futex.c > --- kern/sys_futex.c 14 Aug 2023 07:42:34 -0000 1.22 > +++ kern/sys_futex.c 2 May 2025 02:19:47 -0000 > @@ -24,6 +24,8 @@ > #include > #include > #include > +#include /* CACHELINESIZE */ > +#include /* tick_nsec */ > #include > > #ifdef KTRACE > @@ -33,44 +35,98 @@ > #include > > /* > + * Locks used to protect variables in this file: > + * > + * I immutable after initialization > + * F futex_slpque fsq_lock > + */ > + > +/* > * Kernel representation of a futex. > + * > + * The userland address that the futex is waiting on is represented by > + * ft_ps, ft_obj, ft_amap, and ft_off. > + * > + * Whether the futex is waiting or woken up is represented by the > + * ft_proc pointer being set (ie, not NULL) or not (ie, NULL) respectively. > + * When the futex is waiting it is referenced by the list in a > + * futex_slpque. When the futex gets woken up, it is removed from the > + * list and the ft_proc pointer is cleared to indicate that the reference > + * held by the list has been released. Only a thread holding the lock > + * may remove the futex from the list and clear ft_proc. This is true > + * even for futex_wait(). > + * > + * However, futex_wait() may read ft_proc without the lock so it can > + * avoid contending with the thread that just woke it up. This means > + * that once ft_proc is cleared, futex_wait() may return, the struct > + * futex will no longer exist, and it is no longer safe to access it > + * from the wakeup side. > + * > + * tl;dr: the thread holding the slpque lock "owns" the references > + * to the futexes on the list until it clears ft_proc. > */ > + > struct futex { > - LIST_ENTRY(futex) ft_list; /* list of all futexes */ > - TAILQ_HEAD(, proc) ft_threads; /* sleeping queue */ > - struct uvm_object *ft_obj; /* UVM object */ > - struct vm_amap *ft_amap; /* UVM amap */ > - voff_t ft_off; /* UVM offset */ > - unsigned int ft_refcnt; /* # of references */ > + TAILQ_ENTRY(futex) ft_entry; /* [F] entry on futex_slpque */ > + > + struct process *ft_ps; /* [I] for private futexes */ > + struct uvm_object *ft_obj; /* [F] UVM object */ > + struct vm_amap *ft_amap; /* [F] UVM amap */ > + volatile voff_t ft_off; /* [F] UVM offset */ > + > + struct proc * volatile ft_proc; /* [F] waiting thread */ > }; > > -/* Syscall helpers. */ > -int futex_wait(uint32_t *, uint32_t, const struct timespec *, int); > -int futex_wake(uint32_t *, uint32_t, int); > -int futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int); > - > -/* Flags for futex_get(). */ > -#define FT_CREATE 0x1 /* Create a futex if it doesn't exist. */ > -#define FT_PRIVATE 0x2 /* Futex is process-private. */ > +static int > +futex_is_eq(const struct futex *a, const struct futex *b) > +{ > + return (a->ft_off == b->ft_off && > + a->ft_ps == b->ft_ps && > + a->ft_obj == b->ft_obj && > + a->ft_amap == b->ft_amap); > +} > > -struct futex *futex_get(uint32_t *, int); > -void futex_put(struct futex *); > +TAILQ_HEAD(futex_list, futex); > > -/* > - * The global futex lock serializes futex(2) calls so that no wakeup > - * event is lost, and protects all futex lists and futex states. > - */ > -struct rwlock ftlock = RWLOCK_INITIALIZER("futex"); > -static struct futex_list ftlist_shared = > - LIST_HEAD_INITIALIZER(ftlist_shared); > -struct pool ftpool; > +struct futex_slpque { > + struct futex_list fsq_list; /* [F] */ > + struct rwlock fsq_lock; > + uint32_t fsq_id; /* [I] for lock ordering */ > +} __aligned(CACHELINESIZE); > + > +/* Syscall helpers. */ > +static int futex_wait(struct proc *, uint32_t *, uint32_t, > + const struct timespec *, int); > +static int futex_wake(struct proc *, uint32_t *, uint32_t, int, > + register_t *); > +static int futex_requeue(struct proc *, uint32_t *, uint32_t, > + uint32_t *, uint32_t, int, register_t *); > + > +/* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */ > +#define FT_PRIVATE FUTEX_PRIVATE_FLAG /* Futex is process-private. */ > + > +#define FUTEX_SLPQUES_BITS 6 > +#define FUTEX_SLPQUES_SIZE (1U << FUTEX_SLPQUES_BITS) > +#define FUTEX_SLPQUES_MASK (FUTEX_SLPQUES_SIZE - 1) > > +static struct futex_slpque futex_slpques[FUTEX_SLPQUES_SIZE]; > > void > futex_init(void) > { > - pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE, > - PR_WAITOK | PR_RWLOCK, "futexpl", NULL); > + struct futex_slpque *fsq; > + unsigned int i; > + > + for (i = 0; i < nitems(futex_slpques); i++) { > + fsq = &futex_slpques[i]; > + > + TAILQ_INIT(&fsq->fsq_list); > + rw_init(&fsq->fsq_lock, "futexlk"); > + > + fsq->fsq_id = arc4random(); > + fsq->fsq_id &= ~FUTEX_SLPQUES_MASK; > + fsq->fsq_id |= i; > + } > } > > int > @@ -88,65 +144,51 @@ sys_futex(struct proc *p, void *v, regis > uint32_t val = SCARG(uap, val); > const struct timespec *timeout = SCARG(uap, timeout); > void *g = SCARG(uap, g); > - int flags = 0; > + int flags = op & FUTEX_FLAG_MASK; > int error = 0; > > - if (op & FUTEX_PRIVATE_FLAG) > - flags |= FT_PRIVATE; > - > - rw_enter_write(&ftlock); > - switch (op) { > + switch (op & FUTEX_OP_MASK) { > case FUTEX_WAIT: > - case FUTEX_WAIT_PRIVATE: > - error = futex_wait(uaddr, val, timeout, flags); > + error = futex_wait(p, uaddr, val, timeout, flags); > break; > case FUTEX_WAKE: > - case FUTEX_WAKE_PRIVATE: > - *retval = futex_wake(uaddr, val, flags); > + error = futex_wake(p, uaddr, val, flags, retval); > break; > case FUTEX_REQUEUE: > - case FUTEX_REQUEUE_PRIVATE: > - *retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags); > + error = futex_requeue(p, uaddr, val, g, > + (u_long)timeout, flags, retval); > break; > default: > error = ENOSYS; > break; > } > - rw_exit_write(&ftlock); > > return error; > } > > -/* > - * Return an existing futex matching userspace address ``uaddr''. > - * > - * If such futex does not exist and FT_CREATE is given, create it. > - */ > -struct futex * > -futex_get(uint32_t *uaddr, int flags) > +static void > +futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags) > { > - struct proc *p = curproc; > vm_map_t map = &p->p_vmspace->vm_map; > vm_map_entry_t entry; > struct uvm_object *obj = NULL; > struct vm_amap *amap = NULL; > voff_t off = (vaddr_t)uaddr; > - struct futex *f; > - struct futex_list *ftlist = &p->p_p->ps_ftlist; > + struct process *ps; > > - rw_assert_wrlock(&ftlock); > + if (ISSET(flags, FT_PRIVATE)) > + ps = p->p_p; > + else { > + ps = NULL; > > - if (!(flags & FT_PRIVATE)) { > vm_map_lock_read(map); > if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) && > entry->inheritance == MAP_INHERIT_SHARE) { > if (UVM_ET_ISOBJ(entry)) { > - ftlist = &ftlist_shared; > obj = entry->object.uvm_obj; > off = entry->offset + > ((vaddr_t)uaddr - entry->start); > } else if (entry->aref.ar_amap) { > - ftlist = &ftlist_shared; > amap = entry->aref.ar_amap; > off = ptoa(entry->aref.ar_pageoff) + > ((vaddr_t)uaddr - entry->start); > @@ -155,47 +197,47 @@ futex_get(uint32_t *uaddr, int flags) > vm_map_unlock_read(map); > } > > - LIST_FOREACH(f, ftlist, ft_list) { > - if (f->ft_obj == obj && f->ft_amap == amap && > - f->ft_off == off) { > - f->ft_refcnt++; > - break; > - } > - } > + f->ft_ps = ps; > + f->ft_obj = obj; > + f->ft_amap = amap; > + f->ft_off = off; > +} > > - if ((f == NULL) && (flags & FT_CREATE)) { > - /* > - * We rely on the rwlock to ensure that no other thread > - * create the same futex. > - */ > - f = pool_get(&ftpool, PR_WAITOK); > - TAILQ_INIT(&f->ft_threads); > - f->ft_obj = obj; > - f->ft_amap = amap; > - f->ft_off = off; > - f->ft_refcnt = 1; > - LIST_INSERT_HEAD(ftlist, f, ft_list); > - } > +static inline struct futex_slpque * > +futex_get_slpque(struct futex *f) > +{ > + uint32_t key = f->ft_off >> 3; /* watevs */ > + key ^= key >> FUTEX_SLPQUES_BITS; > > - return f; > + return (&futex_slpques[key & FUTEX_SLPQUES_MASK]); > } > > -/* > - * Release a given futex. > - */ > -void > -futex_put(struct futex *f) > +static int > +futex_unwait(struct futex_slpque *ofsq, struct futex *f) > { > - rw_assert_wrlock(&ftlock); > + struct futex_slpque *fsq; > + int rv; > > - KASSERT(f->ft_refcnt > 0); > + /* > + * REQUEUE can move a futex between buckets, so follow it if needed. > + */ > > - --f->ft_refcnt; > - if (f->ft_refcnt == 0) { > - KASSERT(TAILQ_EMPTY(&f->ft_threads)); > - LIST_REMOVE(f, ft_list); > - pool_put(&ftpool, f); > + for (;;) { > + rw_enter_write(&ofsq->fsq_lock); > + fsq = futex_get_slpque(f); > + if (ofsq == fsq) > + break; > + > + rw_exit_write(&ofsq->fsq_lock); > + ofsq = fsq; > } > + > + rv = f->ft_proc != NULL; > + if (rv) > + TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry); > + rw_exit_write(&fsq->fsq_lock); > + > + return (rv); > } > > /* > @@ -203,34 +245,19 @@ futex_put(struct futex *f) > * ``uaddr''. Let it sleep for the specified ``timeout'' time, or > * indefinitely if the argument is NULL. > */ > -int > -futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout, > - int flags) > +static int > +futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val, > + const struct timespec *timeout, int flags) > { > - struct proc *p = curproc; > - struct futex *f; > - uint64_t nsecs = INFSLP; > + struct futex f; > + struct futex_slpque *fsq; > + uint64_t to_ticks = 0; > uint32_t cval; > int error; > > - /* > - * After reading the value a race is still possible but > - * we deal with it by serializing all futex syscalls. > - */ > - rw_assert_wrlock(&ftlock); > - > - /* > - * Read user space futex value > - */ > - if ((error = copyin32(uaddr, &cval))) > - return error; > - > - /* If the value changed, stop here. */ > - if (cval != val) > - return EAGAIN; > - > if (timeout != NULL) { > struct timespec ts; > + uint64_t nsecs; > > if ((error = copyin(timeout, &ts, sizeof(ts)))) > return error; > @@ -240,32 +267,85 @@ futex_wait(uint32_t *uaddr, uint32_t val > #endif > if (ts.tv_sec < 0 || !timespecisvalid(&ts)) > return EINVAL; > + > nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP)); > + to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1; > + if (to_ticks > INT_MAX) > + to_ticks = INT_MAX; > } > > - f = futex_get(uaddr, flags | FT_CREATE); > - TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link); > - p->p_futex = f; > - > - error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs); > - if (error == ERESTART) > - error = ECANCELED; > - else if (error == EWOULDBLOCK) { > - /* A race occurred between a wakeup and a timeout. */ > - if (p->p_futex == NULL) > - error = 0; > - else > - error = ETIMEDOUT; > + futex_addrs(p, &f, uaddr, flags); > + fsq = futex_get_slpque(&f); > + > + /* Mark futex as waiting. */ > + f.ft_proc = p; > + rw_enter_write(&fsq->fsq_lock); > + /* Make the waiting futex visible to wake/requeue */ > + TAILQ_INSERT_TAIL(&fsq->fsq_list, &f, ft_entry); > + rw_exit_write(&fsq->fsq_lock); > + > + /* > + * Do not return before f has been removed from the slpque! > + */ > + > + /* > + * Read user space futex value > + */ > + if ((error = copyin32(uaddr, &cval)) != 0) > + goto exit; > + > + /* If the value changed, stop here. */ > + if (cval != val) { > + error = EAGAIN; > + goto exit; > } > > + sleep_setup(&f, PWAIT|PCATCH, "fsleep"); > + error = sleep_finish(to_ticks, f.ft_proc != NULL); > /* Remove ourself if we haven't been awaken. */ > - if ((f = p->p_futex) != NULL) { > - p->p_futex = NULL; > - TAILQ_REMOVE(&f->ft_threads, p, p_fut_link); > - futex_put(f); > + if (error != 0 || f.ft_proc != NULL) { > + if (futex_unwait(fsq, &f) == 0) > + error = 0; > + > + switch (error) { > + case ERESTART: > + error = ECANCELED; > + break; > + case EWOULDBLOCK: > + error = ETIMEDOUT; > + break; > + default: > + break; > + } > } > > return error; > +exit: > + if (f.ft_proc != NULL) > + futex_unwait(fsq, &f); > + return error; > +} > + > +static void > +futex_list_wakeup(struct futex_list *fl) > +{ > + struct futex *f, *nf; > + struct proc *p; > + > + /* > + * Setting ft_proc to NULL releases the futex reference > + * currently held via the slpque lock. > + * > + * SCHED_LOCK is only needed to call wakeup_proc. > + */ > + > + SCHED_LOCK(); > + TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) { > + p = f->ft_proc; > + f->ft_proc = NULL; > + wakeup_proc(p); > + } > + SCHED_UNLOCK(); > } > > /* > @@ -273,46 +353,133 @@ futex_wait(uint32_t *uaddr, uint32_t val > * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at > * address ``uaddr2''. > */ > -int > -futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m, > - int flags) > +static int > +futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n, > + uint32_t *uaddr2, uint32_t m, int flags, register_t *retval) > { > - struct futex *f, *g; > - struct proc *p; > + struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl); > + struct futex okey, nkey; > + struct futex *f, *nf, *mf = NULL; > + struct futex_slpque *ofsq, *nfsq; > uint32_t count = 0; > > - rw_assert_wrlock(&ftlock); > + if (m == 0) > + return futex_wake(p, uaddr, n, flags, retval); > > - f = futex_get(uaddr, flags); > - if (f == NULL) > - return 0; > + futex_addrs(p, &okey, uaddr, flags); > + ofsq = futex_get_slpque(&okey); > + futex_addrs(p, &nkey, uaddr2, flags); > + nfsq = futex_get_slpque(&nkey); > + > + if (ofsq->fsq_id < nfsq->fsq_id) { > + rw_enter_write(&ofsq->fsq_lock); > + rw_enter_write(&nfsq->fsq_lock); > + } else if (ofsq->fsq_id > nfsq->fsq_id) { > + rw_enter_write(&nfsq->fsq_lock); > + rw_enter_write(&ofsq->fsq_lock); > + } else > + rw_enter_write(&ofsq->fsq_lock); > + > + TAILQ_FOREACH_SAFE(f, &ofsq->fsq_list, ft_entry, nf) { > + /* __builtin_prefetch(nf, 1); */ > + KASSERT(f->ft_proc != NULL); > > - while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) { > - p->p_futex = NULL; > - TAILQ_REMOVE(&f->ft_threads, p, p_fut_link); > - futex_put(f); > - > - if (count < n) { > - wakeup_one(p); > - } else if (uaddr2 != NULL) { > - g = futex_get(uaddr2, FT_CREATE); > - TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link); > - p->p_futex = g; > + if (!futex_is_eq(f, &okey)) > + continue; > + > + TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry); > + TAILQ_INSERT_TAIL(&fl, f, ft_entry); > + > + if (++count == n) { > + mf = nf; > + break; > } > - count++; > } > > - futex_put(f); > + if (!TAILQ_EMPTY(&fl)) > + futex_list_wakeup(&fl); > > - return count; > + /* update matching futexes */ > + if (mf != NULL) { > + /* > + * only iterate from the current entry to the tail > + * of the list as it is now in case we're requeueing > + * on the end of the same list. > + */ > + nf = TAILQ_LAST(&ofsq->fsq_list, futex_list); > + do { > + f = mf; > + mf = TAILQ_NEXT(f, ft_entry); > + /* __builtin_prefetch(mf, 1); */ > + > + KASSERT(f->ft_proc != NULL); > + > + if (!futex_is_eq(f, &okey)) > + continue; > + > + TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry); > + f->ft_ps = nkey.ft_ps; > + f->ft_obj = nkey.ft_obj; > + f->ft_amap = nkey.ft_amap; > + f->ft_off = nkey.ft_off; > + TAILQ_INSERT_TAIL(&nfsq->fsq_list, f, ft_entry); > + > + if (--m == 0) > + break; > + } while (f != nf); > + } > + > + if (ofsq->fsq_id != nfsq->fsq_id) > + rw_exit_write(&nfsq->fsq_lock); > + rw_exit_write(&ofsq->fsq_lock); > + > + *retval = count; > + return 0; > } > > /* > * Wakeup at most ``n'' sibling threads sleeping on a futex at address > * ``uaddr''. > */ > -int > -futex_wake(uint32_t *uaddr, uint32_t n, int flags) > +static int > +futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags, > + register_t *retval) > { > - return futex_requeue(uaddr, n, NULL, 0, flags); > + struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl); > + struct futex key; > + struct futex *f, *nf; > + struct futex_slpque *fsq; > + int count = 0; > + > + if (n == 0) { > + *retval = 0; > + return 0; > + } > + > + futex_addrs(p, &key, uaddr, flags); > + fsq = futex_get_slpque(&key); > + > + rw_enter_write(&fsq->fsq_lock); > + > + TAILQ_FOREACH_SAFE(f, &fsq->fsq_list, ft_entry, nf) { > + /* __builtin_prefetch(nf, 1); */ > + KASSERT(f->ft_proc != NULL); > + > + if (!futex_is_eq(f, &key)) > + continue; > + > + TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry); > + TAILQ_INSERT_TAIL(&fl, f, ft_entry); > + > + if (++count == n) > + break; > + } > + > + if (!TAILQ_EMPTY(&fl)) > + futex_list_wakeup(&fl); > + > + rw_exit_write(&fsq->fsq_lock); > + > + *retval = count; > + return 0; > } >