Download raw body.
avoid lock contention in futex syscalls
On 02/05/25(Fri) 12:24, David Gwynne wrote:
> On Fri, May 02, 2025 at 10:10:42AM +1000, David Gwynne wrote:
> >
> > if i get time at work i'll try and update the code today.
>
> i think this addresses most of the stuff mpi@ asked about.
>
> renaming struct futexen to futex_list collided with the futex bits in
> proc.h, so i removed them. they're unecessary now anyway.
One tweak, we try to keep capital letter [F] in locking comments for
global locks. Since here it is a per-sleepqueue lock, I'd suggest
using [f].
Ok with me.
> Index: sys/futex.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/futex.h,v
> diff -u -p -r1.2 futex.h
> --- sys/futex.h 3 Jun 2018 15:09:26 -0000 1.2
> +++ sys/futex.h 2 May 2025 02:19:46 -0000
> @@ -28,11 +28,15 @@ int futex(volatile uint32_t *, int, int,
> __END_DECLS
> #endif /* ! _KERNEL */
>
> +#define FUTEX_OP_MASK 0x007f
> +
> #define FUTEX_WAIT 1
> #define FUTEX_WAKE 2
> #define FUTEX_REQUEUE 3
>
> -#define FUTEX_PRIVATE_FLAG 128
> +#define FUTEX_FLAG_MASK 0xff80
> +
> +#define FUTEX_PRIVATE_FLAG 0x0080
>
> #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
> #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> diff -u -p -r1.386 proc.h
> --- sys/proc.h 1 May 2025 01:16:42 -0000 1.386
> +++ sys/proc.h 2 May 2025 02:19:47 -0000
> @@ -116,8 +125,6 @@ struct tusage {
> * run-time information needed by threads.
> */
> #ifdef __need_process
> -struct futex;
> -LIST_HEAD(futex_list, futex);
> struct proc;
> struct tslpentry;
> TAILQ_HEAD(tslpqueue, tslpentry);
> @@ -178,7 +185,6 @@ struct process {
> struct vmspace *ps_vmspace; /* Address space */
> pid_t ps_pid; /* [I] Process identifier. */
>
> - struct futex_list ps_ftlist; /* futexes attached to this process */
> struct tslpqueue ps_tslpqueue; /* [p] queue of threads in thrsleep */
> struct rwlock ps_lock; /* per-process rwlock */
> struct mutex ps_mtx; /* per-process mutex */
> @@ -343,9 +349,6 @@ struct proc {
>
> struct process *p_p; /* [I] The process of this thread. */
> TAILQ_ENTRY(proc) p_thr_link; /* [K|m] Threads in a process linkage. */
> -
> - TAILQ_ENTRY(proc) p_fut_link; /* Threads in a futex linkage. */
> - struct futex *p_futex; /* Current sleeping futex. */
>
> /* substructures: */
> struct filedesc *p_fd; /* copy of p_p->ps_fd */
> Index: kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> diff -u -p -r1.270 kern_fork.c
> --- kern/kern_fork.c 14 Apr 2025 09:15:24 -0000 1.270
> +++ kern/kern_fork.c 2 May 2025 02:19:47 -0000
> @@ -196,7 +196,6 @@ process_initialize(struct process *pr, s
>
> LIST_INIT(&pr->ps_children);
> LIST_INIT(&pr->ps_orphans);
> - LIST_INIT(&pr->ps_ftlist);
> LIST_INIT(&pr->ps_sigiolst);
> TAILQ_INIT(&pr->ps_tslpqueue);
>
> Index: kern/sys_futex.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sys_futex.c,v
> diff -u -p -r1.22 sys_futex.c
> --- kern/sys_futex.c 14 Aug 2023 07:42:34 -0000 1.22
> +++ kern/sys_futex.c 2 May 2025 02:19:47 -0000
> @@ -24,6 +24,8 @@
> #include <sys/pool.h>
> #include <sys/time.h>
> #include <sys/rwlock.h>
> +#include <sys/percpu.h> /* CACHELINESIZE */
> +#include <sys/kernel.h> /* tick_nsec */
> #include <sys/futex.h>
>
> #ifdef KTRACE
> @@ -33,44 +35,98 @@
> #include <uvm/uvm.h>
>
> /*
> + * Locks used to protect variables in this file:
> + *
> + * I immutable after initialization
> + * F futex_slpque fsq_lock
> + */
> +
> +/*
> * Kernel representation of a futex.
> + *
> + * The userland address that the futex is waiting on is represented by
> + * ft_ps, ft_obj, ft_amap, and ft_off.
> + *
> + * Whether the futex is waiting or woken up is represented by the
> + * ft_proc pointer being set (ie, not NULL) or not (ie, NULL) respectively.
> + * When the futex is waiting it is referenced by the list in a
> + * futex_slpque. When the futex gets woken up, it is removed from the
> + * list and the ft_proc pointer is cleared to indicate that the reference
> + * held by the list has been released. Only a thread holding the lock
> + * may remove the futex from the list and clear ft_proc. This is true
> + * even for futex_wait().
> + *
> + * However, futex_wait() may read ft_proc without the lock so it can
> + * avoid contending with the thread that just woke it up. This means
> + * that once ft_proc is cleared, futex_wait() may return, the struct
> + * futex will no longer exist, and it is no longer safe to access it
> + * from the wakeup side.
> + *
> + * tl;dr: the thread holding the slpque lock "owns" the references
> + * to the futexes on the list until it clears ft_proc.
> */
> +
> struct futex {
> - LIST_ENTRY(futex) ft_list; /* list of all futexes */
> - TAILQ_HEAD(, proc) ft_threads; /* sleeping queue */
> - struct uvm_object *ft_obj; /* UVM object */
> - struct vm_amap *ft_amap; /* UVM amap */
> - voff_t ft_off; /* UVM offset */
> - unsigned int ft_refcnt; /* # of references */
> + TAILQ_ENTRY(futex) ft_entry; /* [F] entry on futex_slpque */
> +
> + struct process *ft_ps; /* [I] for private futexes */
> + struct uvm_object *ft_obj; /* [F] UVM object */
> + struct vm_amap *ft_amap; /* [F] UVM amap */
> + volatile voff_t ft_off; /* [F] UVM offset */
> +
> + struct proc * volatile ft_proc; /* [F] waiting thread */
> };
>
> -/* Syscall helpers. */
> -int futex_wait(uint32_t *, uint32_t, const struct timespec *, int);
> -int futex_wake(uint32_t *, uint32_t, int);
> -int futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int);
> -
> -/* Flags for futex_get(). */
> -#define FT_CREATE 0x1 /* Create a futex if it doesn't exist. */
> -#define FT_PRIVATE 0x2 /* Futex is process-private. */
> +static int
> +futex_is_eq(const struct futex *a, const struct futex *b)
> +{
> + return (a->ft_off == b->ft_off &&
> + a->ft_ps == b->ft_ps &&
> + a->ft_obj == b->ft_obj &&
> + a->ft_amap == b->ft_amap);
> +}
>
> -struct futex *futex_get(uint32_t *, int);
> -void futex_put(struct futex *);
> +TAILQ_HEAD(futex_list, futex);
>
> -/*
> - * The global futex lock serializes futex(2) calls so that no wakeup
> - * event is lost, and protects all futex lists and futex states.
> - */
> -struct rwlock ftlock = RWLOCK_INITIALIZER("futex");
> -static struct futex_list ftlist_shared =
> - LIST_HEAD_INITIALIZER(ftlist_shared);
> -struct pool ftpool;
> +struct futex_slpque {
> + struct futex_list fsq_list; /* [F] */
> + struct rwlock fsq_lock;
> + uint32_t fsq_id; /* [I] for lock ordering */
> +} __aligned(CACHELINESIZE);
> +
> +/* Syscall helpers. */
> +static int futex_wait(struct proc *, uint32_t *, uint32_t,
> + const struct timespec *, int);
> +static int futex_wake(struct proc *, uint32_t *, uint32_t, int,
> + register_t *);
> +static int futex_requeue(struct proc *, uint32_t *, uint32_t,
> + uint32_t *, uint32_t, int, register_t *);
> +
> +/* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */
> +#define FT_PRIVATE FUTEX_PRIVATE_FLAG /* Futex is process-private. */
> +
> +#define FUTEX_SLPQUES_BITS 6
> +#define FUTEX_SLPQUES_SIZE (1U << FUTEX_SLPQUES_BITS)
> +#define FUTEX_SLPQUES_MASK (FUTEX_SLPQUES_SIZE - 1)
>
> +static struct futex_slpque futex_slpques[FUTEX_SLPQUES_SIZE];
>
> void
> futex_init(void)
> {
> - pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE,
> - PR_WAITOK | PR_RWLOCK, "futexpl", NULL);
> + struct futex_slpque *fsq;
> + unsigned int i;
> +
> + for (i = 0; i < nitems(futex_slpques); i++) {
> + fsq = &futex_slpques[i];
> +
> + TAILQ_INIT(&fsq->fsq_list);
> + rw_init(&fsq->fsq_lock, "futexlk");
> +
> + fsq->fsq_id = arc4random();
> + fsq->fsq_id &= ~FUTEX_SLPQUES_MASK;
> + fsq->fsq_id |= i;
> + }
> }
>
> int
> @@ -88,65 +144,51 @@ sys_futex(struct proc *p, void *v, regis
> uint32_t val = SCARG(uap, val);
> const struct timespec *timeout = SCARG(uap, timeout);
> void *g = SCARG(uap, g);
> - int flags = 0;
> + int flags = op & FUTEX_FLAG_MASK;
> int error = 0;
>
> - if (op & FUTEX_PRIVATE_FLAG)
> - flags |= FT_PRIVATE;
> -
> - rw_enter_write(&ftlock);
> - switch (op) {
> + switch (op & FUTEX_OP_MASK) {
> case FUTEX_WAIT:
> - case FUTEX_WAIT_PRIVATE:
> - error = futex_wait(uaddr, val, timeout, flags);
> + error = futex_wait(p, uaddr, val, timeout, flags);
> break;
> case FUTEX_WAKE:
> - case FUTEX_WAKE_PRIVATE:
> - *retval = futex_wake(uaddr, val, flags);
> + error = futex_wake(p, uaddr, val, flags, retval);
> break;
> case FUTEX_REQUEUE:
> - case FUTEX_REQUEUE_PRIVATE:
> - *retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags);
> + error = futex_requeue(p, uaddr, val, g,
> + (u_long)timeout, flags, retval);
> break;
> default:
> error = ENOSYS;
> break;
> }
> - rw_exit_write(&ftlock);
>
> return error;
> }
>
> -/*
> - * Return an existing futex matching userspace address ``uaddr''.
> - *
> - * If such futex does not exist and FT_CREATE is given, create it.
> - */
> -struct futex *
> -futex_get(uint32_t *uaddr, int flags)
> +static void
> +futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags)
> {
> - struct proc *p = curproc;
> vm_map_t map = &p->p_vmspace->vm_map;
> vm_map_entry_t entry;
> struct uvm_object *obj = NULL;
> struct vm_amap *amap = NULL;
> voff_t off = (vaddr_t)uaddr;
> - struct futex *f;
> - struct futex_list *ftlist = &p->p_p->ps_ftlist;
> + struct process *ps;
>
> - rw_assert_wrlock(&ftlock);
> + if (ISSET(flags, FT_PRIVATE))
> + ps = p->p_p;
> + else {
> + ps = NULL;
>
> - if (!(flags & FT_PRIVATE)) {
> vm_map_lock_read(map);
> if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
> entry->inheritance == MAP_INHERIT_SHARE) {
> if (UVM_ET_ISOBJ(entry)) {
> - ftlist = &ftlist_shared;
> obj = entry->object.uvm_obj;
> off = entry->offset +
> ((vaddr_t)uaddr - entry->start);
> } else if (entry->aref.ar_amap) {
> - ftlist = &ftlist_shared;
> amap = entry->aref.ar_amap;
> off = ptoa(entry->aref.ar_pageoff) +
> ((vaddr_t)uaddr - entry->start);
> @@ -155,47 +197,47 @@ futex_get(uint32_t *uaddr, int flags)
> vm_map_unlock_read(map);
> }
>
> - LIST_FOREACH(f, ftlist, ft_list) {
> - if (f->ft_obj == obj && f->ft_amap == amap &&
> - f->ft_off == off) {
> - f->ft_refcnt++;
> - break;
> - }
> - }
> + f->ft_ps = ps;
> + f->ft_obj = obj;
> + f->ft_amap = amap;
> + f->ft_off = off;
> +}
>
> - if ((f == NULL) && (flags & FT_CREATE)) {
> - /*
> - * We rely on the rwlock to ensure that no other thread
> - * create the same futex.
> - */
> - f = pool_get(&ftpool, PR_WAITOK);
> - TAILQ_INIT(&f->ft_threads);
> - f->ft_obj = obj;
> - f->ft_amap = amap;
> - f->ft_off = off;
> - f->ft_refcnt = 1;
> - LIST_INSERT_HEAD(ftlist, f, ft_list);
> - }
> +static inline struct futex_slpque *
> +futex_get_slpque(struct futex *f)
> +{
> + uint32_t key = f->ft_off >> 3; /* watevs */
> + key ^= key >> FUTEX_SLPQUES_BITS;
>
> - return f;
> + return (&futex_slpques[key & FUTEX_SLPQUES_MASK]);
> }
>
> -/*
> - * Release a given futex.
> - */
> -void
> -futex_put(struct futex *f)
> +static int
> +futex_unwait(struct futex_slpque *ofsq, struct futex *f)
> {
> - rw_assert_wrlock(&ftlock);
> + struct futex_slpque *fsq;
> + int rv;
>
> - KASSERT(f->ft_refcnt > 0);
> + /*
> + * REQUEUE can move a futex between buckets, so follow it if needed.
> + */
>
> - --f->ft_refcnt;
> - if (f->ft_refcnt == 0) {
> - KASSERT(TAILQ_EMPTY(&f->ft_threads));
> - LIST_REMOVE(f, ft_list);
> - pool_put(&ftpool, f);
> + for (;;) {
> + rw_enter_write(&ofsq->fsq_lock);
> + fsq = futex_get_slpque(f);
> + if (ofsq == fsq)
> + break;
> +
> + rw_exit_write(&ofsq->fsq_lock);
> + ofsq = fsq;
> }
> +
> + rv = f->ft_proc != NULL;
> + if (rv)
> + TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry);
> + rw_exit_write(&fsq->fsq_lock);
> +
> + return (rv);
> }
>
> /*
> @@ -203,34 +245,19 @@ futex_put(struct futex *f)
> * ``uaddr''. Let it sleep for the specified ``timeout'' time, or
> * indefinitely if the argument is NULL.
> */
> -int
> -futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout,
> - int flags)
> +static int
> +futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val,
> + const struct timespec *timeout, int flags)
> {
> - struct proc *p = curproc;
> - struct futex *f;
> - uint64_t nsecs = INFSLP;
> + struct futex f;
> + struct futex_slpque *fsq;
> + uint64_t to_ticks = 0;
> uint32_t cval;
> int error;
>
> - /*
> - * After reading the value a race is still possible but
> - * we deal with it by serializing all futex syscalls.
> - */
> - rw_assert_wrlock(&ftlock);
> -
> - /*
> - * Read user space futex value
> - */
> - if ((error = copyin32(uaddr, &cval)))
> - return error;
> -
> - /* If the value changed, stop here. */
> - if (cval != val)
> - return EAGAIN;
> -
> if (timeout != NULL) {
> struct timespec ts;
> + uint64_t nsecs;
>
> if ((error = copyin(timeout, &ts, sizeof(ts))))
> return error;
> @@ -240,32 +267,85 @@ futex_wait(uint32_t *uaddr, uint32_t val
> #endif
> if (ts.tv_sec < 0 || !timespecisvalid(&ts))
> return EINVAL;
> +
> nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
> + to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
> + if (to_ticks > INT_MAX)
> + to_ticks = INT_MAX;
> }
>
> - f = futex_get(uaddr, flags | FT_CREATE);
> - TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
> - p->p_futex = f;
> -
> - error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs);
> - if (error == ERESTART)
> - error = ECANCELED;
> - else if (error == EWOULDBLOCK) {
> - /* A race occurred between a wakeup and a timeout. */
> - if (p->p_futex == NULL)
> - error = 0;
> - else
> - error = ETIMEDOUT;
> + futex_addrs(p, &f, uaddr, flags);
> + fsq = futex_get_slpque(&f);
> +
> + /* Mark futex as waiting. */
> + f.ft_proc = p;
> + rw_enter_write(&fsq->fsq_lock);
> + /* Make the waiting futex visible to wake/requeue */
> + TAILQ_INSERT_TAIL(&fsq->fsq_list, &f, ft_entry);
> + rw_exit_write(&fsq->fsq_lock);
> +
> + /*
> + * Do not return before f has been removed from the slpque!
> + */
> +
> + /*
> + * Read user space futex value
> + */
> + if ((error = copyin32(uaddr, &cval)) != 0)
> + goto exit;
> +
> + /* If the value changed, stop here. */
> + if (cval != val) {
> + error = EAGAIN;
> + goto exit;
> }
>
> + sleep_setup(&f, PWAIT|PCATCH, "fsleep");
> + error = sleep_finish(to_ticks, f.ft_proc != NULL);
> /* Remove ourself if we haven't been awaken. */
> - if ((f = p->p_futex) != NULL) {
> - p->p_futex = NULL;
> - TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
> - futex_put(f);
> + if (error != 0 || f.ft_proc != NULL) {
> + if (futex_unwait(fsq, &f) == 0)
> + error = 0;
> +
> + switch (error) {
> + case ERESTART:
> + error = ECANCELED;
> + break;
> + case EWOULDBLOCK:
> + error = ETIMEDOUT;
> + break;
> + default:
> + break;
> + }
> }
>
> return error;
> +exit:
> + if (f.ft_proc != NULL)
> + futex_unwait(fsq, &f);
> + return error;
> +}
> +
> +static void
> +futex_list_wakeup(struct futex_list *fl)
> +{
> + struct futex *f, *nf;
> + struct proc *p;
> +
> + /*
> + * Setting ft_proc to NULL releases the futex reference
> + * currently held via the slpque lock.
> + *
> + * SCHED_LOCK is only needed to call wakeup_proc.
> + */
> +
> + SCHED_LOCK();
> + TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) {
> + p = f->ft_proc;
> + f->ft_proc = NULL;
> + wakeup_proc(p);
> + }
> + SCHED_UNLOCK();
> }
>
> /*
> @@ -273,46 +353,133 @@ futex_wait(uint32_t *uaddr, uint32_t val
> * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
> * address ``uaddr2''.
> */
> -int
> -futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m,
> - int flags)
> +static int
> +futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n,
> + uint32_t *uaddr2, uint32_t m, int flags, register_t *retval)
> {
> - struct futex *f, *g;
> - struct proc *p;
> + struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl);
> + struct futex okey, nkey;
> + struct futex *f, *nf, *mf = NULL;
> + struct futex_slpque *ofsq, *nfsq;
> uint32_t count = 0;
>
> - rw_assert_wrlock(&ftlock);
> + if (m == 0)
> + return futex_wake(p, uaddr, n, flags, retval);
>
> - f = futex_get(uaddr, flags);
> - if (f == NULL)
> - return 0;
> + futex_addrs(p, &okey, uaddr, flags);
> + ofsq = futex_get_slpque(&okey);
> + futex_addrs(p, &nkey, uaddr2, flags);
> + nfsq = futex_get_slpque(&nkey);
> +
> + if (ofsq->fsq_id < nfsq->fsq_id) {
> + rw_enter_write(&ofsq->fsq_lock);
> + rw_enter_write(&nfsq->fsq_lock);
> + } else if (ofsq->fsq_id > nfsq->fsq_id) {
> + rw_enter_write(&nfsq->fsq_lock);
> + rw_enter_write(&ofsq->fsq_lock);
> + } else
> + rw_enter_write(&ofsq->fsq_lock);
> +
> + TAILQ_FOREACH_SAFE(f, &ofsq->fsq_list, ft_entry, nf) {
> + /* __builtin_prefetch(nf, 1); */
> + KASSERT(f->ft_proc != NULL);
>
> - while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
> - p->p_futex = NULL;
> - TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
> - futex_put(f);
> -
> - if (count < n) {
> - wakeup_one(p);
> - } else if (uaddr2 != NULL) {
> - g = futex_get(uaddr2, FT_CREATE);
> - TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
> - p->p_futex = g;
> + if (!futex_is_eq(f, &okey))
> + continue;
> +
> + TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry);
> + TAILQ_INSERT_TAIL(&fl, f, ft_entry);
> +
> + if (++count == n) {
> + mf = nf;
> + break;
> }
> - count++;
> }
>
> - futex_put(f);
> + if (!TAILQ_EMPTY(&fl))
> + futex_list_wakeup(&fl);
>
> - return count;
> + /* update matching futexes */
> + if (mf != NULL) {
> + /*
> + * only iterate from the current entry to the tail
> + * of the list as it is now in case we're requeueing
> + * on the end of the same list.
> + */
> + nf = TAILQ_LAST(&ofsq->fsq_list, futex_list);
> + do {
> + f = mf;
> + mf = TAILQ_NEXT(f, ft_entry);
> + /* __builtin_prefetch(mf, 1); */
> +
> + KASSERT(f->ft_proc != NULL);
> +
> + if (!futex_is_eq(f, &okey))
> + continue;
> +
> + TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry);
> + f->ft_ps = nkey.ft_ps;
> + f->ft_obj = nkey.ft_obj;
> + f->ft_amap = nkey.ft_amap;
> + f->ft_off = nkey.ft_off;
> + TAILQ_INSERT_TAIL(&nfsq->fsq_list, f, ft_entry);
> +
> + if (--m == 0)
> + break;
> + } while (f != nf);
> + }
> +
> + if (ofsq->fsq_id != nfsq->fsq_id)
> + rw_exit_write(&nfsq->fsq_lock);
> + rw_exit_write(&ofsq->fsq_lock);
> +
> + *retval = count;
> + return 0;
> }
>
> /*
> * Wakeup at most ``n'' sibling threads sleeping on a futex at address
> * ``uaddr''.
> */
> -int
> -futex_wake(uint32_t *uaddr, uint32_t n, int flags)
> +static int
> +futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags,
> + register_t *retval)
> {
> - return futex_requeue(uaddr, n, NULL, 0, flags);
> + struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl);
> + struct futex key;
> + struct futex *f, *nf;
> + struct futex_slpque *fsq;
> + int count = 0;
> +
> + if (n == 0) {
> + *retval = 0;
> + return 0;
> + }
> +
> + futex_addrs(p, &key, uaddr, flags);
> + fsq = futex_get_slpque(&key);
> +
> + rw_enter_write(&fsq->fsq_lock);
> +
> + TAILQ_FOREACH_SAFE(f, &fsq->fsq_list, ft_entry, nf) {
> + /* __builtin_prefetch(nf, 1); */
> + KASSERT(f->ft_proc != NULL);
> +
> + if (!futex_is_eq(f, &key))
> + continue;
> +
> + TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry);
> + TAILQ_INSERT_TAIL(&fl, f, ft_entry);
> +
> + if (++count == n)
> + break;
> + }
> +
> + if (!TAILQ_EMPTY(&fl))
> + futex_list_wakeup(&fl);
> +
> + rw_exit_write(&fsq->fsq_lock);
> +
> + *retval = count;
> + return 0;
> }
>
avoid lock contention in futex syscalls