Download raw body.
avoid lock contention in futex syscalls
On Fri, May 02, 2025 at 10:10:42AM +1000, David Gwynne wrote:
>
> if i get time at work i'll try and update the code today.
i think this addresses most of the stuff mpi@ asked about.
renaming struct futexen to futex_list collided with the futex bits in
proc.h, so i removed them. they're unecessary now anyway.
Index: sys/futex.h
===================================================================
RCS file: /cvs/src/sys/sys/futex.h,v
diff -u -p -r1.2 futex.h
--- sys/futex.h 3 Jun 2018 15:09:26 -0000 1.2
+++ sys/futex.h 2 May 2025 02:19:46 -0000
@@ -28,11 +28,15 @@ int futex(volatile uint32_t *, int, int,
__END_DECLS
#endif /* ! _KERNEL */
+#define FUTEX_OP_MASK 0x007f
+
#define FUTEX_WAIT 1
#define FUTEX_WAKE 2
#define FUTEX_REQUEUE 3
-#define FUTEX_PRIVATE_FLAG 128
+#define FUTEX_FLAG_MASK 0xff80
+
+#define FUTEX_PRIVATE_FLAG 0x0080
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
Index: sys/proc.h
===================================================================
RCS file: /cvs/src/sys/sys/proc.h,v
diff -u -p -r1.386 proc.h
--- sys/proc.h 1 May 2025 01:16:42 -0000 1.386
+++ sys/proc.h 2 May 2025 02:19:47 -0000
@@ -116,8 +125,6 @@ struct tusage {
* run-time information needed by threads.
*/
#ifdef __need_process
-struct futex;
-LIST_HEAD(futex_list, futex);
struct proc;
struct tslpentry;
TAILQ_HEAD(tslpqueue, tslpentry);
@@ -178,7 +185,6 @@ struct process {
struct vmspace *ps_vmspace; /* Address space */
pid_t ps_pid; /* [I] Process identifier. */
- struct futex_list ps_ftlist; /* futexes attached to this process */
struct tslpqueue ps_tslpqueue; /* [p] queue of threads in thrsleep */
struct rwlock ps_lock; /* per-process rwlock */
struct mutex ps_mtx; /* per-process mutex */
@@ -343,9 +349,6 @@ struct proc {
struct process *p_p; /* [I] The process of this thread. */
TAILQ_ENTRY(proc) p_thr_link; /* [K|m] Threads in a process linkage. */
-
- TAILQ_ENTRY(proc) p_fut_link; /* Threads in a futex linkage. */
- struct futex *p_futex; /* Current sleeping futex. */
/* substructures: */
struct filedesc *p_fd; /* copy of p_p->ps_fd */
Index: kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
diff -u -p -r1.270 kern_fork.c
--- kern/kern_fork.c 14 Apr 2025 09:15:24 -0000 1.270
+++ kern/kern_fork.c 2 May 2025 02:19:47 -0000
@@ -196,7 +196,6 @@ process_initialize(struct process *pr, s
LIST_INIT(&pr->ps_children);
LIST_INIT(&pr->ps_orphans);
- LIST_INIT(&pr->ps_ftlist);
LIST_INIT(&pr->ps_sigiolst);
TAILQ_INIT(&pr->ps_tslpqueue);
Index: kern/sys_futex.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_futex.c,v
diff -u -p -r1.22 sys_futex.c
--- kern/sys_futex.c 14 Aug 2023 07:42:34 -0000 1.22
+++ kern/sys_futex.c 2 May 2025 02:19:47 -0000
@@ -24,6 +24,8 @@
#include <sys/pool.h>
#include <sys/time.h>
#include <sys/rwlock.h>
+#include <sys/percpu.h> /* CACHELINESIZE */
+#include <sys/kernel.h> /* tick_nsec */
#include <sys/futex.h>
#ifdef KTRACE
@@ -33,44 +35,98 @@
#include <uvm/uvm.h>
/*
+ * Locks used to protect variables in this file:
+ *
+ * I immutable after initialization
+ * F futex_slpque fsq_lock
+ */
+
+/*
* Kernel representation of a futex.
+ *
+ * The userland address that the futex is waiting on is represented by
+ * ft_ps, ft_obj, ft_amap, and ft_off.
+ *
+ * Whether the futex is waiting or woken up is represented by the
+ * ft_proc pointer being set (ie, not NULL) or not (ie, NULL) respectively.
+ * When the futex is waiting it is referenced by the list in a
+ * futex_slpque. When the futex gets woken up, it is removed from the
+ * list and the ft_proc pointer is cleared to indicate that the reference
+ * held by the list has been released. Only a thread holding the lock
+ * may remove the futex from the list and clear ft_proc. This is true
+ * even for futex_wait().
+ *
+ * However, futex_wait() may read ft_proc without the lock so it can
+ * avoid contending with the thread that just woke it up. This means
+ * that once ft_proc is cleared, futex_wait() may return, the struct
+ * futex will no longer exist, and it is no longer safe to access it
+ * from the wakeup side.
+ *
+ * tl;dr: the thread holding the slpque lock "owns" the references
+ * to the futexes on the list until it clears ft_proc.
*/
+
struct futex {
- LIST_ENTRY(futex) ft_list; /* list of all futexes */
- TAILQ_HEAD(, proc) ft_threads; /* sleeping queue */
- struct uvm_object *ft_obj; /* UVM object */
- struct vm_amap *ft_amap; /* UVM amap */
- voff_t ft_off; /* UVM offset */
- unsigned int ft_refcnt; /* # of references */
+ TAILQ_ENTRY(futex) ft_entry; /* [F] entry on futex_slpque */
+
+ struct process *ft_ps; /* [I] for private futexes */
+ struct uvm_object *ft_obj; /* [F] UVM object */
+ struct vm_amap *ft_amap; /* [F] UVM amap */
+ volatile voff_t ft_off; /* [F] UVM offset */
+
+ struct proc * volatile ft_proc; /* [F] waiting thread */
};
-/* Syscall helpers. */
-int futex_wait(uint32_t *, uint32_t, const struct timespec *, int);
-int futex_wake(uint32_t *, uint32_t, int);
-int futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int);
-
-/* Flags for futex_get(). */
-#define FT_CREATE 0x1 /* Create a futex if it doesn't exist. */
-#define FT_PRIVATE 0x2 /* Futex is process-private. */
+static int
+futex_is_eq(const struct futex *a, const struct futex *b)
+{
+ return (a->ft_off == b->ft_off &&
+ a->ft_ps == b->ft_ps &&
+ a->ft_obj == b->ft_obj &&
+ a->ft_amap == b->ft_amap);
+}
-struct futex *futex_get(uint32_t *, int);
-void futex_put(struct futex *);
+TAILQ_HEAD(futex_list, futex);
-/*
- * The global futex lock serializes futex(2) calls so that no wakeup
- * event is lost, and protects all futex lists and futex states.
- */
-struct rwlock ftlock = RWLOCK_INITIALIZER("futex");
-static struct futex_list ftlist_shared =
- LIST_HEAD_INITIALIZER(ftlist_shared);
-struct pool ftpool;
+struct futex_slpque {
+ struct futex_list fsq_list; /* [F] */
+ struct rwlock fsq_lock;
+ uint32_t fsq_id; /* [I] for lock ordering */
+} __aligned(CACHELINESIZE);
+
+/* Syscall helpers. */
+static int futex_wait(struct proc *, uint32_t *, uint32_t,
+ const struct timespec *, int);
+static int futex_wake(struct proc *, uint32_t *, uint32_t, int,
+ register_t *);
+static int futex_requeue(struct proc *, uint32_t *, uint32_t,
+ uint32_t *, uint32_t, int, register_t *);
+
+/* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */
+#define FT_PRIVATE FUTEX_PRIVATE_FLAG /* Futex is process-private. */
+
+#define FUTEX_SLPQUES_BITS 6
+#define FUTEX_SLPQUES_SIZE (1U << FUTEX_SLPQUES_BITS)
+#define FUTEX_SLPQUES_MASK (FUTEX_SLPQUES_SIZE - 1)
+static struct futex_slpque futex_slpques[FUTEX_SLPQUES_SIZE];
void
futex_init(void)
{
- pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE,
- PR_WAITOK | PR_RWLOCK, "futexpl", NULL);
+ struct futex_slpque *fsq;
+ unsigned int i;
+
+ for (i = 0; i < nitems(futex_slpques); i++) {
+ fsq = &futex_slpques[i];
+
+ TAILQ_INIT(&fsq->fsq_list);
+ rw_init(&fsq->fsq_lock, "futexlk");
+
+ fsq->fsq_id = arc4random();
+ fsq->fsq_id &= ~FUTEX_SLPQUES_MASK;
+ fsq->fsq_id |= i;
+ }
}
int
@@ -88,65 +144,51 @@ sys_futex(struct proc *p, void *v, regis
uint32_t val = SCARG(uap, val);
const struct timespec *timeout = SCARG(uap, timeout);
void *g = SCARG(uap, g);
- int flags = 0;
+ int flags = op & FUTEX_FLAG_MASK;
int error = 0;
- if (op & FUTEX_PRIVATE_FLAG)
- flags |= FT_PRIVATE;
-
- rw_enter_write(&ftlock);
- switch (op) {
+ switch (op & FUTEX_OP_MASK) {
case FUTEX_WAIT:
- case FUTEX_WAIT_PRIVATE:
- error = futex_wait(uaddr, val, timeout, flags);
+ error = futex_wait(p, uaddr, val, timeout, flags);
break;
case FUTEX_WAKE:
- case FUTEX_WAKE_PRIVATE:
- *retval = futex_wake(uaddr, val, flags);
+ error = futex_wake(p, uaddr, val, flags, retval);
break;
case FUTEX_REQUEUE:
- case FUTEX_REQUEUE_PRIVATE:
- *retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags);
+ error = futex_requeue(p, uaddr, val, g,
+ (u_long)timeout, flags, retval);
break;
default:
error = ENOSYS;
break;
}
- rw_exit_write(&ftlock);
return error;
}
-/*
- * Return an existing futex matching userspace address ``uaddr''.
- *
- * If such futex does not exist and FT_CREATE is given, create it.
- */
-struct futex *
-futex_get(uint32_t *uaddr, int flags)
+static void
+futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags)
{
- struct proc *p = curproc;
vm_map_t map = &p->p_vmspace->vm_map;
vm_map_entry_t entry;
struct uvm_object *obj = NULL;
struct vm_amap *amap = NULL;
voff_t off = (vaddr_t)uaddr;
- struct futex *f;
- struct futex_list *ftlist = &p->p_p->ps_ftlist;
+ struct process *ps;
- rw_assert_wrlock(&ftlock);
+ if (ISSET(flags, FT_PRIVATE))
+ ps = p->p_p;
+ else {
+ ps = NULL;
- if (!(flags & FT_PRIVATE)) {
vm_map_lock_read(map);
if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
entry->inheritance == MAP_INHERIT_SHARE) {
if (UVM_ET_ISOBJ(entry)) {
- ftlist = &ftlist_shared;
obj = entry->object.uvm_obj;
off = entry->offset +
((vaddr_t)uaddr - entry->start);
} else if (entry->aref.ar_amap) {
- ftlist = &ftlist_shared;
amap = entry->aref.ar_amap;
off = ptoa(entry->aref.ar_pageoff) +
((vaddr_t)uaddr - entry->start);
@@ -155,47 +197,47 @@ futex_get(uint32_t *uaddr, int flags)
vm_map_unlock_read(map);
}
- LIST_FOREACH(f, ftlist, ft_list) {
- if (f->ft_obj == obj && f->ft_amap == amap &&
- f->ft_off == off) {
- f->ft_refcnt++;
- break;
- }
- }
+ f->ft_ps = ps;
+ f->ft_obj = obj;
+ f->ft_amap = amap;
+ f->ft_off = off;
+}
- if ((f == NULL) && (flags & FT_CREATE)) {
- /*
- * We rely on the rwlock to ensure that no other thread
- * create the same futex.
- */
- f = pool_get(&ftpool, PR_WAITOK);
- TAILQ_INIT(&f->ft_threads);
- f->ft_obj = obj;
- f->ft_amap = amap;
- f->ft_off = off;
- f->ft_refcnt = 1;
- LIST_INSERT_HEAD(ftlist, f, ft_list);
- }
+static inline struct futex_slpque *
+futex_get_slpque(struct futex *f)
+{
+ uint32_t key = f->ft_off >> 3; /* watevs */
+ key ^= key >> FUTEX_SLPQUES_BITS;
- return f;
+ return (&futex_slpques[key & FUTEX_SLPQUES_MASK]);
}
-/*
- * Release a given futex.
- */
-void
-futex_put(struct futex *f)
+static int
+futex_unwait(struct futex_slpque *ofsq, struct futex *f)
{
- rw_assert_wrlock(&ftlock);
+ struct futex_slpque *fsq;
+ int rv;
- KASSERT(f->ft_refcnt > 0);
+ /*
+ * REQUEUE can move a futex between buckets, so follow it if needed.
+ */
- --f->ft_refcnt;
- if (f->ft_refcnt == 0) {
- KASSERT(TAILQ_EMPTY(&f->ft_threads));
- LIST_REMOVE(f, ft_list);
- pool_put(&ftpool, f);
+ for (;;) {
+ rw_enter_write(&ofsq->fsq_lock);
+ fsq = futex_get_slpque(f);
+ if (ofsq == fsq)
+ break;
+
+ rw_exit_write(&ofsq->fsq_lock);
+ ofsq = fsq;
}
+
+ rv = f->ft_proc != NULL;
+ if (rv)
+ TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry);
+ rw_exit_write(&fsq->fsq_lock);
+
+ return (rv);
}
/*
@@ -203,34 +245,19 @@ futex_put(struct futex *f)
* ``uaddr''. Let it sleep for the specified ``timeout'' time, or
* indefinitely if the argument is NULL.
*/
-int
-futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout,
- int flags)
+static int
+futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val,
+ const struct timespec *timeout, int flags)
{
- struct proc *p = curproc;
- struct futex *f;
- uint64_t nsecs = INFSLP;
+ struct futex f;
+ struct futex_slpque *fsq;
+ uint64_t to_ticks = 0;
uint32_t cval;
int error;
- /*
- * After reading the value a race is still possible but
- * we deal with it by serializing all futex syscalls.
- */
- rw_assert_wrlock(&ftlock);
-
- /*
- * Read user space futex value
- */
- if ((error = copyin32(uaddr, &cval)))
- return error;
-
- /* If the value changed, stop here. */
- if (cval != val)
- return EAGAIN;
-
if (timeout != NULL) {
struct timespec ts;
+ uint64_t nsecs;
if ((error = copyin(timeout, &ts, sizeof(ts))))
return error;
@@ -240,32 +267,85 @@ futex_wait(uint32_t *uaddr, uint32_t val
#endif
if (ts.tv_sec < 0 || !timespecisvalid(&ts))
return EINVAL;
+
nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
+ to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
+ if (to_ticks > INT_MAX)
+ to_ticks = INT_MAX;
}
- f = futex_get(uaddr, flags | FT_CREATE);
- TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
- p->p_futex = f;
-
- error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs);
- if (error == ERESTART)
- error = ECANCELED;
- else if (error == EWOULDBLOCK) {
- /* A race occurred between a wakeup and a timeout. */
- if (p->p_futex == NULL)
- error = 0;
- else
- error = ETIMEDOUT;
+ futex_addrs(p, &f, uaddr, flags);
+ fsq = futex_get_slpque(&f);
+
+ /* Mark futex as waiting. */
+ f.ft_proc = p;
+ rw_enter_write(&fsq->fsq_lock);
+ /* Make the waiting futex visible to wake/requeue */
+ TAILQ_INSERT_TAIL(&fsq->fsq_list, &f, ft_entry);
+ rw_exit_write(&fsq->fsq_lock);
+
+ /*
+ * Do not return before f has been removed from the slpque!
+ */
+
+ /*
+ * Read user space futex value
+ */
+ if ((error = copyin32(uaddr, &cval)) != 0)
+ goto exit;
+
+ /* If the value changed, stop here. */
+ if (cval != val) {
+ error = EAGAIN;
+ goto exit;
}
+ sleep_setup(&f, PWAIT|PCATCH, "fsleep");
+ error = sleep_finish(to_ticks, f.ft_proc != NULL);
/* Remove ourself if we haven't been awaken. */
- if ((f = p->p_futex) != NULL) {
- p->p_futex = NULL;
- TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
- futex_put(f);
+ if (error != 0 || f.ft_proc != NULL) {
+ if (futex_unwait(fsq, &f) == 0)
+ error = 0;
+
+ switch (error) {
+ case ERESTART:
+ error = ECANCELED;
+ break;
+ case EWOULDBLOCK:
+ error = ETIMEDOUT;
+ break;
+ default:
+ break;
+ }
}
return error;
+exit:
+ if (f.ft_proc != NULL)
+ futex_unwait(fsq, &f);
+ return error;
+}
+
+static void
+futex_list_wakeup(struct futex_list *fl)
+{
+ struct futex *f, *nf;
+ struct proc *p;
+
+ /*
+ * Setting ft_proc to NULL releases the futex reference
+ * currently held via the slpque lock.
+ *
+ * SCHED_LOCK is only needed to call wakeup_proc.
+ */
+
+ SCHED_LOCK();
+ TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) {
+ p = f->ft_proc;
+ f->ft_proc = NULL;
+ wakeup_proc(p);
+ }
+ SCHED_UNLOCK();
}
/*
@@ -273,46 +353,133 @@ futex_wait(uint32_t *uaddr, uint32_t val
* ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
* address ``uaddr2''.
*/
-int
-futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m,
- int flags)
+static int
+futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n,
+ uint32_t *uaddr2, uint32_t m, int flags, register_t *retval)
{
- struct futex *f, *g;
- struct proc *p;
+ struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl);
+ struct futex okey, nkey;
+ struct futex *f, *nf, *mf = NULL;
+ struct futex_slpque *ofsq, *nfsq;
uint32_t count = 0;
- rw_assert_wrlock(&ftlock);
+ if (m == 0)
+ return futex_wake(p, uaddr, n, flags, retval);
- f = futex_get(uaddr, flags);
- if (f == NULL)
- return 0;
+ futex_addrs(p, &okey, uaddr, flags);
+ ofsq = futex_get_slpque(&okey);
+ futex_addrs(p, &nkey, uaddr2, flags);
+ nfsq = futex_get_slpque(&nkey);
+
+ if (ofsq->fsq_id < nfsq->fsq_id) {
+ rw_enter_write(&ofsq->fsq_lock);
+ rw_enter_write(&nfsq->fsq_lock);
+ } else if (ofsq->fsq_id > nfsq->fsq_id) {
+ rw_enter_write(&nfsq->fsq_lock);
+ rw_enter_write(&ofsq->fsq_lock);
+ } else
+ rw_enter_write(&ofsq->fsq_lock);
+
+ TAILQ_FOREACH_SAFE(f, &ofsq->fsq_list, ft_entry, nf) {
+ /* __builtin_prefetch(nf, 1); */
+ KASSERT(f->ft_proc != NULL);
- while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
- p->p_futex = NULL;
- TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
- futex_put(f);
-
- if (count < n) {
- wakeup_one(p);
- } else if (uaddr2 != NULL) {
- g = futex_get(uaddr2, FT_CREATE);
- TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
- p->p_futex = g;
+ if (!futex_is_eq(f, &okey))
+ continue;
+
+ TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry);
+ TAILQ_INSERT_TAIL(&fl, f, ft_entry);
+
+ if (++count == n) {
+ mf = nf;
+ break;
}
- count++;
}
- futex_put(f);
+ if (!TAILQ_EMPTY(&fl))
+ futex_list_wakeup(&fl);
- return count;
+ /* update matching futexes */
+ if (mf != NULL) {
+ /*
+ * only iterate from the current entry to the tail
+ * of the list as it is now in case we're requeueing
+ * on the end of the same list.
+ */
+ nf = TAILQ_LAST(&ofsq->fsq_list, futex_list);
+ do {
+ f = mf;
+ mf = TAILQ_NEXT(f, ft_entry);
+ /* __builtin_prefetch(mf, 1); */
+
+ KASSERT(f->ft_proc != NULL);
+
+ if (!futex_is_eq(f, &okey))
+ continue;
+
+ TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry);
+ f->ft_ps = nkey.ft_ps;
+ f->ft_obj = nkey.ft_obj;
+ f->ft_amap = nkey.ft_amap;
+ f->ft_off = nkey.ft_off;
+ TAILQ_INSERT_TAIL(&nfsq->fsq_list, f, ft_entry);
+
+ if (--m == 0)
+ break;
+ } while (f != nf);
+ }
+
+ if (ofsq->fsq_id != nfsq->fsq_id)
+ rw_exit_write(&nfsq->fsq_lock);
+ rw_exit_write(&ofsq->fsq_lock);
+
+ *retval = count;
+ return 0;
}
/*
* Wakeup at most ``n'' sibling threads sleeping on a futex at address
* ``uaddr''.
*/
-int
-futex_wake(uint32_t *uaddr, uint32_t n, int flags)
+static int
+futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags,
+ register_t *retval)
{
- return futex_requeue(uaddr, n, NULL, 0, flags);
+ struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl);
+ struct futex key;
+ struct futex *f, *nf;
+ struct futex_slpque *fsq;
+ int count = 0;
+
+ if (n == 0) {
+ *retval = 0;
+ return 0;
+ }
+
+ futex_addrs(p, &key, uaddr, flags);
+ fsq = futex_get_slpque(&key);
+
+ rw_enter_write(&fsq->fsq_lock);
+
+ TAILQ_FOREACH_SAFE(f, &fsq->fsq_list, ft_entry, nf) {
+ /* __builtin_prefetch(nf, 1); */
+ KASSERT(f->ft_proc != NULL);
+
+ if (!futex_is_eq(f, &key))
+ continue;
+
+ TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry);
+ TAILQ_INSERT_TAIL(&fl, f, ft_entry);
+
+ if (++count == n)
+ break;
+ }
+
+ if (!TAILQ_EMPTY(&fl))
+ futex_list_wakeup(&fl);
+
+ rw_exit_write(&fsq->fsq_lock);
+
+ *retval = count;
+ return 0;
}
avoid lock contention in futex syscalls