From: Alexander Bluhm Subject: dt(4) btrace rwlock To: tech@openbsd.org Date: Fri, 18 Jul 2025 17:15:42 +0200 Hi, I would like to introduce dt(4) trace points into rwlock like we have them in refcnt. A first example would be to find exclusive netlock calls that are used in the kernel. /* * netlock: stack traces where exclusive net lock is taken * - probe: netlock, solock * - arg0: pointer to struct rwlock * - arg1: lock type * - 1: read lock * - 2: write lock * - arg2: success status * - 1: locked immediately * - 2: locked after spinning * - 3: locked after sleeping * - 4: locking failed */ tracepoint:rwlock:netlock { if (arg1 == 2) { printf("%s %x %u %d%s", probe, arg0, arg1, arg2, kstack); } } The printed output looks like this: tracepoint:rwlock:netlock ffffffff82a34540 2 1 rw_do_enter_write+0x2d8 igmp_slowtimo+0x14 pfslowtimo+0x107 softclock_thread_run+0xc8 0xffffffff822b9495 proc_trampoline+0x10 kernel tracepoint:rwlock:netlock ffffffff82a34540 2 1 rw_do_enter_write+0x2d8 icmp_sysctl+0x8a net_sysctl+0x27b sys_sysctl+0x1c5 syscall+0x5f9 Xsyscall+0x128 kernel tracepoint:rwlock:netlock ffffffff82a34540 2 1 rw_do_enter_write+0x2d8 ip6_sysctl+0x39a net_sysctl+0x27b sys_sysctl+0x1c5 syscall+0x5f9 Xsyscall+0x128 kernel The second example finds contention of the socket lock. /* * solock: stack sampling of socket lock contension * - probe: netlock, solock * - arg0: pointer to struct rwlock * - arg1: lock type * - 1: read lock * - 2: write lock * - arg2: success status * - 1: locked immediately * - 2: locked after spinning * - 3: locked after sleeping * - 4: locking failed * * To produce a FlameGraph process the output with stackcollapse-bpftrace.pl * and flamegraph.pl found in: * https://github.com/brendangregg/FlameGraph */ tracepoint:rwlock:solock { if (arg2 >= 2) { @[kstack] = count(); } } END { print(@); } This script allows to create a flame graph: http://bluhm.genua.de/files/ot48-solock.svg Diff below adds trace points for netlock and solock, others can follow later. The initializer is called with _trace suffix. ok? bluhm Index: dev/dt/dt_prov_static.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/dev/dt/dt_prov_static.c,v diff -u -p -r1.25 dt_prov_static.c --- dev/dt/dt_prov_static.c 10 Mar 2025 09:28:56 -0000 1.25 +++ dev/dt/dt_prov_static.c 17 Jul 2025 13:32:27 -0000 @@ -107,6 +107,13 @@ DT_STATIC_PROBE3(refcnt, syncache, "void DT_STATIC_PROBE3(refcnt, tdb, "void *", "int", "int"); /* + * read write sleeping locks, keep in sync with sys/rwlock.h + */ +DT_STATIC_PROBE0(rwlock, none); +DT_STATIC_PROBE3(rwlock, netlock, "void *", "int", "int"); +DT_STATIC_PROBE3(rwlock, solock, "void *", "int", "int"); + +/* * List of all static probes */ struct dt_probe *const dtps_static[] = { @@ -161,9 +168,14 @@ struct dt_probe *const dtps_static[] = { &_DT_STATIC_P(refcnt, socket), &_DT_STATIC_P(refcnt, syncache), &_DT_STATIC_P(refcnt, tdb), + /* rwlock */ + &_DT_STATIC_P(rwlock, none), + &_DT_STATIC_P(rwlock, netlock), + &_DT_STATIC_P(rwlock, solock), }; struct dt_probe *const *dtps_index_refcnt; +struct dt_probe *const *dtps_index_rwlock; int dt_prov_static_init(void) @@ -173,6 +185,8 @@ dt_prov_static_init(void) for (i = 0; i < nitems(dtps_static); i++) { if (dtps_static[i] == &_DT_STATIC_P(refcnt, none)) dtps_index_refcnt = &dtps_static[i]; + if (dtps_static[i] == &_DT_STATIC_P(rwlock, none)) + dtps_index_rwlock = &dtps_static[i]; dt_dev_register_probe(dtps_static[i]); } Index: kern/kern_rwlock.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_rwlock.c,v diff -u -p -r1.57 kern_rwlock.c --- kern/kern_rwlock.c 3 Jun 2025 00:20:31 -0000 1.57 +++ kern/kern_rwlock.c 17 Jul 2025 13:32:27 -0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include #ifdef RWDIAG @@ -162,12 +163,13 @@ rw_exit_write(struct rwlock *rwl) static void _rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags, - const struct lock_type *type) + const struct lock_type *type, int trace) { rwl->rwl_owner = 0; rwl->rwl_waiters = 0; rwl->rwl_readers = 0; rwl->rwl_name = name; + rwl->rwl_traceidx = trace; #ifdef WITNESS rwl->rwl_lock_obj.lo_flags = lo_flags; @@ -182,9 +184,9 @@ _rw_init_flags_witness(struct rwlock *rw void _rw_init_flags(struct rwlock *rwl, const char *name, int flags, - const struct lock_type *type) + const struct lock_type *type, int trace) { - _rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type); + _rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type, trace); } int @@ -235,6 +237,7 @@ rw_do_enter_write(struct rwlock *rwl, in owner = rw_cas(&rwl->rwl_owner, 0, self); if (owner == 0) { /* wow, we won. so easy */ + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 2, 1); goto locked; } if (__predict_false(owner == self)) { @@ -268,6 +271,8 @@ rw_do_enter_write(struct rwlock *rwl, in if (owner == 0) { spc->spc_spinning--; /* ok, we won now. */ + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 2, + 2); goto locked; } } @@ -275,8 +280,10 @@ rw_do_enter_write(struct rwlock *rwl, in } #endif - if (ISSET(flags, RW_NOSLEEP)) + if (ISSET(flags, RW_NOSLEEP)) { + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 2, 4); return (EBUSY); + } prio = PLOCK - 4; if (ISSET(flags, RW_INTR)) @@ -299,12 +306,14 @@ rw_do_enter_write(struct rwlock *rwl, in #endif if (ISSET(flags, RW_INTR) && (error != 0)) { rw_dec(&rwl->rwl_waiters); + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 2, 4); return (error); } owner = rw_cas(&rwl->rwl_owner, 0, self); } while (owner != 0); rw_dec(&rwl->rwl_waiters); + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 2, 3); locked: membar_enter_after_atomic(); @@ -349,6 +358,7 @@ rw_do_enter_read(struct rwlock *rwl, int owner = rw_cas(&rwl->rwl_owner, 0, RWLOCK_READ_INCR); if (owner == 0) { /* ermagerd, we won! */ + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 1, 1); goto locked; } @@ -360,12 +370,15 @@ rw_do_enter_read(struct rwlock *rwl, int } else if (atomic_load_int(&rwl->rwl_waiters) == 0) { if (rw_read_incr(rwl, owner)) { /* nailed it */ + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 1, 2); goto locked; } } - if (ISSET(flags, RW_NOSLEEP)) + if (ISSET(flags, RW_NOSLEEP)) { + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 1, 4); return (EBUSY); + } prio = PLOCK; if (ISSET(flags, RW_INTR)) @@ -388,10 +401,12 @@ rw_do_enter_read(struct rwlock *rwl, int #endif if (ISSET(flags, RW_INTR) && (error != 0)) { rw_dec(&rwl->rwl_readers); + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 1, 4); return (error); } } while (!rw_read_incr(rwl, 0)); rw_dec(&rwl->rwl_readers); + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 1, 3); locked: membar_enter_after_atomic(); @@ -451,8 +466,10 @@ rw_upgrade(struct rwlock *rwl, int flags owner, self); } + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 3, 4); return (EBUSY); } + TRACEINDEX(rwlock, rwl->rwl_traceidx, rwl, 3, 1); #ifdef WITNESS { @@ -583,7 +600,7 @@ _rrw_init_flags(struct rrwlock *rrwl, co { memset(rrwl, 0, sizeof(struct rrwlock)); _rw_init_flags_witness(&rrwl->rrwl_lock, name, RRWLOCK_LO_FLAGS(flags), - type); + type, 0); } int @@ -696,7 +713,7 @@ _rw_obj_alloc_flags(struct rwlock **lock mo = pool_get(&rwlock_obj_pool, PR_WAITOK); mo->ro_magic = RWLOCK_OBJ_MAGIC; - _rw_init_flags(&mo->ro_lock, name, flags, type); + _rw_init_flags(&mo->ro_lock, name, flags, type, 0); mo->ro_refcnt = 1; *lock = &mo->ro_lock; Index: kern/subr_pool.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/kern/subr_pool.c,v diff -u -p -r1.240 subr_pool.c --- kern/subr_pool.c 12 Jun 2025 20:37:58 -0000 1.240 +++ kern/subr_pool.c 17 Jul 2025 13:32:27 -0000 @@ -2244,7 +2244,7 @@ void pool_lock_rw_init(struct pool *pp, union pool_lock *lock, const struct lock_type *type) { - _rw_init_flags(&lock->prl_rwlock, pp->pr_wchan, 0, type); + _rw_init_flags(&lock->prl_rwlock, pp->pr_wchan, 0, type, 0); } void Index: kern/uipc_socket.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v diff -u -p -r1.381 uipc_socket.c --- kern/uipc_socket.c 15 Jul 2025 12:52:52 -0000 1.381 +++ kern/uipc_socket.c 17 Jul 2025 13:32:27 -0000 @@ -158,7 +158,8 @@ soalloc(const struct protosw *prp, int w #endif refcnt_init_trace(&so->so_refcnt, DT_REFCNT_IDX_SOCKET); - rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK); + rw_init_flags_trace(&so->so_lock, dom_name, RWL_DUPOK, + DT_RWLOCK_IDX_SOLOCK); rw_init(&so->so_rcv.sb_lock, "sbufrcv"); rw_init(&so->so_snd.sb_lock, "sbufsnd"); mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0); Index: net/if.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v diff -u -p -r1.737 if.c --- net/if.c 7 Jul 2025 02:28:50 -0000 1.737 +++ net/if.c 17 Jul 2025 13:32:27 -0000 @@ -248,7 +248,8 @@ struct task if_input_task_locked = TASK_ * Serialize socket operations to ensure no new sleeping points * are introduced in IP output paths. */ -struct rwlock netlock = RWLOCK_INITIALIZER("netlock"); +struct rwlock netlock = RWLOCK_INITIALIZER_TRACE("netlock", + DT_RWLOCK_IDX_NETLOCK); /* * Network interface utility routines. Index: sys/rwlock.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/sys/rwlock.h,v diff -u -p -r1.33 rwlock.h --- sys/rwlock.h 18 May 2025 00:13:57 -0000 1.33 +++ sys/rwlock.h 17 Jul 2025 13:32:27 -0000 @@ -66,6 +66,7 @@ struct rwlock { #ifdef WITNESS struct lock_object rwl_lock_obj; #endif + int rwl_traceidx; }; #define RWLOCK_LO_FLAGS(flags) \ @@ -93,10 +94,14 @@ struct rwlock { #ifdef WITNESS #define RWLOCK_INITIALIZER(name) \ - { 0, 0, 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) } + { 0, 0, 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0), 0 } +#define RWLOCK_INITIALIZER_TRACE(name, trace) \ + { 0, 0, 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0), trace } #else #define RWLOCK_INITIALIZER(name) \ - { 0, 0, 0, name } + { 0, 0, 0, name, 0 } +#define RWLOCK_INITIALIZER_TRACE(name, trace) \ + { 0, 0, 0, name, trace } #endif #define RWLOCK_WRLOCK 0x04UL @@ -133,18 +138,24 @@ struct rrwlock { #ifdef _KERNEL void _rw_init_flags(struct rwlock *, const char *, int, - const struct lock_type *); + const struct lock_type *, int); #ifdef WITNESS +#define rw_init_flags_trace(rwl, name, flags, trace) do { \ + static const struct lock_type __lock_type = { .lt_name = #rwl };\ + _rw_init_flags(rwl, name, flags, &__lock_type, trace); \ +} while (0) #define rw_init_flags(rwl, name, flags) do { \ static const struct lock_type __lock_type = { .lt_name = #rwl };\ - _rw_init_flags(rwl, name, flags, &__lock_type); \ + _rw_init_flags(rwl, name, flags, &__lock_type, 0); \ } while (0) -#define rw_init(rwl, name) rw_init_flags(rwl, name, 0) +#define rw_init(rwl, name) rw_init_flags(rwl, name, 0) #else /* WITNESS */ +#define rw_init_flags_trace(rwl, name, flags, trace) \ + _rw_init_flags(rwl, name, flags, NULL, trace) #define rw_init_flags(rwl, name, flags) \ - _rw_init_flags(rwl, name, flags, NULL) -#define rw_init(rwl, name) _rw_init_flags(rwl, name, 0, NULL) + _rw_init_flags(rwl, name, flags, NULL, 0) +#define rw_init(rwl, name) _rw_init_flags(rwl, name, 0, NULL, 0) #endif /* WITNESS */ void rw_enter_read(struct rwlock *); @@ -230,6 +241,10 @@ void _rw_obj_alloc_flags(struct rwlock * struct lock_type *); void rw_obj_hold(struct rwlock *); int rw_obj_free(struct rwlock *); + +/* sorted alphabetically, keep in sync with dev/dt/dt_prov_static.c */ +#define DT_RWLOCK_IDX_NETLOCK 1 +#define DT_RWLOCK_IDX_SOLOCK 2 #endif /* _KERNEL */