From: David Gwynne Subject: Re: mbuf cluster m_extref_mtx contention To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 22 Apr 2026 16:21:42 +1000 On Mon, Mar 30, 2026 at 01:56:38PM +0200, Alexander Bluhm wrote: > On Tue, Mar 17, 2026 at 02:24:01PM +0100, Alexander Bluhm wrote: > > On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote: > > > this is the proxy ref thing if you want to give it a go. the code is a > > > lot simpler at least. > > > > I have tested both variants of the diff again and with this one I > > got a panic. It was the sixth repetition of my test runs, this > > time with btrace profiling turned on. > > It is reproduceable and it is triggered by btrace. i havent spent as much time on this as i would have liked because i havent been able to reproduce this, but there's a chance that this was related to the fix committed in src/sys/net/pf_norm.c r1.237 for the pf frag rb tree topology. i also found a bug i introduced in m_split, but i havent figured out if it could cause a crash like this. that bug is fixed in the diff below. however, i did go through and audit the callers of mbuf functions that create shared cluster refs, and they all appear to be doing the right thing and propagate failure. i expected this though, cos the affected functions can fail because of mbuf allocation issues anyway, so the callers would already have to be robust. > *cpu1: uvm_fault(0xffffffff82bb0cb8, 0x2414c5fe0, 0, 1) -> e > > *58677 355342 0 0 7 0x14200 softnet0 > > ddb{1}> trace > x86_ipi_db(ffff80005a5eaff0) at x86_ipi_db+0x16 > x86_ipi_handler() at x86_ipi_handler+0x80 > Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27 > x86_bus_space_io_write_1(3f8,0,20) at x86_bus_space_io_write_1+0x1d > comcnputc(800,20) at comcnputc+0xdc > cnputc(20) at cnputc+0x47 > db_putchar(70) at db_putchar+0x42a > kprintf() at kprintf+0x12ff > db_printf(ffffffff82662513) at db_printf+0x6d > db_ktrap(6,0,ffff8000632447a0) at db_ktrap+0x205 > kerntrap(ffff8000632447a0) at kerntrap+0xe2 > alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b > refcnt_rele(fffffd8f371b6d50) at refcnt_rele+0x78 > m_extfree_refs(fffffd8003fc7540,840,fffffd8f371b6d40) at m_extfree_refs+0x31 > m_free(fffffd808accec00) at m_free+0x170 > m_freem(fffffd8089550e00) at m_freem+0x38 > ip6_forward(fffffd8089d77900,0,2) at ip6_forward+0x9bf > pf_refragment6(ffff800063244c00,fffffd8f080a7010,0,0,0) at pf_refragment6+0x19a > pf_test(18,3,ffff800000393048,ffff800063244e48) at pf_test+0x83f > ip6_forward(fffffd8089550e00,ffffffff82b77608,2) at ip6_forward+0x3ce > ip6_input_if(ffff800063244fd8,ffff800063244fe4,29,0,ffff800000279048,ffffffff82b775d8) at ip6_input_if+0x87d > ipv6_input(ffff800000279048,fffffd808accec00,ffffffff82b775d8) at ipv6_input+0x42 > if_input_process(ffff800000279048,ffff800063245098,0) at if_input_process+0x20a > ifiq_process(ffff800000279480) at ifiq_process+0xa1 > taskq_thread(ffff80000002c000) at taskq_thread+0x129 > end trace frame: 0x0, count: -25 > > ddb{1}> x/i refcnt_rele+0x78 > refcnt_rele+0x78: movq 0(%rsi,%rcx,8),%rsi > > /home/bluhm/openbsd/cvs/src/sys/kern/kern_synch.c:933 > 1cc3: 48 63 4f 04 movslq 0x4(%rdi),%rcx > 1cc7: 48 85 c9 test %rcx,%rcx > 1cca: 7e c7 jle 1c93 > 1ccc: 48 8b 35 00 00 00 00 mov 0(%rip),%rsi # 1cd3 > 1cd3: 48 85 f6 test %rsi,%rsi > 1cd6: 74 bb je 1c93 > 1cd8: 48 8b 34 ce mov (%rsi,%rcx,8),%rsi > * 1cdc: 83 7e 2c 00 cmpl $0x0,0x2c(%rsi) > 1ce0: 74 b1 je 1c93 > 1ce2: 48 8b 7e 10 mov 0x10(%rsi),%rdi > 1ce6: 4c 8b 5f 18 mov 0x18(%rdi),%r11 > 1cea: 89 c1 mov %eax,%ecx > 1cec: 41 b8 ff ff ff ff mov $0xffffffff,%r8d > 1cf2: 41 89 c6 mov %eax,%r14d > 1cf5: 31 c0 xor %eax,%eax > 1cf7: e8 00 00 00 00 callq 1cfc > 1cfc: 48 c7 44 24 f8 00 00 movq $0x0,0xfffffffffffffff8(%rsp) > 1d03: 00 00 > 1d05: 44 89 f0 mov %r14d,%eax > 1d08: eb 89 jmp 1c93 > /home/bluhm/openbsd/cvs/src/sys/kern/kern_synch.c:932 > > 925 int > 926 refcnt_rele(struct refcnt *r) > 927 { > 928 u_int refs; > 929 > 930 membar_exit_before_atomic(); > 931 refs = atomic_dec_int_nv(&r->r_refs); > 932 KASSERT(refs != ~0); > * 933 TRACEINDEX(refcnt, r->r_traceidx, r, refs + 1, -1); > 934 if (refs == 0) { > 935 membar_enter_after_atomic(); > 936 return (1); > 937 } > 938 return (0); > 939 } > > 330 #define DT_INDEX_ENTER(func, index, args...) do { > 331 extern struct dt_probe **_DT_INDEX_P(func); > 332 > 333 if (__predict_false(dt_tracing) && > 334 __predict_false(index > 0) && > 335 __predict_true(_DT_INDEX_P(func) != NULL)) { > * 336 struct dt_probe *dtp = _DT_INDEX_P(func)[index]; > 337 > 338 if(__predict_false(dtp->dtp_recording)) { > 339 struct dt_provider *dtpv = dtp->dtp_prov; > 340 > 341 dtpv->dtpv_enter(dtpv, dtp, args); > 342 } > 343 } > 344 } while (0) > > r->r_traceidx is invalid so it crashed when dt(4) uses it. > > I think this is not a btrace problem, but a use after free. btrace > just makes it visible. When refcnt_rele() is run, refcnt r must > be valid memory. > > I still prefer the other refcounting diff. It works fine for me > and maybe a bit faster. from another perspective, if the use after free is the result mishandling mbufs, then the other diff is more forgiving of a buggy behaviour. Index: kern/uipc_mbuf.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v diff -u -p -r1.304 uipc_mbuf.c --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304 +++ kern/uipc_mbuf.c 22 Apr 2026 06:04:19 -0000 @@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level int max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); +struct m_ext_refs { + void *arg; + u_int free_fn; + u_int zero; + struct refcnt refs; +}; + +static struct pool m_ext_refs_pool; + +static void m_extfree_refs(caddr_t, u_int, void *); +u_int m_extfree_refs_fn; + +static int m_extref(struct mbuf *, struct mbuf *, int); +static void m_extfree(struct mbuf *); -void m_extfree(struct mbuf *); void m_zero(struct mbuf *); unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ @@ -174,6 +186,8 @@ mbinit(void) pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0, IPL_NET, 0, "mtagpl", NULL); + pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE, + IPL_NET, 0, "mextrefs", NULL); for (i = 0; i < nitems(mclsizes); i++) { lowbits = mclsizes[i] & ((1 << 10) - 1); @@ -193,6 +207,7 @@ mbinit(void) (void)mextfree_register(m_extfree_pool); KASSERT(num_extfree_fns == 1); + m_extfree_refs_fn = mextfree_register(m_extfree_refs); } void @@ -204,6 +219,7 @@ mbcpuinit(void) pool_cache_init(&mbpool); pool_cache_init(&mtagpool); + pool_cache_init(&m_ext_refs_pool); for (i = 0; i < nitems(mclsizes); i++) pool_cache_init(&mclpools[i]); @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, pool_put(pp, buf); } +int +m_ext_refs_shared(struct mbuf *m) +{ + struct m_ext_refs *mrefs = m->m_ext.ext_arg; + + return (refcnt_shared(&mrefs->refs)); +} + +static void +m_extfree_refs(caddr_t buf, u_int size, void *arg) +{ + struct m_ext_refs *mrefs = arg; + + if (refcnt_rele(&mrefs->refs)) { + if (mrefs->zero) + explicit_bzero(buf, size); + + KASSERT(mrefs->free_fn < num_extfree_fns); + KASSERT(mrefs->free_fn != m_extfree_refs_fn); + + mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg); + + pool_put(&m_ext_refs_pool, mrefs); + } +} + struct mbuf * m_free(struct mbuf *m) { @@ -434,44 +476,33 @@ m_free(struct mbuf *m) return (n); } -void -m_extref(struct mbuf *o, struct mbuf *n) +static int +m_extref(struct mbuf *m, struct mbuf *n, int how) { - int refs = MCLISREFERENCED(o); + struct m_ext_refs *mrefs; - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR); + if (m->m_ext.ext_free_fn == m_extfree_refs_fn) + mrefs = m->m_ext.ext_arg; + else { + mrefs = pool_get(&m_ext_refs_pool, how); + if (mrefs == NULL) + return (ENOMEM); - if (refs) - mtx_enter(&m_extref_mtx); - n->m_ext.ext_nextref = o->m_ext.ext_nextref; - n->m_ext.ext_prevref = o; - o->m_ext.ext_nextref = n; - n->m_ext.ext_nextref->m_ext.ext_prevref = n; - if (refs) - mtx_leave(&m_extref_mtx); - - MCLREFDEBUGN((n), __FILE__, __LINE__); -} + refcnt_init(&mrefs->refs); + mrefs->arg = m->m_ext.ext_arg; + mrefs->free_fn = m->m_ext.ext_free_fn; + mrefs->zero = 0; -static inline u_int -m_extunref(struct mbuf *m) -{ - int refs = 0; + m->m_ext.ext_arg = mrefs; + m->m_ext.ext_free_fn = m_extfree_refs_fn; + } - if (!MCLISREFERENCED(m)) - return (0); + refcnt_take(&mrefs->refs); - mtx_enter(&m_extref_mtx); - if (MCLISREFERENCED(m)) { - m->m_ext.ext_nextref->m_ext.ext_prevref = - m->m_ext.ext_prevref; - m->m_ext.ext_prevref->m_ext.ext_nextref = - m->m_ext.ext_nextref; - refs = 1; - } - mtx_leave(&m_extref_mtx); + MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size, + m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs); - return (refs); + return (0); } /* @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_ return num_extfree_fns++; } -void +static void m_extfree(struct mbuf *m) { - if (m_extunref(m) == 0) { - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, - m->m_ext.ext_size, m->m_ext.ext_arg); - } - + KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); + mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, + m->m_ext.ext_size, m->m_ext.ext_arg); + m->m_flags &= ~(M_EXT|M_EXTWR); } @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { + if (m_extref(m, n, wait) != 0) + goto nospace; n->m_data = m->m_data + off; - n->m_ext = m->m_ext; - MCLADDREFERENCE(m, n); } else { n->m_data += m->m_data - (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat); @@ -1089,8 +1118,12 @@ m_split(struct mbuf *m0, int len0, int w return (NULL); } if (m->m_flags & M_EXT) { - n->m_ext = m->m_ext; - MCLADDREFERENCE(m, n); + if (m_extref(m, n, wait) != 0) { + m_freem(n); + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len = olen; + return (NULL); + } n->m_data = m->m_data + len; } else { m_align(n, remain); @@ -1271,13 +1304,17 @@ m_devget(char *buf, int totlen, int off) void m_zero(struct mbuf *m) { - if (M_READONLY(m)) { - mtx_enter(&m_extref_mtx); - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) { - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE; - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE; - } - mtx_leave(&m_extref_mtx); + if (ISSET(m->m_flags, M_EXT) && + m->m_ext.ext_free_fn == m_extfree_refs_fn) { + struct m_ext_refs *mrefs = m->m_ext.ext_arg; + + /* + * this variable only transitions in one direction, + * so if there is a race it will be toward the same + * result and therefore there is no loss. + */ + + mrefs->zero = 1; return; } @@ -1525,9 +1562,7 @@ m_print(void *v, m->m_ext.ext_buf, m->m_ext.ext_size); (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n", m->m_ext.ext_free_fn, m->m_ext.ext_arg); - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n", - m->m_ext.ext_nextref, m->m_ext.ext_prevref); - + /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */ } } Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v diff -u -p -r1.269 mbuf.h --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269 +++ sys/mbuf.h 22 Apr 2026 06:04:19 -0000 @@ -145,8 +145,6 @@ struct mbuf_ext { void *ext_arg; u_int ext_free_fn; /* index of free function */ u_int ext_size; /* size of buffer, for ext_free_fn */ - struct mbuf *ext_nextref; - struct mbuf *ext_prevref; #ifdef DEBUG const char *ext_ofile; const char *ext_nfile; @@ -282,13 +280,22 @@ struct mbuf { #define MCLREFDEBUGO(m, file, line) #endif -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m)) +int m_ext_refs_shared(struct mbuf *); -#define MCLADDREFERENCE(o, n) m_extref((o), (n)) +static inline int +m_extreferenced(struct mbuf *m) +{ + extern u_int m_extfree_refs_fn; + + if (m->m_ext.ext_free_fn != m_extfree_refs_fn) + return (0); + + return (m_ext_refs_shared(m)); +} + +#define MCLISREFERENCED(m) m_extreferenced(m) #define MCLINITREFERENCE(m) do { \ - (m)->m_ext.ext_prevref = (m); \ - (m)->m_ext.ext_nextref = (m); \ MCLREFDEBUGO((m), __FILE__, __LINE__); \ MCLREFDEBUGN((m), NULL, 0); \ } while (/* CONSTCOND */ 0) @@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *); int m_trailingspace(struct mbuf *); void m_align(struct mbuf *, int); struct mbuf *m_clget(struct mbuf *, int, u_int); -void m_extref(struct mbuf *, struct mbuf *); void m_pool_init(struct pool *, u_int, u_int, const char *); u_int m_pool_used(void); void m_extfree_pool(caddr_t, u_int, void *);