From: Alexander Bluhm Subject: Re: mbuf cluster m_extref_mtx contention To: David Gwynne Cc: tech@openbsd.org Date: Tue, 17 Mar 2026 14:24:01 +0100 On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote: > this is the proxy ref thing if you want to give it a go. the code is a > lot simpler at least. I have tested both variants of the diff again and with this one I got a panic. It was the sixth repetition of my test runs, this time with btrace profiling turned on. [-- MARK -- Mon Mar 16 23:30:00 2026] smr: dispatch took 8.999999s [-- MARK -- Mon Mar 16 23:35:00 2026] [-- MARK -- Mon Mar 16 23:40:00 2026] [-- MARK -- Mon Mar 16 23:45:00 2026] [-- MARK -- Mon Mar 16 23:50:00 2026] WWuuvARAvmNR_NImfIuuvNa_vNmumGl_f: tGaS(:u_P0flLfatxa u(Sl0uPtx fflL(fN 0ftNxfOOfffT(Tfff fLf0ffOfdWfxf9E8R2f fEbfLff0D4ppppppppp4 paaaaaapaa9On9NapOnnnanniiinii 9cSdnaWcccin0:ic:YEanni::c c: S0ci Rnii: C0:c: Eic c A, : L 0 : Lx 1 1e 39 33 7 31 3 k D :c Eb O X8 e , r N: I 0 , n e T l dSY 0 iS a g 9 n StoCALppeL 1d a3t3 3 EsaXvIeTc 0 tx9+ 0xae: movl $0,%gs:0x688 TID PID UID PRFLAGS PFLAGS CPU COMMAND 143474 68367 0 0x2 0x80 14 btrace 89473 63102 0 0x100000 0 13 udpbench 1403 76119 0 0x100000 0 5 udpbench 386603 96549 0 0x100000 0 8 udpbench *324301 80498 0 0x100000 0 12 udpbench 320043 65374 0 0x100000 0 2 udpbench 38951 54248 0 0x100000 0 9 udpbench 168506 2350 0 0x100000 0 6 udpbench 52911 10331 0 0x100000 0 3 udpbench 313170 79129 0 0x100000 0 11 udpbench 151831 75539 0 0x100000 0 10 udpbench 375649 58184 0 0x14000 0x200 4 softnet7 402131 25075 0 0x14000 0x200 1 softnet2 418053 24476 0 0x14000 0x200 7 softnet1 savectx() at savectx+0xae end of kernel end trace frame: 0x75df8c7afde0, count: 14 https://www.openbsd.org/ddb.html describes the minimum info required in bug reports. Insufficient info makes it difficult to find and fix bugs. ddb{12}> ddb{12}> show panic cpu29: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so ftintr.c", line 71 cpu22: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so ftintr.c", line 71 cpu20: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so ftintr.c", line 71 cpu13: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 1) -> e cpu11: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so ftintr.c", line 71 cpu10: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so ftintr.c", line 71 cpu9: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof tintr.c", line 71 cpu8: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 2) -> e cpu7: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof tintr.c", line 71 cpu6: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 2) -> e cpu4: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof tintr.c", line 71 cpu3: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof tintr.c", line 71 *cpu2: uvm_fault(0xfffffd904999d000, 0x1e93713b8, 0, 1) -> e ddb{2}> trace x86_ipi_db(ffff80005a5f3ff0) at x86_ipi_db+0x16 x86_ipi_handler() at x86_ipi_handler+0x80 Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27 x86_bus_space_io_read_1(3f8,5) at x86_bus_space_io_read_1+0x19 comcnputc(800,20) at comcnputc+0x13f cnputc(20) at cnputc+0x47 db_putchar(31) at db_putchar+0x3ea kprintf() at kprintf+0x12ff db_printf(ffffffff826121a5) at db_printf+0x6d fault(ffffffff826cf673) at fault+0x97 kpageflttrap(ffff8000645271c0,1e93713b8) at kpageflttrap+0x1aa kerntrap(ffff8000645271c0) at kerntrap+0xcf alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b refcnt_rele(fffffd8f07072650) at refcnt_rele+0x78 m_extfree_refs(fffffd800f37b800,800,fffffd8f07072640) at m_extfree_refs+0x31 m_free(fffffd8089ce3800) at m_free+0x170 m_freem(fffffd8073abf000) at m_freem+0x38 ifq_enqueue(ffff800002c78c00,fffffd8073abf000) at ifq_enqueue+0x98 if_enqueue_ifq(ffff800000393048,fffffd8073abf000) at if_enqueue_ifq+0x66 ether_output(ffff800000393048,fffffd8073abf000,fffffd8f05720460,0) at ether_output+0x9d if_output_ml(ffff800000393048,ffff8000645274f8,fffffd8f05720460,0) at if_output_ml+0x5e ip6_output(fffffd8082a49600,0,fffffd8f05720448,0,ffff800004f002b0,fffffd8f057204e0) at ip6_output+0x10d4 udp6_output(fffffd8f057203d8,fffffd8082a49600,0,0) at udp6_output+0x2ba sosend(ffff800004f226a8,0,ffff800064527860,0,0,0) at sosend+0x432 sendit(ffff8000634f1298,3,ffff800064527948,0,ffff8000645279e0) at sendit+0x39a sys_sendto(ffff8000634f1298,ffff800064527a60,ffff8000645279e0) at sys_sendto+0x6c syscall(ffff800064527a60) at syscall+0x5f9 Xsyscall() at Xsyscall+0x128 end of kernel end trace frame: 0x75df8c7afde0, count: -28 ddb{2}> show register rdi 0xffff80005a5f3ff0 rsi 0 rbp 0xffff800064526d40 rbx 0xffffffff82ac8418 ipifunc+0x38 rdx 0 rcx 0x7 rax 0xffffff7f r8 0 r9 0 r10 0 r11 0x8c8668458d7cd5f3 r12 0x7 r13 0 r14 0xffff80005a5f3ff0 r15 0 rip 0xffffffff81680b46 x86_ipi_db+0x16 cs 0x8 rflags 0x286 rsp 0xffff800064526d30 ss 0x10 x86_ipi_db+0x16: leave ddb{2}> show mbuf 0xfffffd8073abf000 mbuf 0xfffffd8073abf000 m_type: 32159 m_flags: 6b6c m_next: 0xfffffd8074560100 m_nextpkt: 0x80001ad m_data: 0x429402116efa7f87 m_len: 2149814396 m_dat: 0xfffffd8073abf020 m_pktdat: 0xfffffd8073abf070 ddb{2}> show mbuf 0xfffffd8089ce3800 mbuf 0xfffffd8089ce3800 m_type: 1 m_flags: 9 m_next: 0x0 m_nextpkt: 0x0 m_data: 0xfffffd800f37bd44 m_len: 228 m_dat: 0xfffffd8089ce3820 m_pktdat: 0xfffffd8089ce3870 m_ext.ext_buf: 0xfffffd800f37b800 m_ext.ext_size: 2048 m_ext.ext_free_fn: 1 m_ext.ext_arg: 0xfffffd8f07072640 Not sure what is going on. I continue testing and try to reproduce. bluhm > Index: kern/uipc_mbuf.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v > diff -u -p -r1.304 uipc_mbuf.c > --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304 > +++ kern/uipc_mbuf.c 11 Feb 2026 01:29:41 -0000 > @@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level > int max_protohdr; /* largest protocol header */ > int max_hdr; /* largest link+protocol header */ > > -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); > +struct m_ext_refs { > + void *arg; > + u_int free_fn; > + u_int zero; > + struct refcnt refs; > +}; > + > +static struct pool m_ext_refs_pool; > + > +static void m_extfree_refs(caddr_t, u_int, void *); > +u_int m_extfree_refs_fn; > + > +static int m_extref(struct mbuf *, struct mbuf *, int); > +static void m_extfree(struct mbuf *); > > -void m_extfree(struct mbuf *); > void m_zero(struct mbuf *); > > unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ > @@ -174,6 +186,8 @@ mbinit(void) > > pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0, > IPL_NET, 0, "mtagpl", NULL); > + pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE, > + IPL_NET, 0, "mextrefs", NULL); > > for (i = 0; i < nitems(mclsizes); i++) { > lowbits = mclsizes[i] & ((1 << 10) - 1); > @@ -193,6 +207,7 @@ mbinit(void) > > (void)mextfree_register(m_extfree_pool); > KASSERT(num_extfree_fns == 1); > + m_extfree_refs_fn = mextfree_register(m_extfree_refs); > } > > void > @@ -204,6 +219,7 @@ mbcpuinit(void) > > pool_cache_init(&mbpool); > pool_cache_init(&mtagpool); > + pool_cache_init(&m_ext_refs_pool); > > for (i = 0; i < nitems(mclsizes); i++) > pool_cache_init(&mclpools[i]); > @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, > pool_put(pp, buf); > } > > +int > +m_ext_refs_shared(struct mbuf *m) > +{ > + struct m_ext_refs *mrefs = m->m_ext.ext_arg; > + > + return (refcnt_shared(&mrefs->refs)); > +} > + > +static void > +m_extfree_refs(caddr_t buf, u_int size, void *arg) > +{ > + struct m_ext_refs *mrefs = arg; > + > + if (refcnt_rele(&mrefs->refs)) { > + if (mrefs->zero) > + explicit_bzero(buf, size); > + > + KASSERT(mrefs->free_fn < num_extfree_fns); > + KASSERT(mrefs->free_fn != m_extfree_refs_fn); > + > + mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg); > + > + pool_put(&m_ext_refs_pool, mrefs); > + } > +} > + > struct mbuf * > m_free(struct mbuf *m) > { > @@ -434,44 +476,33 @@ m_free(struct mbuf *m) > return (n); > } > > -void > -m_extref(struct mbuf *o, struct mbuf *n) > +static int > +m_extref(struct mbuf *m, struct mbuf *n, int how) > { > - int refs = MCLISREFERENCED(o); > - > - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR); > + struct m_ext_refs *mrefs; > > - if (refs) > - mtx_enter(&m_extref_mtx); > - n->m_ext.ext_nextref = o->m_ext.ext_nextref; > - n->m_ext.ext_prevref = o; > - o->m_ext.ext_nextref = n; > - n->m_ext.ext_nextref->m_ext.ext_prevref = n; > - if (refs) > - mtx_leave(&m_extref_mtx); > + if (m->m_ext.ext_free_fn == m_extfree_refs_fn) > + mrefs = m->m_ext.ext_arg; > + else { > + mrefs = pool_get(&m_ext_refs_pool, how); > + if (mrefs == NULL) > + return (ENOMEM); > > - MCLREFDEBUGN((n), __FILE__, __LINE__); > -} > + refcnt_init(&mrefs->refs); > + mrefs->arg = m->m_ext.ext_arg; > + mrefs->free_fn = m->m_ext.ext_free_fn; > + mrefs->zero = 0; > > -static inline u_int > -m_extunref(struct mbuf *m) > -{ > - int refs = 0; > + m->m_ext.ext_arg = mrefs; > + m->m_ext.ext_free_fn = m_extfree_refs_fn; > + } > > - if (!MCLISREFERENCED(m)) > - return (0); > + refcnt_take(&mrefs->refs); > > - mtx_enter(&m_extref_mtx); > - if (MCLISREFERENCED(m)) { > - m->m_ext.ext_nextref->m_ext.ext_prevref = > - m->m_ext.ext_prevref; > - m->m_ext.ext_prevref->m_ext.ext_nextref = > - m->m_ext.ext_nextref; > - refs = 1; > - } > - mtx_leave(&m_extref_mtx); > + MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size, > + m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs); > > - return (refs); > + return (0); > } > > /* > @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_ > return num_extfree_fns++; > } > > -void > +static void > m_extfree(struct mbuf *m) > { > - if (m_extunref(m) == 0) { > - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); > - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, > - m->m_ext.ext_size, m->m_ext.ext_arg); > - } > - > + KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); > + mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, > + m->m_ext.ext_size, m->m_ext.ext_arg); > + > m->m_flags &= ~(M_EXT|M_EXTWR); > } > > @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le > } > n->m_len = min(len, m->m_len - off); > if (m->m_flags & M_EXT) { > + if (m_extref(m, n, wait) != 0) > + goto nospace; > n->m_data = m->m_data + off; > - n->m_ext = m->m_ext; > - MCLADDREFERENCE(m, n); > } else { > n->m_data += m->m_data - > (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat); > @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w > if (m == NULL) > return (NULL); > remain = m->m_len - len; > + olen = m0->m_pkthdr.len; > if (m0->m_flags & M_PKTHDR) { > MGETHDR(n, wait, m0->m_type); > if (n == NULL) > @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w > return (NULL); > } > n->m_pkthdr.len -= len0; > - olen = m0->m_pkthdr.len; > m0->m_pkthdr.len = len0; > if (remain == 0) { > n->m_next = m->m_next; > @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w > return (NULL); > } > if (m->m_flags & M_EXT) { > - n->m_ext = m->m_ext; > - MCLADDREFERENCE(m, n); > + if (m_extref(m, n, wait) != 0) { > + m_freem(n); > + m0->m_pkthdr.len = olen; > + return (NULL); > + } > n->m_data = m->m_data + len; > } else { > m_align(n, remain); > @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off) > void > m_zero(struct mbuf *m) > { > - if (M_READONLY(m)) { > - mtx_enter(&m_extref_mtx); > - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) { > - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE; > - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE; > - } > - mtx_leave(&m_extref_mtx); > + if (ISSET(m->m_flags, M_EXT) && > + m->m_ext.ext_free_fn == m_extfree_refs_fn) { > + struct m_ext_refs *mrefs = m->m_ext.ext_arg; > + > + /* > + * this variable only transitions in one direction, > + * so if there is a race it will be toward the same > + * result and therefore there is no loss. > + */ > + > + mrefs->zero = 1; > return; > } > > @@ -1525,9 +1561,7 @@ m_print(void *v, > m->m_ext.ext_buf, m->m_ext.ext_size); > (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n", > m->m_ext.ext_free_fn, m->m_ext.ext_arg); > - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n", > - m->m_ext.ext_nextref, m->m_ext.ext_prevref); > - > + /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */ > } > } > > Index: sys/mbuf.h > =================================================================== > RCS file: /cvs/src/sys/sys/mbuf.h,v > diff -u -p -r1.269 mbuf.h > --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269 > +++ sys/mbuf.h 11 Feb 2026 01:29:41 -0000 > @@ -145,8 +145,6 @@ struct mbuf_ext { > void *ext_arg; > u_int ext_free_fn; /* index of free function */ > u_int ext_size; /* size of buffer, for ext_free_fn */ > - struct mbuf *ext_nextref; > - struct mbuf *ext_prevref; > #ifdef DEBUG > const char *ext_ofile; > const char *ext_nfile; > @@ -282,13 +280,22 @@ struct mbuf { > #define MCLREFDEBUGO(m, file, line) > #endif > > -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m)) > +int m_ext_refs_shared(struct mbuf *); > > -#define MCLADDREFERENCE(o, n) m_extref((o), (n)) > +static inline int > +m_extreferenced(struct mbuf *m) > +{ > + extern u_int m_extfree_refs_fn; > + > + if (m->m_ext.ext_free_fn != m_extfree_refs_fn) > + return (0); > + > + return (m_ext_refs_shared(m)); > +} > + > +#define MCLISREFERENCED(m) m_extreferenced(m) > > #define MCLINITREFERENCE(m) do { \ > - (m)->m_ext.ext_prevref = (m); \ > - (m)->m_ext.ext_nextref = (m); \ > MCLREFDEBUGO((m), __FILE__, __LINE__); \ > MCLREFDEBUGN((m), NULL, 0); \ > } while (/* CONSTCOND */ 0) > @@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *); > int m_trailingspace(struct mbuf *); > void m_align(struct mbuf *, int); > struct mbuf *m_clget(struct mbuf *, int, u_int); > -void m_extref(struct mbuf *, struct mbuf *); > void m_pool_init(struct pool *, u_int, u_int, const char *); > u_int m_pool_used(void); > void m_extfree_pool(caddr_t, u_int, void *); > >