Download raw body.
mbuf cluster m_extref_mtx contention
On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote:
> this is the proxy ref thing if you want to give it a go. the code is a
> lot simpler at least.
I have tested both variants of the diff again and with this one I
got a panic. It was the sixth repetition of my test runs, this
time with btrace profiling turned on.
[-- MARK -- Mon Mar 16 23:30:00 2026]
smr: dispatch took 8.999999s
[-- MARK -- Mon Mar 16 23:35:00 2026]
[-- MARK -- Mon Mar 16 23:40:00 2026]
[-- MARK -- Mon Mar 16 23:45:00 2026]
[-- MARK -- Mon Mar 16 23:50:00 2026]
WWuuvARAvmNR_NImfIuuvNa_vNmumGl_f: tGaS(:u_P0flLfatxa u(Sl0uPtx fflL(fN 0ftNxfOOfffT(Tfff fLf0ffOfdWfxf9E8R2f fEbfLff0D4ppppppppp4 paaaaaapaa9On9NapOnnnanniiinii 9cSdnaWcccin0:ic:YEanni::c c: S0ci Rnii: C0:c: Eic c A, : L 0 : Lx 1 1e 39 33 7 31 3 k D :c Eb O X8 e , r N: I 0 , n e T l dSY 0 iS a g 9 n
StoCALppeL 1d a3t3 3 EsaXvIeTc 0 tx9+
0xae: movl $0,%gs:0x688
TID PID UID PRFLAGS PFLAGS CPU COMMAND
143474 68367 0 0x2 0x80 14 btrace
89473 63102 0 0x100000 0 13 udpbench
1403 76119 0 0x100000 0 5 udpbench
386603 96549 0 0x100000 0 8 udpbench
*324301 80498 0 0x100000 0 12 udpbench
320043 65374 0 0x100000 0 2 udpbench
38951 54248 0 0x100000 0 9 udpbench
168506 2350 0 0x100000 0 6 udpbench
52911 10331 0 0x100000 0 3 udpbench
313170 79129 0 0x100000 0 11 udpbench
151831 75539 0 0x100000 0 10 udpbench
375649 58184 0 0x14000 0x200 4 softnet7
402131 25075 0 0x14000 0x200 1 softnet2
418053 24476 0 0x14000 0x200 7 softnet1
savectx() at savectx+0xae
end of kernel
end trace frame: 0x75df8c7afde0, count: 14
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports. Insufficient info makes it difficult to find and fix bugs.
ddb{12}>
ddb{12}> show panic
cpu29: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
cpu22: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
cpu20: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
cpu13: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 1) -> e
cpu11: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
cpu10: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
cpu9: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
cpu8: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 2) -> e
cpu7: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
cpu6: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 2) -> e
cpu4: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
cpu3: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
*cpu2: uvm_fault(0xfffffd904999d000, 0x1e93713b8, 0, 1) -> e
ddb{2}> trace
x86_ipi_db(ffff80005a5f3ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
x86_bus_space_io_read_1(3f8,5) at x86_bus_space_io_read_1+0x19
comcnputc(800,20) at comcnputc+0x13f
cnputc(20) at cnputc+0x47
db_putchar(31) at db_putchar+0x3ea
kprintf() at kprintf+0x12ff
db_printf(ffffffff826121a5) at db_printf+0x6d
fault(ffffffff826cf673) at fault+0x97
kpageflttrap(ffff8000645271c0,1e93713b8) at kpageflttrap+0x1aa
kerntrap(ffff8000645271c0) at kerntrap+0xcf
alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
refcnt_rele(fffffd8f07072650) at refcnt_rele+0x78
m_extfree_refs(fffffd800f37b800,800,fffffd8f07072640) at m_extfree_refs+0x31
m_free(fffffd8089ce3800) at m_free+0x170
m_freem(fffffd8073abf000) at m_freem+0x38
ifq_enqueue(ffff800002c78c00,fffffd8073abf000) at ifq_enqueue+0x98
if_enqueue_ifq(ffff800000393048,fffffd8073abf000) at if_enqueue_ifq+0x66
ether_output(ffff800000393048,fffffd8073abf000,fffffd8f05720460,0) at ether_output+0x9d
if_output_ml(ffff800000393048,ffff8000645274f8,fffffd8f05720460,0) at if_output_ml+0x5e
ip6_output(fffffd8082a49600,0,fffffd8f05720448,0,ffff800004f002b0,fffffd8f057204e0) at ip6_output+0x10d4
udp6_output(fffffd8f057203d8,fffffd8082a49600,0,0) at udp6_output+0x2ba
sosend(ffff800004f226a8,0,ffff800064527860,0,0,0) at sosend+0x432
sendit(ffff8000634f1298,3,ffff800064527948,0,ffff8000645279e0) at sendit+0x39a
sys_sendto(ffff8000634f1298,ffff800064527a60,ffff8000645279e0) at sys_sendto+0x6c
syscall(ffff800064527a60) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x75df8c7afde0, count: -28
ddb{2}> show register
rdi 0xffff80005a5f3ff0
rsi 0
rbp 0xffff800064526d40
rbx 0xffffffff82ac8418 ipifunc+0x38
rdx 0
rcx 0x7
rax 0xffffff7f
r8 0
r9 0
r10 0
r11 0x8c8668458d7cd5f3
r12 0x7
r13 0
r14 0xffff80005a5f3ff0
r15 0
rip 0xffffffff81680b46 x86_ipi_db+0x16
cs 0x8
rflags 0x286
rsp 0xffff800064526d30
ss 0x10
x86_ipi_db+0x16: leave
ddb{2}> show mbuf 0xfffffd8073abf000
mbuf 0xfffffd8073abf000
m_type: 32159 m_flags: 6b6c<M_EOR,M_EXTWR,M_VLANTAG,M_LOOP,M_BCAST,M_MCAST,M_AUTH,M_ZEROIZE,M_COMP>
m_next: 0xfffffd8074560100 m_nextpkt: 0x80001ad
m_data: 0x429402116efa7f87 m_len: 2149814396
m_dat: 0xfffffd8073abf020 m_pktdat: 0xfffffd8073abf070
ddb{2}> show mbuf 0xfffffd8089ce3800
mbuf 0xfffffd8089ce3800
m_type: 1 m_flags: 9<M_EXT,M_EXTWR>
m_next: 0x0 m_nextpkt: 0x0
m_data: 0xfffffd800f37bd44 m_len: 228
m_dat: 0xfffffd8089ce3820 m_pktdat: 0xfffffd8089ce3870
m_ext.ext_buf: 0xfffffd800f37b800 m_ext.ext_size: 2048
m_ext.ext_free_fn: 1 m_ext.ext_arg: 0xfffffd8f07072640
Not sure what is going on. I continue testing and try to reproduce.
bluhm
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.304 uipc_mbuf.c
> --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304
> +++ kern/uipc_mbuf.c 11 Feb 2026 01:29:41 -0000
> @@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level
> int max_protohdr; /* largest protocol header */
> int max_hdr; /* largest link+protocol header */
>
> -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +struct m_ext_refs {
> + void *arg;
> + u_int free_fn;
> + u_int zero;
> + struct refcnt refs;
> +};
> +
> +static struct pool m_ext_refs_pool;
> +
> +static void m_extfree_refs(caddr_t, u_int, void *);
> +u_int m_extfree_refs_fn;
> +
> +static int m_extref(struct mbuf *, struct mbuf *, int);
> +static void m_extfree(struct mbuf *);
>
> -void m_extfree(struct mbuf *);
> void m_zero(struct mbuf *);
>
> unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */
> @@ -174,6 +186,8 @@ mbinit(void)
>
> pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
> IPL_NET, 0, "mtagpl", NULL);
> + pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
> + IPL_NET, 0, "mextrefs", NULL);
>
> for (i = 0; i < nitems(mclsizes); i++) {
> lowbits = mclsizes[i] & ((1 << 10) - 1);
> @@ -193,6 +207,7 @@ mbinit(void)
>
> (void)mextfree_register(m_extfree_pool);
> KASSERT(num_extfree_fns == 1);
> + m_extfree_refs_fn = mextfree_register(m_extfree_refs);
> }
>
> void
> @@ -204,6 +219,7 @@ mbcpuinit(void)
>
> pool_cache_init(&mbpool);
> pool_cache_init(&mtagpool);
> + pool_cache_init(&m_ext_refs_pool);
>
> for (i = 0; i < nitems(mclsizes); i++)
> pool_cache_init(&mclpools[i]);
> @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size,
> pool_put(pp, buf);
> }
>
> +int
> +m_ext_refs_shared(struct mbuf *m)
> +{
> + struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> + return (refcnt_shared(&mrefs->refs));
> +}
> +
> +static void
> +m_extfree_refs(caddr_t buf, u_int size, void *arg)
> +{
> + struct m_ext_refs *mrefs = arg;
> +
> + if (refcnt_rele(&mrefs->refs)) {
> + if (mrefs->zero)
> + explicit_bzero(buf, size);
> +
> + KASSERT(mrefs->free_fn < num_extfree_fns);
> + KASSERT(mrefs->free_fn != m_extfree_refs_fn);
> +
> + mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
> +
> + pool_put(&m_ext_refs_pool, mrefs);
> + }
> +}
> +
> struct mbuf *
> m_free(struct mbuf *m)
> {
> @@ -434,44 +476,33 @@ m_free(struct mbuf *m)
> return (n);
> }
>
> -void
> -m_extref(struct mbuf *o, struct mbuf *n)
> +static int
> +m_extref(struct mbuf *m, struct mbuf *n, int how)
> {
> - int refs = MCLISREFERENCED(o);
> -
> - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
> + struct m_ext_refs *mrefs;
>
> - if (refs)
> - mtx_enter(&m_extref_mtx);
> - n->m_ext.ext_nextref = o->m_ext.ext_nextref;
> - n->m_ext.ext_prevref = o;
> - o->m_ext.ext_nextref = n;
> - n->m_ext.ext_nextref->m_ext.ext_prevref = n;
> - if (refs)
> - mtx_leave(&m_extref_mtx);
> + if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
> + mrefs = m->m_ext.ext_arg;
> + else {
> + mrefs = pool_get(&m_ext_refs_pool, how);
> + if (mrefs == NULL)
> + return (ENOMEM);
>
> - MCLREFDEBUGN((n), __FILE__, __LINE__);
> -}
> + refcnt_init(&mrefs->refs);
> + mrefs->arg = m->m_ext.ext_arg;
> + mrefs->free_fn = m->m_ext.ext_free_fn;
> + mrefs->zero = 0;
>
> -static inline u_int
> -m_extunref(struct mbuf *m)
> -{
> - int refs = 0;
> + m->m_ext.ext_arg = mrefs;
> + m->m_ext.ext_free_fn = m_extfree_refs_fn;
> + }
>
> - if (!MCLISREFERENCED(m))
> - return (0);
> + refcnt_take(&mrefs->refs);
>
> - mtx_enter(&m_extref_mtx);
> - if (MCLISREFERENCED(m)) {
> - m->m_ext.ext_nextref->m_ext.ext_prevref =
> - m->m_ext.ext_prevref;
> - m->m_ext.ext_prevref->m_ext.ext_nextref =
> - m->m_ext.ext_nextref;
> - refs = 1;
> - }
> - mtx_leave(&m_extref_mtx);
> + MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
> + m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
>
> - return (refs);
> + return (0);
> }
>
> /*
> @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
> return num_extfree_fns++;
> }
>
> -void
> +static void
> m_extfree(struct mbuf *m)
> {
> - if (m_extunref(m) == 0) {
> - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> - m->m_ext.ext_size, m->m_ext.ext_arg);
> - }
> -
> + KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> + mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> + m->m_ext.ext_size, m->m_ext.ext_arg);
> +
> m->m_flags &= ~(M_EXT|M_EXTWR);
> }
>
> @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
> }
> n->m_len = min(len, m->m_len - off);
> if (m->m_flags & M_EXT) {
> + if (m_extref(m, n, wait) != 0)
> + goto nospace;
> n->m_data = m->m_data + off;
> - n->m_ext = m->m_ext;
> - MCLADDREFERENCE(m, n);
> } else {
> n->m_data += m->m_data -
> (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
> @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w
> if (m == NULL)
> return (NULL);
> remain = m->m_len - len;
> + olen = m0->m_pkthdr.len;
> if (m0->m_flags & M_PKTHDR) {
> MGETHDR(n, wait, m0->m_type);
> if (n == NULL)
> @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w
> return (NULL);
> }
> n->m_pkthdr.len -= len0;
> - olen = m0->m_pkthdr.len;
> m0->m_pkthdr.len = len0;
> if (remain == 0) {
> n->m_next = m->m_next;
> @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w
> return (NULL);
> }
> if (m->m_flags & M_EXT) {
> - n->m_ext = m->m_ext;
> - MCLADDREFERENCE(m, n);
> + if (m_extref(m, n, wait) != 0) {
> + m_freem(n);
> + m0->m_pkthdr.len = olen;
> + return (NULL);
> + }
> n->m_data = m->m_data + len;
> } else {
> m_align(n, remain);
> @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off)
> void
> m_zero(struct mbuf *m)
> {
> - if (M_READONLY(m)) {
> - mtx_enter(&m_extref_mtx);
> - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
> - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
> - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
> - }
> - mtx_leave(&m_extref_mtx);
> + if (ISSET(m->m_flags, M_EXT) &&
> + m->m_ext.ext_free_fn == m_extfree_refs_fn) {
> + struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> + /*
> + * this variable only transitions in one direction,
> + * so if there is a race it will be toward the same
> + * result and therefore there is no loss.
> + */
> +
> + mrefs->zero = 1;
> return;
> }
>
> @@ -1525,9 +1561,7 @@ m_print(void *v,
> m->m_ext.ext_buf, m->m_ext.ext_size);
> (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
> m->m_ext.ext_free_fn, m->m_ext.ext_arg);
> - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
> - m->m_ext.ext_nextref, m->m_ext.ext_prevref);
> -
> + /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
> }
> }
>
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.269 mbuf.h
> --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269
> +++ sys/mbuf.h 11 Feb 2026 01:29:41 -0000
> @@ -145,8 +145,6 @@ struct mbuf_ext {
> void *ext_arg;
> u_int ext_free_fn; /* index of free function */
> u_int ext_size; /* size of buffer, for ext_free_fn */
> - struct mbuf *ext_nextref;
> - struct mbuf *ext_prevref;
> #ifdef DEBUG
> const char *ext_ofile;
> const char *ext_nfile;
> @@ -282,13 +280,22 @@ struct mbuf {
> #define MCLREFDEBUGO(m, file, line)
> #endif
>
> -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m))
> +int m_ext_refs_shared(struct mbuf *);
>
> -#define MCLADDREFERENCE(o, n) m_extref((o), (n))
> +static inline int
> +m_extreferenced(struct mbuf *m)
> +{
> + extern u_int m_extfree_refs_fn;
> +
> + if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
> + return (0);
> +
> + return (m_ext_refs_shared(m));
> +}
> +
> +#define MCLISREFERENCED(m) m_extreferenced(m)
>
> #define MCLINITREFERENCE(m) do { \
> - (m)->m_ext.ext_prevref = (m); \
> - (m)->m_ext.ext_nextref = (m); \
> MCLREFDEBUGO((m), __FILE__, __LINE__); \
> MCLREFDEBUGN((m), NULL, 0); \
> } while (/* CONSTCOND */ 0)
> @@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *);
> int m_trailingspace(struct mbuf *);
> void m_align(struct mbuf *, int);
> struct mbuf *m_clget(struct mbuf *, int, u_int);
> -void m_extref(struct mbuf *, struct mbuf *);
> void m_pool_init(struct pool *, u_int, u_int, const char *);
> u_int m_pool_used(void);
> void m_extfree_pool(caddr_t, u_int, void *);
>
>
mbuf cluster m_extref_mtx contention