Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
David Gwynne <david@gwynne.id.au>
Cc:
tech@openbsd.org
Date:
Tue, 17 Mar 2026 14:24:01 +0100

Download raw body.

Thread
On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote:
> this is the proxy ref thing if you want to give it a go. the code is a
> lot simpler at least.

I have tested both variants of the diff again and with this one I
got a panic.  It was the sixth repetition of my test runs, this
time with btrace profiling turned on.

[-- MARK -- Mon Mar 16 23:30:00 2026]
smr: dispatch took 8.999999s
[-- MARK -- Mon Mar 16 23:35:00 2026]
[-- MARK -- Mon Mar 16 23:40:00 2026]
[-- MARK -- Mon Mar 16 23:45:00 2026]
[-- MARK -- Mon Mar 16 23:50:00 2026]
WWuuvARAvmNR_NImfIuuvNa_vNmumGl_f: tGaS(:u_P0flLfatxa u(Sl0uPtx fflL(fN 0ftNxfOOfffT(Tfff fLf0ffOfdWfxf9E8R2f fEbfLff0D4ppppppppp4 paaaaaapaa9On9NapOnnnanniiinii 9cSdnaWcccin0:ic:YEanni::c   c: S0ci  Rnii:   C0:c:  Eic c       A,  :    L  0  : Lx      1     1e    39     33      7    31     3   k D   :c       Eb O   X8 e     ,  r N: I           0 , n    e      T    l                dSY 0     iS               a    g     9        n
               StoCALppeL 1d a3t3 3  EsaXvIeTc 0 tx9+
0xae:   movl    $0,%gs:0x688
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
 143474  68367      0         0x2       0x80   14  btrace
  89473  63102      0    0x100000          0   13  udpbench
   1403  76119      0    0x100000          0    5  udpbench
 386603  96549      0    0x100000          0    8  udpbench
*324301  80498      0    0x100000          0   12  udpbench
 320043  65374      0    0x100000          0    2  udpbench
  38951  54248      0    0x100000          0    9  udpbench
 168506   2350      0    0x100000          0    6  udpbench
  52911  10331      0    0x100000          0    3  udpbench
 313170  79129      0    0x100000          0   11  udpbench
 151831  75539      0    0x100000          0   10  udpbench
 375649  58184      0     0x14000      0x200    4  softnet7
 402131  25075      0     0x14000      0x200    1  softnet2
 418053  24476      0     0x14000      0x200    7  softnet1
savectx() at savectx+0xae
end of kernel
end trace frame: 0x75df8c7afde0, count: 14
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{12}>

ddb{12}> show panic
 cpu29: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
 cpu22: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
 cpu20: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
 cpu13: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 1) -> e
 cpu11: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
 cpu10: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_so
ftintr.c", line 71
 cpu9: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
 cpu8: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 2) -> e
 cpu7: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
 cpu6: uvm_fault(0xffffffff82b4bc10, 0xffffffffffffffff, 0, 2) -> e
 cpu4: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
 cpu3: kernel diagnostic assertion "(sih->sih_state & (SIS_PENDING | SIS_RESTART)) == SIS_PENDING" failed: file "/usr/src/sys/kern/kern_sof
tintr.c", line 71
*cpu2: uvm_fault(0xfffffd904999d000, 0x1e93713b8, 0, 1) -> e

ddb{2}> trace
x86_ipi_db(ffff80005a5f3ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
x86_bus_space_io_read_1(3f8,5) at x86_bus_space_io_read_1+0x19
comcnputc(800,20) at comcnputc+0x13f
cnputc(20) at cnputc+0x47
db_putchar(31) at db_putchar+0x3ea
kprintf() at kprintf+0x12ff
db_printf(ffffffff826121a5) at db_printf+0x6d
fault(ffffffff826cf673) at fault+0x97
kpageflttrap(ffff8000645271c0,1e93713b8) at kpageflttrap+0x1aa
kerntrap(ffff8000645271c0) at kerntrap+0xcf
alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
refcnt_rele(fffffd8f07072650) at refcnt_rele+0x78
m_extfree_refs(fffffd800f37b800,800,fffffd8f07072640) at m_extfree_refs+0x31
m_free(fffffd8089ce3800) at m_free+0x170
m_freem(fffffd8073abf000) at m_freem+0x38
ifq_enqueue(ffff800002c78c00,fffffd8073abf000) at ifq_enqueue+0x98
if_enqueue_ifq(ffff800000393048,fffffd8073abf000) at if_enqueue_ifq+0x66
ether_output(ffff800000393048,fffffd8073abf000,fffffd8f05720460,0) at ether_output+0x9d
if_output_ml(ffff800000393048,ffff8000645274f8,fffffd8f05720460,0) at if_output_ml+0x5e
ip6_output(fffffd8082a49600,0,fffffd8f05720448,0,ffff800004f002b0,fffffd8f057204e0) at ip6_output+0x10d4
udp6_output(fffffd8f057203d8,fffffd8082a49600,0,0) at udp6_output+0x2ba
sosend(ffff800004f226a8,0,ffff800064527860,0,0,0) at sosend+0x432
sendit(ffff8000634f1298,3,ffff800064527948,0,ffff8000645279e0) at sendit+0x39a
sys_sendto(ffff8000634f1298,ffff800064527a60,ffff8000645279e0) at sys_sendto+0x6c
syscall(ffff800064527a60) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x75df8c7afde0, count: -28

ddb{2}> show register
rdi               0xffff80005a5f3ff0
rsi                                0
rbp               0xffff800064526d40
rbx               0xffffffff82ac8418    ipifunc+0x38
rdx                                0
rcx                              0x7
rax                       0xffffff7f
r8                                 0
r9                                 0
r10                                0
r11               0x8c8668458d7cd5f3
r12                              0x7
r13                                0
r14               0xffff80005a5f3ff0
r15                                0
rip               0xffffffff81680b46    x86_ipi_db+0x16
cs                               0x8
rflags                         0x286
rsp               0xffff800064526d30
ss                              0x10
x86_ipi_db+0x16:        leave

ddb{2}> show mbuf 0xfffffd8073abf000
mbuf 0xfffffd8073abf000
m_type: 32159   m_flags: 6b6c<M_EOR,M_EXTWR,M_VLANTAG,M_LOOP,M_BCAST,M_MCAST,M_AUTH,M_ZEROIZE,M_COMP>
m_next: 0xfffffd8074560100      m_nextpkt: 0x80001ad
m_data: 0x429402116efa7f87      m_len: 2149814396
m_dat: 0xfffffd8073abf020       m_pktdat: 0xfffffd8073abf070

ddb{2}> show mbuf 0xfffffd8089ce3800
mbuf 0xfffffd8089ce3800
m_type: 1       m_flags: 9<M_EXT,M_EXTWR>
m_next: 0x0     m_nextpkt: 0x0
m_data: 0xfffffd800f37bd44      m_len: 228
m_dat: 0xfffffd8089ce3820       m_pktdat: 0xfffffd8089ce3870
m_ext.ext_buf: 0xfffffd800f37b800       m_ext.ext_size: 2048
m_ext.ext_free_fn: 1    m_ext.ext_arg: 0xfffffd8f07072640

Not sure what is going on.  I continue testing and try to reproduce.

bluhm

> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.304 uipc_mbuf.c
> --- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
> +++ kern/uipc_mbuf.c	11 Feb 2026 01:29:41 -0000
> @@ -123,9 +123,21 @@ int max_linkhdr;		/* largest link-level 
>  int max_protohdr;		/* largest protocol header */
>  int max_hdr;			/* largest link+protocol header */
>  
> -struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +struct m_ext_refs {
> +	void		*arg;
> +	u_int		 free_fn;
> +	u_int		 zero;
> +	struct refcnt	 refs;
> +};
> +
> +static struct pool	m_ext_refs_pool;
> +
> +static void		m_extfree_refs(caddr_t, u_int, void *);
> +u_int			m_extfree_refs_fn;
> +
> +static int		m_extref(struct mbuf *, struct mbuf *, int);
> +static void		m_extfree(struct mbuf *);
>  
> -void	m_extfree(struct mbuf *);
>  void	m_zero(struct mbuf *);
>  
>  unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
> @@ -174,6 +186,8 @@ mbinit(void)
>  
>  	pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
>  	    IPL_NET, 0, "mtagpl", NULL);
> +	pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
> +	    IPL_NET, 0, "mextrefs", NULL);
>  
>  	for (i = 0; i < nitems(mclsizes); i++) {
>  		lowbits = mclsizes[i] & ((1 << 10) - 1);
> @@ -193,6 +207,7 @@ mbinit(void)
>  
>  	(void)mextfree_register(m_extfree_pool);
>  	KASSERT(num_extfree_fns == 1);
> +	m_extfree_refs_fn = mextfree_register(m_extfree_refs);
>  }
>  
>  void
> @@ -204,6 +219,7 @@ mbcpuinit(void)
>  
>  	pool_cache_init(&mbpool);
>  	pool_cache_init(&mtagpool);
> +	pool_cache_init(&m_ext_refs_pool);
>  
>  	for (i = 0; i < nitems(mclsizes); i++)
>  		pool_cache_init(&mclpools[i]);
> @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, 
>  	pool_put(pp, buf);
>  }
>  
> +int
> +m_ext_refs_shared(struct mbuf *m)
> +{
> +	struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> +	return (refcnt_shared(&mrefs->refs));
> +}
> +
> +static void
> +m_extfree_refs(caddr_t buf, u_int size, void *arg)
> +{
> +	struct m_ext_refs *mrefs = arg;
> +
> +	if (refcnt_rele(&mrefs->refs)) {
> +		if (mrefs->zero)
> +			explicit_bzero(buf, size);
> +
> +		KASSERT(mrefs->free_fn < num_extfree_fns);
> +		KASSERT(mrefs->free_fn != m_extfree_refs_fn);
> +
> +		mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
> +
> +		pool_put(&m_ext_refs_pool, mrefs);
> +	}
> +}
> +
>  struct mbuf *
>  m_free(struct mbuf *m)
>  {
> @@ -434,44 +476,33 @@ m_free(struct mbuf *m)
>  	return (n);
>  }
>  
> -void
> -m_extref(struct mbuf *o, struct mbuf *n)
> +static int
> +m_extref(struct mbuf *m, struct mbuf *n, int how)
>  {
> -	int refs = MCLISREFERENCED(o);
> -
> -	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
> +	struct m_ext_refs *mrefs;
>  
> -	if (refs)
> -		mtx_enter(&m_extref_mtx);
> -	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
> -	n->m_ext.ext_prevref = o;
> -	o->m_ext.ext_nextref = n;
> -	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
> -	if (refs)
> -		mtx_leave(&m_extref_mtx);
> +	if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
> +		mrefs = m->m_ext.ext_arg;
> +	else {
> +		mrefs = pool_get(&m_ext_refs_pool, how);
> +		if (mrefs == NULL)
> +			return (ENOMEM);
>  
> -	MCLREFDEBUGN((n), __FILE__, __LINE__);
> -}
> +		refcnt_init(&mrefs->refs);
> +		mrefs->arg = m->m_ext.ext_arg;
> +		mrefs->free_fn = m->m_ext.ext_free_fn;
> +		mrefs->zero = 0;
>  
> -static inline u_int
> -m_extunref(struct mbuf *m)
> -{
> -	int refs = 0;
> +		m->m_ext.ext_arg = mrefs;
> +		m->m_ext.ext_free_fn = m_extfree_refs_fn;
> +	}
>  
> -	if (!MCLISREFERENCED(m))
> -		return (0);
> +	refcnt_take(&mrefs->refs);
>  
> -	mtx_enter(&m_extref_mtx);
> -	if (MCLISREFERENCED(m)) {
> -		m->m_ext.ext_nextref->m_ext.ext_prevref =
> -		    m->m_ext.ext_prevref;
> -		m->m_ext.ext_prevref->m_ext.ext_nextref =
> -		    m->m_ext.ext_nextref;
> -		refs = 1;
> -	}
> -	mtx_leave(&m_extref_mtx);
> +	MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
> +	    m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
>  
> -	return (refs);
> +	return (0);
>  }
>  
>  /*
> @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
>  	return num_extfree_fns++;
>  }
>  
> -void
> +static void
>  m_extfree(struct mbuf *m)
>  {
> -	if (m_extunref(m) == 0) {
> -		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> -		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> -		    m->m_ext.ext_size, m->m_ext.ext_arg);
> -	}
> -
> +	KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> +	mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> +	    m->m_ext.ext_size, m->m_ext.ext_arg);
> + 
>  	m->m_flags &= ~(M_EXT|M_EXTWR);
>  }
>  
> @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
>  		}
>  		n->m_len = min(len, m->m_len - off);
>  		if (m->m_flags & M_EXT) {
> +			if (m_extref(m, n, wait) != 0)
> +				goto nospace;
>  			n->m_data = m->m_data + off;
> -			n->m_ext = m->m_ext;
> -			MCLADDREFERENCE(m, n);
>  		} else {
>  			n->m_data += m->m_data -
>  			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
> @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w
>  	if (m == NULL)
>  		return (NULL);
>  	remain = m->m_len - len;
> +	olen = m0->m_pkthdr.len;
>  	if (m0->m_flags & M_PKTHDR) {
>  		MGETHDR(n, wait, m0->m_type);
>  		if (n == NULL)
> @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w
>  			return (NULL);
>  		}
>  		n->m_pkthdr.len -= len0;
> -		olen = m0->m_pkthdr.len;
>  		m0->m_pkthdr.len = len0;
>  		if (remain == 0) {
>  			n->m_next = m->m_next;
> @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w
>  			return (NULL);
>  	}
>  	if (m->m_flags & M_EXT) {
> -		n->m_ext = m->m_ext;
> -		MCLADDREFERENCE(m, n);
> +		if (m_extref(m, n, wait) != 0) {
> +			m_freem(n);
> +			m0->m_pkthdr.len = olen;
> +			return (NULL);
> +		}
>  		n->m_data = m->m_data + len;
>  	} else {
>  		m_align(n, remain);
> @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off)
>  void
>  m_zero(struct mbuf *m)
>  {
> -	if (M_READONLY(m)) {
> -		mtx_enter(&m_extref_mtx);
> -		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
> -			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
> -			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
> -		}
> -		mtx_leave(&m_extref_mtx);
> +	if (ISSET(m->m_flags, M_EXT) &&
> +	    m->m_ext.ext_free_fn == m_extfree_refs_fn) {
> +		struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> +		/*
> +		 * this variable only transitions in one direction,
> +		 * so if there is a race it will be toward the same
> +		 * result and therefore there is no loss.
> +		 */
> +
> +		mrefs->zero = 1;
>  		return;
>  	}
>  
> @@ -1525,9 +1561,7 @@ m_print(void *v,
>  		    m->m_ext.ext_buf, m->m_ext.ext_size);
>  		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
>  		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
> -		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
> -		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
> -
> +		/* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
>  	}
>  }
>  
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.269 mbuf.h
> --- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
> +++ sys/mbuf.h	11 Feb 2026 01:29:41 -0000
> @@ -145,8 +145,6 @@ struct mbuf_ext {
>  	void	*ext_arg;
>  	u_int	ext_free_fn;		/* index of free function */
>  	u_int	ext_size;		/* size of buffer, for ext_free_fn */
> -	struct mbuf *ext_nextref;
> -	struct mbuf *ext_prevref;
>  #ifdef DEBUG
>  	const char *ext_ofile;
>  	const char *ext_nfile;
> @@ -282,13 +280,22 @@ struct mbuf {
>  #define MCLREFDEBUGO(m, file, line)
>  #endif
>  
> -#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
> +int	m_ext_refs_shared(struct mbuf *);
>  
> -#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
> +static inline int
> +m_extreferenced(struct mbuf *m)
> +{
> +	extern u_int m_extfree_refs_fn;
> +
> +	if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
> +		return (0);
> +
> +	return (m_ext_refs_shared(m));
> +}
> +
> +#define	MCLISREFERENCED(m)	m_extreferenced(m)
>  
>  #define	MCLINITREFERENCE(m)	do {					\
> -		(m)->m_ext.ext_prevref = (m);				\
> -		(m)->m_ext.ext_nextref = (m);				\
>  		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
>  		MCLREFDEBUGN((m), NULL, 0);				\
>  	} while (/* CONSTCOND */ 0)
> @@ -439,7 +446,6 @@ int	m_leadingspace(struct mbuf *);
>  int	m_trailingspace(struct mbuf *);
>  void	m_align(struct mbuf *, int);
>  struct mbuf *m_clget(struct mbuf *, int, u_int);
> -void	m_extref(struct mbuf *, struct mbuf *);
>  void	m_pool_init(struct pool *, u_int, u_int, const char *);
>  u_int	m_pool_used(void);
>  void	m_extfree_pool(caddr_t, u_int, void *);
> 
>