Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
David Gwynne <david@gwynne.id.au>
Cc:
tech@openbsd.org
Date:
Wed, 11 Feb 2026 17:42:39 -0500

Download raw body.

Thread
On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote:
> On Tue, Feb 10, 2026 at 07:21:56PM -0500, Alexander Bluhm wrote:
> > On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
> > > the approach freebsd appears to have taken here is reference counting
> > > the shared cluster. one of the mbufs is picked to contain a reference
> > > counter, and the other mbufs pointed at it and use it to coordinate
> > > when the memory can be freed.
> > > 
> > > the diff below implements this idea. of course, most of the complexity
> > > is in handling the deferred free of the "leader" mbuf. it appears
> > > to work, but i haven't pushed it very hard.
> > 
> > Works great for me.  The socket splicing test that triggers the
> > problem goes from 30 to 45 GBit/sec throughput.  The contention on
> > m_extfree is gone.
> > 
> > before:
> > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> > after:
> > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> > 
> > > id argue m_zero() has an existing bug where it fiddles with another
> > > mbufs m_flags, but not all m_flags operations are serialised with that
> > > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster
> > > as needing zeroing instead.
> > 
> > Good catch.
> > 
> > > the other option i considered was to allocate something on demand to
> > > hold the cluster reference count, and have it proxy the ext_free_fn
> > > call.
> > 
> > Allocating the hot path is not a good idea.  We should avoid that.
> 
> unless it's cheap enough...
> 
> > 
> > Diff below is OK bluhm.  But I am still testing it on more machines.
> > 
> > bluhm
> 
> this is the proxy ref thing if you want to give it a go. the code is a
> lot simpler at least.

It also works and performance is simmilar.  But I like the other
diff without extra allocations much more.

All numbers are here, but the page is a bit messy.
http://bluhm.genua.de/netlink/results/2026-02-10T01:48:41Z/netlink.html

bluhm

> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.304 uipc_mbuf.c
> --- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
> +++ kern/uipc_mbuf.c	11 Feb 2026 01:29:41 -0000
> @@ -123,9 +123,21 @@ int max_linkhdr;		/* largest link-level 
>  int max_protohdr;		/* largest protocol header */
>  int max_hdr;			/* largest link+protocol header */
>  
> -struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +struct m_ext_refs {
> +	void		*arg;
> +	u_int		 free_fn;
> +	u_int		 zero;
> +	struct refcnt	 refs;
> +};
> +
> +static struct pool	m_ext_refs_pool;
> +
> +static void		m_extfree_refs(caddr_t, u_int, void *);
> +u_int			m_extfree_refs_fn;
> +
> +static int		m_extref(struct mbuf *, struct mbuf *, int);
> +static void		m_extfree(struct mbuf *);
>  
> -void	m_extfree(struct mbuf *);
>  void	m_zero(struct mbuf *);
>  
>  unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
> @@ -174,6 +186,8 @@ mbinit(void)
>  
>  	pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
>  	    IPL_NET, 0, "mtagpl", NULL);
> +	pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
> +	    IPL_NET, 0, "mextrefs", NULL);
>  
>  	for (i = 0; i < nitems(mclsizes); i++) {
>  		lowbits = mclsizes[i] & ((1 << 10) - 1);
> @@ -193,6 +207,7 @@ mbinit(void)
>  
>  	(void)mextfree_register(m_extfree_pool);
>  	KASSERT(num_extfree_fns == 1);
> +	m_extfree_refs_fn = mextfree_register(m_extfree_refs);
>  }
>  
>  void
> @@ -204,6 +219,7 @@ mbcpuinit(void)
>  
>  	pool_cache_init(&mbpool);
>  	pool_cache_init(&mtagpool);
> +	pool_cache_init(&m_ext_refs_pool);
>  
>  	for (i = 0; i < nitems(mclsizes); i++)
>  		pool_cache_init(&mclpools[i]);
> @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, 
>  	pool_put(pp, buf);
>  }
>  
> +int
> +m_ext_refs_shared(struct mbuf *m)
> +{
> +	struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> +	return (refcnt_shared(&mrefs->refs));
> +}
> +
> +static void
> +m_extfree_refs(caddr_t buf, u_int size, void *arg)
> +{
> +	struct m_ext_refs *mrefs = arg;
> +
> +	if (refcnt_rele(&mrefs->refs)) {
> +		if (mrefs->zero)
> +			explicit_bzero(buf, size);
> +
> +		KASSERT(mrefs->free_fn < num_extfree_fns);
> +		KASSERT(mrefs->free_fn != m_extfree_refs_fn);
> +
> +		mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
> +
> +		pool_put(&m_ext_refs_pool, mrefs);
> +	}
> +}
> +
>  struct mbuf *
>  m_free(struct mbuf *m)
>  {
> @@ -434,44 +476,33 @@ m_free(struct mbuf *m)
>  	return (n);
>  }
>  
> -void
> -m_extref(struct mbuf *o, struct mbuf *n)
> +static int
> +m_extref(struct mbuf *m, struct mbuf *n, int how)
>  {
> -	int refs = MCLISREFERENCED(o);
> -
> -	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
> +	struct m_ext_refs *mrefs;
>  
> -	if (refs)
> -		mtx_enter(&m_extref_mtx);
> -	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
> -	n->m_ext.ext_prevref = o;
> -	o->m_ext.ext_nextref = n;
> -	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
> -	if (refs)
> -		mtx_leave(&m_extref_mtx);
> +	if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
> +		mrefs = m->m_ext.ext_arg;
> +	else {
> +		mrefs = pool_get(&m_ext_refs_pool, how);
> +		if (mrefs == NULL)
> +			return (ENOMEM);
>  
> -	MCLREFDEBUGN((n), __FILE__, __LINE__);
> -}
> +		refcnt_init(&mrefs->refs);
> +		mrefs->arg = m->m_ext.ext_arg;
> +		mrefs->free_fn = m->m_ext.ext_free_fn;
> +		mrefs->zero = 0;
>  
> -static inline u_int
> -m_extunref(struct mbuf *m)
> -{
> -	int refs = 0;
> +		m->m_ext.ext_arg = mrefs;
> +		m->m_ext.ext_free_fn = m_extfree_refs_fn;
> +	}
>  
> -	if (!MCLISREFERENCED(m))
> -		return (0);
> +	refcnt_take(&mrefs->refs);
>  
> -	mtx_enter(&m_extref_mtx);
> -	if (MCLISREFERENCED(m)) {
> -		m->m_ext.ext_nextref->m_ext.ext_prevref =
> -		    m->m_ext.ext_prevref;
> -		m->m_ext.ext_prevref->m_ext.ext_nextref =
> -		    m->m_ext.ext_nextref;
> -		refs = 1;
> -	}
> -	mtx_leave(&m_extref_mtx);
> +	MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
> +	    m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
>  
> -	return (refs);
> +	return (0);
>  }
>  
>  /*
> @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
>  	return num_extfree_fns++;
>  }
>  
> -void
> +static void
>  m_extfree(struct mbuf *m)
>  {
> -	if (m_extunref(m) == 0) {
> -		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> -		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> -		    m->m_ext.ext_size, m->m_ext.ext_arg);
> -	}
> -
> +	KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> +	mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> +	    m->m_ext.ext_size, m->m_ext.ext_arg);
> + 
>  	m->m_flags &= ~(M_EXT|M_EXTWR);
>  }
>  
> @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
>  		}
>  		n->m_len = min(len, m->m_len - off);
>  		if (m->m_flags & M_EXT) {
> +			if (m_extref(m, n, wait) != 0)
> +				goto nospace;
>  			n->m_data = m->m_data + off;
> -			n->m_ext = m->m_ext;
> -			MCLADDREFERENCE(m, n);
>  		} else {
>  			n->m_data += m->m_data -
>  			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
> @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w
>  	if (m == NULL)
>  		return (NULL);
>  	remain = m->m_len - len;
> +	olen = m0->m_pkthdr.len;
>  	if (m0->m_flags & M_PKTHDR) {
>  		MGETHDR(n, wait, m0->m_type);
>  		if (n == NULL)
> @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w
>  			return (NULL);
>  		}
>  		n->m_pkthdr.len -= len0;
> -		olen = m0->m_pkthdr.len;
>  		m0->m_pkthdr.len = len0;
>  		if (remain == 0) {
>  			n->m_next = m->m_next;
> @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w
>  			return (NULL);
>  	}
>  	if (m->m_flags & M_EXT) {
> -		n->m_ext = m->m_ext;
> -		MCLADDREFERENCE(m, n);
> +		if (m_extref(m, n, wait) != 0) {
> +			m_freem(n);
> +			m0->m_pkthdr.len = olen;
> +			return (NULL);
> +		}
>  		n->m_data = m->m_data + len;
>  	} else {
>  		m_align(n, remain);
> @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off)
>  void
>  m_zero(struct mbuf *m)
>  {
> -	if (M_READONLY(m)) {
> -		mtx_enter(&m_extref_mtx);
> -		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
> -			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
> -			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
> -		}
> -		mtx_leave(&m_extref_mtx);
> +	if (ISSET(m->m_flags, M_EXT) &&
> +	    m->m_ext.ext_free_fn == m_extfree_refs_fn) {
> +		struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> +		/*
> +		 * this variable only transitions in one direction,
> +		 * so if there is a race it will be toward the same
> +		 * result and therefore there is no loss.
> +		 */
> +
> +		mrefs->zero = 1;
>  		return;
>  	}
>  
> @@ -1525,9 +1561,7 @@ m_print(void *v,
>  		    m->m_ext.ext_buf, m->m_ext.ext_size);
>  		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
>  		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
> -		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
> -		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
> -
> +		/* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
>  	}
>  }
>  
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.269 mbuf.h
> --- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
> +++ sys/mbuf.h	11 Feb 2026 01:29:41 -0000
> @@ -145,8 +145,6 @@ struct mbuf_ext {
>  	void	*ext_arg;
>  	u_int	ext_free_fn;		/* index of free function */
>  	u_int	ext_size;		/* size of buffer, for ext_free_fn */
> -	struct mbuf *ext_nextref;
> -	struct mbuf *ext_prevref;
>  #ifdef DEBUG
>  	const char *ext_ofile;
>  	const char *ext_nfile;
> @@ -282,13 +280,22 @@ struct mbuf {
>  #define MCLREFDEBUGO(m, file, line)
>  #endif
>  
> -#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
> +int	m_ext_refs_shared(struct mbuf *);
>  
> -#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
> +static inline int
> +m_extreferenced(struct mbuf *m)
> +{
> +	extern u_int m_extfree_refs_fn;
> +
> +	if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
> +		return (0);
> +
> +	return (m_ext_refs_shared(m));
> +}
> +
> +#define	MCLISREFERENCED(m)	m_extreferenced(m)
>  
>  #define	MCLINITREFERENCE(m)	do {					\
> -		(m)->m_ext.ext_prevref = (m);				\
> -		(m)->m_ext.ext_nextref = (m);				\
>  		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
>  		MCLREFDEBUGN((m), NULL, 0);				\
>  	} while (/* CONSTCOND */ 0)
> @@ -439,7 +446,6 @@ int	m_leadingspace(struct mbuf *);
>  int	m_trailingspace(struct mbuf *);
>  void	m_align(struct mbuf *, int);
>  struct mbuf *m_clget(struct mbuf *, int, u_int);
> -void	m_extref(struct mbuf *, struct mbuf *);
>  void	m_pool_init(struct pool *, u_int, u_int, const char *);
>  u_int	m_pool_used(void);
>  void	m_extfree_pool(caddr_t, u_int, void *);
> 
>