Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
David Gwynne <david@gwynne.id.au>
Cc:
tech@openbsd.org
Date:
Tue, 10 Feb 2026 19:21:56 -0500

Download raw body.

Thread
On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
> the approach freebsd appears to have taken here is reference counting
> the shared cluster. one of the mbufs is picked to contain a reference
> counter, and the other mbufs pointed at it and use it to coordinate
> when the memory can be freed.
> 
> the diff below implements this idea. of course, most of the complexity
> is in handling the deferred free of the "leader" mbuf. it appears
> to work, but i haven't pushed it very hard.

Works great for me.  The socket splicing test that triggers the
problem goes from 30 to 45 GBit/sec throughput.  The contention on
m_extfree is gone.

before:
http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
after:
http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree

> id argue m_zero() has an existing bug where it fiddles with another
> mbufs m_flags, but not all m_flags operations are serialised with that
> mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster
> as needing zeroing instead.

Good catch.

> the other option i considered was to allocate something on demand to
> hold the cluster reference count, and have it proxy the ext_free_fn
> call.

Allocating in the hot path is not a good idea.  We should avoid that.

Diff below is OK bluhm.  But I am still testing it on more machines.

bluhm

> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.304 uipc_mbuf.c
> --- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
> +++ kern/uipc_mbuf.c	10 Feb 2026 06:34:04 -0000
> @@ -123,9 +123,8 @@ int max_linkhdr;		/* largest link-level 
>  int max_protohdr;		/* largest protocol header */
>  int max_hdr;			/* largest link+protocol header */
>  
> -struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +static int	m_extfree(struct mbuf *);
>  
> -void	m_extfree(struct mbuf *);
>  void	m_zero(struct mbuf *);
>  
>  unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
> @@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator =
>  static void (*mextfree_fns[4])(caddr_t, u_int, void *);
>  static u_int num_extfree_fns;
>  
> +#define MEXT_FREE_ZERO	0x80000000
> +
>  #define M_DATABUF(m)	((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \
>  			(m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat)
>  #define M_SIZE(m)	((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \
> @@ -403,6 +404,7 @@ struct mbuf *
>  m_free(struct mbuf *m)
>  {
>  	struct mbuf *n;
> +	int refs = 0;
>  	int s;
>  
>  	if (m == NULL)
> @@ -427,51 +429,47 @@ m_free(struct mbuf *m)
>  #endif	/* NPF > 0 */
>  	}
>  	if (m->m_flags & M_EXT)
> -		m_extfree(m);
> +		refs = m_extfree(m);
>  
> -	pool_put(&mbpool, m);
> +	if (refs == 0)
> +		pool_put(&mbpool, m);
>  
>  	return (n);
>  }
>  
>  void
> -m_extref(struct mbuf *o, struct mbuf *n)
> +m_extref(struct mbuf *m, struct mbuf *n)
>  {
> -	int refs = MCLISREFERENCED(o);
> -
> -	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
> +	struct mbuf *mr;
>  
> -	if (refs)
> -		mtx_enter(&m_extref_mtx);
> -	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
> -	n->m_ext.ext_prevref = o;
> -	o->m_ext.ext_nextref = n;
> -	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
> -	if (refs)
> -		mtx_leave(&m_extref_mtx);
> -
> -	MCLREFDEBUGN((n), __FILE__, __LINE__);
> -}
> +	n->m_flags |= m->m_flags & (M_EXT|M_EXTWR);
> +	n->m_ext = m->m_ext;
>  
> -static inline u_int
> -m_extunref(struct mbuf *m)
> -{
> -	int refs = 0;
> +	/*
> +	 * point all the mbufs sharing a cluster at one of them which is
> +	 * the "leader" via the ext_refm pointer. an mbuf can
> +	 * identify itself as the leader if ext_refm points at itself.
> +	 *
> +	 * the default state for mbufs with clusters is to not set
> +	 * ext_refm so it can avoid doing the reference counting.
> +	 *
> +	 * if an mbuf becomes a leader, it can't be freed until all
> +	 * the other mbufs referring to it have been freed. similarly,
> +	 * the cluster can't be removed from a leader until the other
> +	 * references are dropped.
> +	 */
>  
> -	if (!MCLISREFERENCED(m))
> -		return (0);
> +	mr = m->m_ext.ext_refm;
> +	if (mr == NULL) {
> +		mr = m;
> +		m->m_ext.ext_refs = 2;
> +		m->m_ext.ext_refm = mr; /* mark m as the leader */
> +	} else
> +		atomic_inc_int(&mr->m_ext.ext_refs);
>  
> -	mtx_enter(&m_extref_mtx);
> -	if (MCLISREFERENCED(m)) {
> -		m->m_ext.ext_nextref->m_ext.ext_prevref =
> -		    m->m_ext.ext_prevref;
> -		m->m_ext.ext_prevref->m_ext.ext_nextref =
> -		    m->m_ext.ext_nextref;
> -		refs = 1;
> -	}
> -	mtx_leave(&m_extref_mtx);
> +	n->m_ext.ext_refm = mr; /* point n at the leader */
>  
> -	return (refs);
> +	MCLREFDEBUGN((n), __FILE__, __LINE__);
>  }
>  
>  /*
> @@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_
>  	return num_extfree_fns++;
>  }
>  
> -void
> +static int
>  m_extfree(struct mbuf *m)
>  {
> -	if (m_extunref(m) == 0) {
> -		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> -		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> -		    m->m_ext.ext_size, m->m_ext.ext_arg);
> +	struct mbuf *mr;
> +	unsigned int free_fn;
> +	unsigned int zero = 0;
> +
> +	mr = m->m_ext.ext_refm;
> +	if (mr == NULL)
> +		mr = m;
> +	else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) {
> +		/*
> +		 * the cluster is still referenced by another mbuf.
> +		 * the caller can't free this mbuf if it is the
> +		 * leader of a shared cluster.
> +		 */
> +		return (m == mr);
>  	}
>  
> -	m->m_flags &= ~(M_EXT|M_EXTWR);
> +	free_fn = mr->m_ext.ext_free_fn;
> +	if (ISSET(free_fn, MEXT_FREE_ZERO)) {
> +		zero = 1;
> +		CLR(free_fn, MEXT_FREE_ZERO);
> +	}
> +
> +	KASSERTMSG(free_fn < num_extfree_fns,
> +	    "mbuf %p ext_free_fn %u >= num_extfree_fns %u",
> +	    mr, free_fn, num_extfree_fns);
> +
> +	if (zero)
> +		explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size);
> +
> +	mextfree_fns[free_fn](mr->m_ext.ext_buf,
> +	    mr->m_ext.ext_size, m->m_ext.ext_arg);
> +
> +	if (mr != m)
> +		pool_put(&mbpool, mr);
> +
> +	return (0);
>  }
>  
>  struct mbuf *
> @@ -524,9 +551,8 @@ m_purge(struct mbuf *m)
>  }
>  
>  /*
> - * mbuf chain defragmenter. This function uses some evil tricks to defragment
> - * an mbuf chain into a single buffer without changing the mbuf pointer.
> - * This needs to know a lot of the mbuf internals to make this work.
> + * mbuf chain defragmenter. This function moves the data to single mbuf
> + * without changing the mbuf pointer.
>   * The resulting mbuf is not aligned to IP header to assist DMA transfers.
>   */
>  int
> @@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how)
>  	KASSERT(m->m_flags & M_PKTHDR);
>  
>  	mbstat_inc(mbs_defrag_alloc);
> -	if ((m0 = m_gethdr(how, m->m_type)) == NULL)
> +
> +	if ((m0 = m_get(how, m->m_type)) == NULL)
>  		return (ENOBUFS);
> -	if (m->m_pkthdr.len > MHLEN) {
> +	if (m->m_pkthdr.len > MLEN) {
>  		MCLGETL(m0, how, m->m_pkthdr.len);
>  		if (!(m0->m_flags & M_EXT)) {
>  			m_free(m0);
> @@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how)
>  		}
>  	}
>  	m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t));
> -	m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len;
> +	m0->m_len = m->m_pkthdr.len;
>  
> -	/* free chain behind and possible ext buf on the first mbuf */
>  	m_freem(m->m_next);
> -	m->m_next = NULL;
> -	if (m->m_flags & M_EXT)
> -		m_extfree(m);
> -
> -	/*
> -	 * Bounce copy mbuf over to the original mbuf and set everything up.
> -	 * This needs to reset or clear all pointers that may go into the
> -	 * original mbuf chain.
> -	 */
> -	if (m0->m_flags & M_EXT) {
> -		memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext));
> -		MCLINITREFERENCE(m);
> -		m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR);
> -		m->m_data = m->m_ext.ext_buf;
> -	} else {
> -		m->m_data = m->m_pktdat;
> -		memcpy(m->m_data, m0->m_data, m0->m_len);
> -	}
> -	m->m_pkthdr.len = m->m_len = m0->m_len;
> -
> -	m0->m_flags &= ~(M_EXT|M_EXTWR);	/* cluster is gone */
> -	m_free(m0);
> +	m->m_data = NULL;
> +	m->m_len = 0;
> +	m->m_next = m0;
>  
>  	return (0);
>  }
> @@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le
>  		}
>  		n->m_len = min(len, m->m_len - off);
>  		if (m->m_flags & M_EXT) {
> +			m_extref(m, n);
>  			n->m_data = m->m_data + off;
> -			n->m_ext = m->m_ext;
> -			MCLADDREFERENCE(m, n);
>  		} else {
>  			n->m_data += m->m_data -
>  			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
> @@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w
>  			return (NULL);
>  	}
>  	if (m->m_flags & M_EXT) {
> -		n->m_ext = m->m_ext;
> -		MCLADDREFERENCE(m, n);
> +		m_extref(m, n);
>  		n->m_data = m->m_data + len;
>  	} else {
>  		m_align(n, remain);
> @@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off)
>  void
>  m_zero(struct mbuf *m)
>  {
> -	if (M_READONLY(m)) {
> -		mtx_enter(&m_extref_mtx);
> -		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
> -			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
> -			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
> -		}
> -		mtx_leave(&m_extref_mtx);
> +	struct mbuf *mr;
> +
> +	if (ISSET(m->m_flags, M_EXT) &&
> +	    (mr = m->m_ext.ext_refm) != NULL) {
> +		/*
> +		 * this variable only transitions in one direction,
> +		 * so if there is a race it will be toward the same
> +		 * result and therefore there is no loss.
> +		 */
> +		mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO;
>  		return;
>  	}
>  
> @@ -1525,9 +1533,8 @@ m_print(void *v,
>  		    m->m_ext.ext_buf, m->m_ext.ext_size);
>  		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
>  		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
> -		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
> -		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
> -
> +		(*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n",
> +		    m->m_ext.ext_refm, m->m_ext.ext_refs);
>  	}
>  }
>  
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.269 mbuf.h
> --- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
> +++ sys/mbuf.h	10 Feb 2026 06:34:04 -0000
> @@ -36,6 +36,7 @@
>  #define _SYS_MBUF_H_
>  
>  #include <sys/queue.h>
> +#include <sys/atomic.h>
>  
>  /*
>   * Constants related to network buffer management.
> @@ -145,8 +146,8 @@ struct mbuf_ext {
>  	void	*ext_arg;
>  	u_int	ext_free_fn;		/* index of free function */
>  	u_int	ext_size;		/* size of buffer, for ext_free_fn */
> -	struct mbuf *ext_nextref;
> -	struct mbuf *ext_prevref;
> +	struct mbuf *ext_refm;		/* mbuf with ext_refs */
> +	u_int	ext_refs;		/* number of refs via ext_refm */
>  #ifdef DEBUG
>  	const char *ext_ofile;
>  	const char *ext_nfile;
> @@ -282,13 +283,23 @@ struct mbuf {
>  #define MCLREFDEBUGO(m, file, line)
>  #endif
>  
> -#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
> +static inline int
> +m_extreferenced(struct mbuf *m)
> +{
> +	struct mbuf *mr;
> +
> +	mr = m->m_ext.ext_refm;
> +	if (mr == NULL)
> +		return (0);
>  
> -#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
> +	return (atomic_load_int(&mr->m_ext.ext_refs) > 1);
> +}
> +
> +#define	MCLISREFERENCED(m)	m_extreferenced(m)
>  
>  #define	MCLINITREFERENCE(m)	do {					\
> -		(m)->m_ext.ext_prevref = (m);				\
> -		(m)->m_ext.ext_nextref = (m);				\
> +		(m)->m_ext.ext_refm = NULL;				\
> +		(m)->m_ext.ext_refs = 0;				\
>  		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
>  		MCLREFDEBUGN((m), NULL, 0);				\
>  	} while (/* CONSTCOND */ 0)
> @@ -546,8 +557,6 @@ unsigned int		ml_hdatalen(struct mbuf_li
>  /*
>   * mbuf queues
>   */
> -
> -#include <sys/atomic.h>
>  
>  #define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \
>      { MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 }
> 
>