Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
tech@openbsd.org
Date:
Sat, 14 Mar 2026 10:25:33 +1000

Download raw body.

Thread
  • Alexander Bluhm:

    mbuf cluster m_extref_mtx contention

  • On Wed, Feb 25, 2026 at 11:02:59PM +0100, Alexander Bluhm wrote:
    > Hi David,
    > 
    > I habe tested your diff again and throughput in my splicing test
    > goes from 35 GBit/sec to 65 GBit/sec.  Could you please commit it,
    > so I can work on the next bottleneck.
    > 
    > OK bluhm@
    
    im much more comfortable with the proxy version instead of the
    refcount on a leader version.
    
    the leader version adds a constraint where the leader can't detach
    a cluster until all references are dropped. the consequence of this
    is immediately demonstrated in m_defrag, and it required auditing
    the rest of the kernel to make sure that nowhere else removed or
    replaced a cluster on an mbuf. it feels like a big footgun that's
    going to hurt someone in the future.
    
    delaying the actual free of the leader mbuf is ugly too.
    
    i'd prefer to give up a little bit of performance in exchange for
    less dangerous and ugly code.
    
    > 
    > On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
    > > Index: kern/uipc_mbuf.c
    > > ===================================================================
    > > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
    > > diff -u -p -r1.304 uipc_mbuf.c
    > > --- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
    > > +++ kern/uipc_mbuf.c	10 Feb 2026 06:34:04 -0000
    > > @@ -123,9 +123,8 @@ int max_linkhdr;		/* largest link-level 
    > >  int max_protohdr;		/* largest protocol header */
    > >  int max_hdr;			/* largest link+protocol header */
    > >  
    > > -struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
    > > +static int	m_extfree(struct mbuf *);
    > >  
    > > -void	m_extfree(struct mbuf *);
    > >  void	m_zero(struct mbuf *);
    > >  
    > >  unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
    > > @@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator =
    > >  static void (*mextfree_fns[4])(caddr_t, u_int, void *);
    > >  static u_int num_extfree_fns;
    > >  
    > > +#define MEXT_FREE_ZERO	0x80000000
    > > +
    > >  #define M_DATABUF(m)	((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \
    > >  			(m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat)
    > >  #define M_SIZE(m)	((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \
    > > @@ -403,6 +404,7 @@ struct mbuf *
    > >  m_free(struct mbuf *m)
    > >  {
    > >  	struct mbuf *n;
    > > +	int refs = 0;
    > >  	int s;
    > >  
    > >  	if (m == NULL)
    > > @@ -427,51 +429,47 @@ m_free(struct mbuf *m)
    > >  #endif	/* NPF > 0 */
    > >  	}
    > >  	if (m->m_flags & M_EXT)
    > > -		m_extfree(m);
    > > +		refs = m_extfree(m);
    > >  
    > > -	pool_put(&mbpool, m);
    > > +	if (refs == 0)
    > > +		pool_put(&mbpool, m);
    > >  
    > >  	return (n);
    > >  }
    > >  
    > >  void
    > > -m_extref(struct mbuf *o, struct mbuf *n)
    > > +m_extref(struct mbuf *m, struct mbuf *n)
    > >  {
    > > -	int refs = MCLISREFERENCED(o);
    > > -
    > > -	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
    > > +	struct mbuf *mr;
    > >  
    > > -	if (refs)
    > > -		mtx_enter(&m_extref_mtx);
    > > -	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
    > > -	n->m_ext.ext_prevref = o;
    > > -	o->m_ext.ext_nextref = n;
    > > -	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
    > > -	if (refs)
    > > -		mtx_leave(&m_extref_mtx);
    > > -
    > > -	MCLREFDEBUGN((n), __FILE__, __LINE__);
    > > -}
    > > +	n->m_flags |= m->m_flags & (M_EXT|M_EXTWR);
    > > +	n->m_ext = m->m_ext;
    > >  
    > > -static inline u_int
    > > -m_extunref(struct mbuf *m)
    > > -{
    > > -	int refs = 0;
    > > +	/*
    > > +	 * point all the mbufs sharing a cluster at one of them which is
    > > +	 * the "leader" via the ext_refm pointer. an mbuf can
    > > +	 * identify itself as the leader if ext_refm points at itself.
    > > +	 *
    > > +	 * the default state for mbufs with clusters is to not set
    > > +	 * ext_refm so it can avoid doing the reference counting.
    > > +	 *
    > > +	 * if an mbuf becomes a leader, it can't be freed until all
    > > +	 * the other mbufs referring to it have been freed. similarly,
    > > +	 * the cluster can't be removed from a leader until the other
    > > +	 * references are dropped.
    > > +	 */
    > >  
    > > -	if (!MCLISREFERENCED(m))
    > > -		return (0);
    > > +	mr = m->m_ext.ext_refm;
    > > +	if (mr == NULL) {
    > > +		mr = m;
    > > +		m->m_ext.ext_refs = 2;
    > > +		m->m_ext.ext_refm = mr; /* mark m as the leader */
    > > +	} else
    > > +		atomic_inc_int(&mr->m_ext.ext_refs);
    > >  
    > > -	mtx_enter(&m_extref_mtx);
    > > -	if (MCLISREFERENCED(m)) {
    > > -		m->m_ext.ext_nextref->m_ext.ext_prevref =
    > > -		    m->m_ext.ext_prevref;
    > > -		m->m_ext.ext_prevref->m_ext.ext_nextref =
    > > -		    m->m_ext.ext_nextref;
    > > -		refs = 1;
    > > -	}
    > > -	mtx_leave(&m_extref_mtx);
    > > +	n->m_ext.ext_refm = mr; /* point n at the leader */
    > >  
    > > -	return (refs);
    > > +	MCLREFDEBUGN((n), __FILE__, __LINE__);
    > >  }
    > >  
    > >  /*
    > > @@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_
    > >  	return num_extfree_fns++;
    > >  }
    > >  
    > > -void
    > > +static int
    > >  m_extfree(struct mbuf *m)
    > >  {
    > > -	if (m_extunref(m) == 0) {
    > > -		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
    > > -		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
    > > -		    m->m_ext.ext_size, m->m_ext.ext_arg);
    > > +	struct mbuf *mr;
    > > +	unsigned int free_fn;
    > > +	unsigned int zero = 0;
    > > +
    > > +	mr = m->m_ext.ext_refm;
    > > +	if (mr == NULL)
    > > +		mr = m;
    > > +	else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) {
    > > +		/*
    > > +		 * the cluster is still referenced by another mbuf.
    > > +		 * the caller can't free this mbuf if it is the
    > > +		 * leader of a shared cluster.
    > > +		 */
    > > +		return (m == mr);
    > >  	}
    > >  
    > > -	m->m_flags &= ~(M_EXT|M_EXTWR);
    > > +	free_fn = mr->m_ext.ext_free_fn;
    > > +	if (ISSET(free_fn, MEXT_FREE_ZERO)) {
    > > +		zero = 1;
    > > +		CLR(free_fn, MEXT_FREE_ZERO);
    > > +	}
    > > +
    > > +	KASSERTMSG(free_fn < num_extfree_fns,
    > > +	    "mbuf %p ext_free_fn %u >= num_extfree_fns %u",
    > > +	    mr, free_fn, num_extfree_fns);
    > > +
    > > +	if (zero)
    > > +		explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size);
    > > +
    > > +	mextfree_fns[free_fn](mr->m_ext.ext_buf,
    > > +	    mr->m_ext.ext_size, m->m_ext.ext_arg);
    > > +
    > > +	if (mr != m)
    > > +		pool_put(&mbpool, mr);
    > > +
    > > +	return (0);
    > >  }
    > >  
    > >  struct mbuf *
    > > @@ -524,9 +551,8 @@ m_purge(struct mbuf *m)
    > >  }
    > >  
    > >  /*
    > > - * mbuf chain defragmenter. This function uses some evil tricks to defragment
    > > - * an mbuf chain into a single buffer without changing the mbuf pointer.
    > > - * This needs to know a lot of the mbuf internals to make this work.
    > > + * mbuf chain defragmenter. This function moves the data to single mbuf
    > > + * without changing the mbuf pointer.
    > >   * The resulting mbuf is not aligned to IP header to assist DMA transfers.
    > >   */
    > >  int
    > > @@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how)
    > >  	KASSERT(m->m_flags & M_PKTHDR);
    > >  
    > >  	mbstat_inc(mbs_defrag_alloc);
    > > -	if ((m0 = m_gethdr(how, m->m_type)) == NULL)
    > > +
    > > +	if ((m0 = m_get(how, m->m_type)) == NULL)
    > >  		return (ENOBUFS);
    > > -	if (m->m_pkthdr.len > MHLEN) {
    > > +	if (m->m_pkthdr.len > MLEN) {
    > >  		MCLGETL(m0, how, m->m_pkthdr.len);
    > >  		if (!(m0->m_flags & M_EXT)) {
    > >  			m_free(m0);
    > > @@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how)
    > >  		}
    > >  	}
    > >  	m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t));
    > > -	m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len;
    > > +	m0->m_len = m->m_pkthdr.len;
    > >  
    > > -	/* free chain behind and possible ext buf on the first mbuf */
    > >  	m_freem(m->m_next);
    > > -	m->m_next = NULL;
    > > -	if (m->m_flags & M_EXT)
    > > -		m_extfree(m);
    > > -
    > > -	/*
    > > -	 * Bounce copy mbuf over to the original mbuf and set everything up.
    > > -	 * This needs to reset or clear all pointers that may go into the
    > > -	 * original mbuf chain.
    > > -	 */
    > > -	if (m0->m_flags & M_EXT) {
    > > -		memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext));
    > > -		MCLINITREFERENCE(m);
    > > -		m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR);
    > > -		m->m_data = m->m_ext.ext_buf;
    > > -	} else {
    > > -		m->m_data = m->m_pktdat;
    > > -		memcpy(m->m_data, m0->m_data, m0->m_len);
    > > -	}
    > > -	m->m_pkthdr.len = m->m_len = m0->m_len;
    > > -
    > > -	m0->m_flags &= ~(M_EXT|M_EXTWR);	/* cluster is gone */
    > > -	m_free(m0);
    > > +	m->m_data = NULL;
    > > +	m->m_len = 0;
    > > +	m->m_next = m0;
    > >  
    > >  	return (0);
    > >  }
    > > @@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le
    > >  		}
    > >  		n->m_len = min(len, m->m_len - off);
    > >  		if (m->m_flags & M_EXT) {
    > > +			m_extref(m, n);
    > >  			n->m_data = m->m_data + off;
    > > -			n->m_ext = m->m_ext;
    > > -			MCLADDREFERENCE(m, n);
    > >  		} else {
    > >  			n->m_data += m->m_data -
    > >  			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
    > > @@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w
    > >  			return (NULL);
    > >  	}
    > >  	if (m->m_flags & M_EXT) {
    > > -		n->m_ext = m->m_ext;
    > > -		MCLADDREFERENCE(m, n);
    > > +		m_extref(m, n);
    > >  		n->m_data = m->m_data + len;
    > >  	} else {
    > >  		m_align(n, remain);
    > > @@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off)
    > >  void
    > >  m_zero(struct mbuf *m)
    > >  {
    > > -	if (M_READONLY(m)) {
    > > -		mtx_enter(&m_extref_mtx);
    > > -		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
    > > -			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
    > > -			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
    > > -		}
    > > -		mtx_leave(&m_extref_mtx);
    > > +	struct mbuf *mr;
    > > +
    > > +	if (ISSET(m->m_flags, M_EXT) &&
    > > +	    (mr = m->m_ext.ext_refm) != NULL) {
    > > +		/*
    > > +		 * this variable only transitions in one direction,
    > > +		 * so if there is a race it will be toward the same
    > > +		 * result and therefore there is no loss.
    > > +		 */
    > > +		mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO;
    > >  		return;
    > >  	}
    > >  
    > > @@ -1525,9 +1533,8 @@ m_print(void *v,
    > >  		    m->m_ext.ext_buf, m->m_ext.ext_size);
    > >  		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
    > >  		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
    > > -		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
    > > -		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
    > > -
    > > +		(*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n",
    > > +		    m->m_ext.ext_refm, m->m_ext.ext_refs);
    > >  	}
    > >  }
    > >  
    > > Index: sys/mbuf.h
    > > ===================================================================
    > > RCS file: /cvs/src/sys/sys/mbuf.h,v
    > > diff -u -p -r1.269 mbuf.h
    > > --- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
    > > +++ sys/mbuf.h	10 Feb 2026 06:34:04 -0000
    > > @@ -36,6 +36,7 @@
    > >  #define _SYS_MBUF_H_
    > >  
    > >  #include <sys/queue.h>
    > > +#include <sys/atomic.h>
    > >  
    > >  /*
    > >   * Constants related to network buffer management.
    > > @@ -145,8 +146,8 @@ struct mbuf_ext {
    > >  	void	*ext_arg;
    > >  	u_int	ext_free_fn;		/* index of free function */
    > >  	u_int	ext_size;		/* size of buffer, for ext_free_fn */
    > > -	struct mbuf *ext_nextref;
    > > -	struct mbuf *ext_prevref;
    > > +	struct mbuf *ext_refm;		/* mbuf with ext_refs */
    > > +	u_int	ext_refs;		/* number of refs via ext_refm */
    > >  #ifdef DEBUG
    > >  	const char *ext_ofile;
    > >  	const char *ext_nfile;
    > > @@ -282,13 +283,23 @@ struct mbuf {
    > >  #define MCLREFDEBUGO(m, file, line)
    > >  #endif
    > >  
    > > -#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
    > > +static inline int
    > > +m_extreferenced(struct mbuf *m)
    > > +{
    > > +	struct mbuf *mr;
    > > +
    > > +	mr = m->m_ext.ext_refm;
    > > +	if (mr == NULL)
    > > +		return (0);
    > >  
    > > -#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
    > > +	return (atomic_load_int(&mr->m_ext.ext_refs) > 1);
    > > +}
    > > +
    > > +#define	MCLISREFERENCED(m)	m_extreferenced(m)
    > >  
    > >  #define	MCLINITREFERENCE(m)	do {					\
    > > -		(m)->m_ext.ext_prevref = (m);				\
    > > -		(m)->m_ext.ext_nextref = (m);				\
    > > +		(m)->m_ext.ext_refm = NULL;				\
    > > +		(m)->m_ext.ext_refs = 0;				\
    > >  		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
    > >  		MCLREFDEBUGN((m), NULL, 0);				\
    > >  	} while (/* CONSTCOND */ 0)
    > > @@ -546,8 +557,6 @@ unsigned int		ml_hdatalen(struct mbuf_li
    > >  /*
    > >   * mbuf queues
    > >   */
    > > -
    > > -#include <sys/atomic.h>
    > >  
    > >  #define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \
    > >      { MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 }
    > > 
    
    
  • Alexander Bluhm:

    mbuf cluster m_extref_mtx contention