Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
David Gwynne <david@gwynne.id.au>
Cc:
tech@openbsd.org
Date:
Wed, 25 Feb 2026 23:02:59 +0100

Download raw body.

Thread
  • Alexander Bluhm:

    mbuf cluster m_extref_mtx contention

  • Hi David,
    
    I have tested your diff again and throughput in my splicing test
    goes from 35 GBit/sec to 65 GBit/sec.  Could you please commit it,
    so I can work on the next bottleneck.
    
    OK bluhm@
    
    On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
    > Index: kern/uipc_mbuf.c
    > ===================================================================
    > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
    > diff -u -p -r1.304 uipc_mbuf.c
    > --- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
    > +++ kern/uipc_mbuf.c	10 Feb 2026 06:34:04 -0000
    > @@ -123,9 +123,8 @@ int max_linkhdr;		/* largest link-level 
    >  int max_protohdr;		/* largest protocol header */
    >  int max_hdr;			/* largest link+protocol header */
    >  
    > -struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
    > +static int	m_extfree(struct mbuf *);
    >  
    > -void	m_extfree(struct mbuf *);
    >  void	m_zero(struct mbuf *);
    >  
    >  unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
    > @@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator =
    >  static void (*mextfree_fns[4])(caddr_t, u_int, void *);
    >  static u_int num_extfree_fns;
    >  
    > +#define MEXT_FREE_ZERO	0x80000000
    > +
    >  #define M_DATABUF(m)	((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \
    >  			(m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat)
    >  #define M_SIZE(m)	((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \
    > @@ -403,6 +404,7 @@ struct mbuf *
    >  m_free(struct mbuf *m)
    >  {
    >  	struct mbuf *n;
    > +	int refs = 0;
    >  	int s;
    >  
    >  	if (m == NULL)
    > @@ -427,51 +429,47 @@ m_free(struct mbuf *m)
    >  #endif	/* NPF > 0 */
    >  	}
    >  	if (m->m_flags & M_EXT)
    > -		m_extfree(m);
    > +		refs = m_extfree(m);
    >  
    > -	pool_put(&mbpool, m);
    > +	if (refs == 0)
    > +		pool_put(&mbpool, m);
    >  
    >  	return (n);
    >  }
    >  
    >  void
    > -m_extref(struct mbuf *o, struct mbuf *n)
    > +m_extref(struct mbuf *m, struct mbuf *n)
    >  {
    > -	int refs = MCLISREFERENCED(o);
    > -
    > -	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
    > +	struct mbuf *mr;
    >  
    > -	if (refs)
    > -		mtx_enter(&m_extref_mtx);
    > -	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
    > -	n->m_ext.ext_prevref = o;
    > -	o->m_ext.ext_nextref = n;
    > -	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
    > -	if (refs)
    > -		mtx_leave(&m_extref_mtx);
    > -
    > -	MCLREFDEBUGN((n), __FILE__, __LINE__);
    > -}
    > +	n->m_flags |= m->m_flags & (M_EXT|M_EXTWR);
    > +	n->m_ext = m->m_ext;
    >  
    > -static inline u_int
    > -m_extunref(struct mbuf *m)
    > -{
    > -	int refs = 0;
    > +	/*
    > +	 * point all the mbufs sharing a cluster at one of them which is
    > +	 * the "leader" via the ext_refm pointer. an mbuf can
    > +	 * identify itself as the leader if ext_refm points at itself.
    > +	 *
    > +	 * the default state for mbufs with clusters is to not set
    > +	 * ext_refm so it can avoid doing the reference counting.
    > +	 *
    > +	 * if an mbuf becomes a leader, it can't be freed until all
    > +	 * the other mbufs referring to it have been freed. similarly,
    > +	 * the cluster can't be removed from a leader until the other
    > +	 * references are dropped.
    > +	 */
    >  
    > -	if (!MCLISREFERENCED(m))
    > -		return (0);
    > +	mr = m->m_ext.ext_refm;
    > +	if (mr == NULL) {
    > +		mr = m;
    > +		m->m_ext.ext_refs = 2;
    > +		m->m_ext.ext_refm = mr; /* mark m as the leader */
    > +	} else
    > +		atomic_inc_int(&mr->m_ext.ext_refs);
    >  
    > -	mtx_enter(&m_extref_mtx);
    > -	if (MCLISREFERENCED(m)) {
    > -		m->m_ext.ext_nextref->m_ext.ext_prevref =
    > -		    m->m_ext.ext_prevref;
    > -		m->m_ext.ext_prevref->m_ext.ext_nextref =
    > -		    m->m_ext.ext_nextref;
    > -		refs = 1;
    > -	}
    > -	mtx_leave(&m_extref_mtx);
    > +	n->m_ext.ext_refm = mr; /* point n at the leader */
    >  
    > -	return (refs);
    > +	MCLREFDEBUGN((n), __FILE__, __LINE__);
    >  }
    >  
    >  /*
    > @@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_
    >  	return num_extfree_fns++;
    >  }
    >  
    > -void
    > +static int
    >  m_extfree(struct mbuf *m)
    >  {
    > -	if (m_extunref(m) == 0) {
    > -		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
    > -		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
    > -		    m->m_ext.ext_size, m->m_ext.ext_arg);
    > +	struct mbuf *mr;
    > +	unsigned int free_fn;
    > +	unsigned int zero = 0;
    > +
    > +	mr = m->m_ext.ext_refm;
    > +	if (mr == NULL)
    > +		mr = m;
    > +	else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) {
    > +		/*
    > +		 * the cluster is still referenced by another mbuf.
    > +		 * the caller can't free this mbuf if it is the
    > +		 * leader of a shared cluster.
    > +		 */
    > +		return (m == mr);
    >  	}
    >  
    > -	m->m_flags &= ~(M_EXT|M_EXTWR);
    > +	free_fn = mr->m_ext.ext_free_fn;
    > +	if (ISSET(free_fn, MEXT_FREE_ZERO)) {
    > +		zero = 1;
    > +		CLR(free_fn, MEXT_FREE_ZERO);
    > +	}
    > +
    > +	KASSERTMSG(free_fn < num_extfree_fns,
    > +	    "mbuf %p ext_free_fn %u >= num_extfree_fns %u",
    > +	    mr, free_fn, num_extfree_fns);
    > +
    > +	if (zero)
    > +		explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size);
    > +
    > +	mextfree_fns[free_fn](mr->m_ext.ext_buf,
    > +	    mr->m_ext.ext_size, m->m_ext.ext_arg);
    > +
    > +	if (mr != m)
    > +		pool_put(&mbpool, mr);
    > +
    > +	return (0);
    >  }
    >  
    >  struct mbuf *
    > @@ -524,9 +551,8 @@ m_purge(struct mbuf *m)
    >  }
    >  
    >  /*
    > - * mbuf chain defragmenter. This function uses some evil tricks to defragment
    > - * an mbuf chain into a single buffer without changing the mbuf pointer.
    > - * This needs to know a lot of the mbuf internals to make this work.
    > + * mbuf chain defragmenter. This function moves the data to single mbuf
    > + * without changing the mbuf pointer.
    >   * The resulting mbuf is not aligned to IP header to assist DMA transfers.
    >   */
    >  int
    > @@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how)
    >  	KASSERT(m->m_flags & M_PKTHDR);
    >  
    >  	mbstat_inc(mbs_defrag_alloc);
    > -	if ((m0 = m_gethdr(how, m->m_type)) == NULL)
    > +
    > +	if ((m0 = m_get(how, m->m_type)) == NULL)
    >  		return (ENOBUFS);
    > -	if (m->m_pkthdr.len > MHLEN) {
    > +	if (m->m_pkthdr.len > MLEN) {
    >  		MCLGETL(m0, how, m->m_pkthdr.len);
    >  		if (!(m0->m_flags & M_EXT)) {
    >  			m_free(m0);
    > @@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how)
    >  		}
    >  	}
    >  	m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t));
    > -	m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len;
    > +	m0->m_len = m->m_pkthdr.len;
    >  
    > -	/* free chain behind and possible ext buf on the first mbuf */
    >  	m_freem(m->m_next);
    > -	m->m_next = NULL;
    > -	if (m->m_flags & M_EXT)
    > -		m_extfree(m);
    > -
    > -	/*
    > -	 * Bounce copy mbuf over to the original mbuf and set everything up.
    > -	 * This needs to reset or clear all pointers that may go into the
    > -	 * original mbuf chain.
    > -	 */
    > -	if (m0->m_flags & M_EXT) {
    > -		memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext));
    > -		MCLINITREFERENCE(m);
    > -		m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR);
    > -		m->m_data = m->m_ext.ext_buf;
    > -	} else {
    > -		m->m_data = m->m_pktdat;
    > -		memcpy(m->m_data, m0->m_data, m0->m_len);
    > -	}
    > -	m->m_pkthdr.len = m->m_len = m0->m_len;
    > -
    > -	m0->m_flags &= ~(M_EXT|M_EXTWR);	/* cluster is gone */
    > -	m_free(m0);
    > +	m->m_data = NULL;
    > +	m->m_len = 0;
    > +	m->m_next = m0;
    >  
    >  	return (0);
    >  }
    > @@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le
    >  		}
    >  		n->m_len = min(len, m->m_len - off);
    >  		if (m->m_flags & M_EXT) {
    > +			m_extref(m, n);
    >  			n->m_data = m->m_data + off;
    > -			n->m_ext = m->m_ext;
    > -			MCLADDREFERENCE(m, n);
    >  		} else {
    >  			n->m_data += m->m_data -
    >  			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
    > @@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w
    >  			return (NULL);
    >  	}
    >  	if (m->m_flags & M_EXT) {
    > -		n->m_ext = m->m_ext;
    > -		MCLADDREFERENCE(m, n);
    > +		m_extref(m, n);
    >  		n->m_data = m->m_data + len;
    >  	} else {
    >  		m_align(n, remain);
    > @@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off)
    >  void
    >  m_zero(struct mbuf *m)
    >  {
    > -	if (M_READONLY(m)) {
    > -		mtx_enter(&m_extref_mtx);
    > -		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
    > -			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
    > -			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
    > -		}
    > -		mtx_leave(&m_extref_mtx);
    > +	struct mbuf *mr;
    > +
    > +	if (ISSET(m->m_flags, M_EXT) &&
    > +	    (mr = m->m_ext.ext_refm) != NULL) {
    > +		/*
    > +		 * this variable only transitions in one direction,
    > +		 * so if there is a race it will be toward the same
    > +		 * result and therefore there is no loss.
    > +		 */
    > +		mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO;
    >  		return;
    >  	}
    >  
    > @@ -1525,9 +1533,8 @@ m_print(void *v,
    >  		    m->m_ext.ext_buf, m->m_ext.ext_size);
    >  		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
    >  		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
    > -		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
    > -		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
    > -
    > +		(*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n",
    > +		    m->m_ext.ext_refm, m->m_ext.ext_refs);
    >  	}
    >  }
    >  
    > Index: sys/mbuf.h
    > ===================================================================
    > RCS file: /cvs/src/sys/sys/mbuf.h,v
    > diff -u -p -r1.269 mbuf.h
    > --- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
    > +++ sys/mbuf.h	10 Feb 2026 06:34:04 -0000
    > @@ -36,6 +36,7 @@
    >  #define _SYS_MBUF_H_
    >  
    >  #include <sys/queue.h>
    > +#include <sys/atomic.h>
    >  
    >  /*
    >   * Constants related to network buffer management.
    > @@ -145,8 +146,8 @@ struct mbuf_ext {
    >  	void	*ext_arg;
    >  	u_int	ext_free_fn;		/* index of free function */
    >  	u_int	ext_size;		/* size of buffer, for ext_free_fn */
    > -	struct mbuf *ext_nextref;
    > -	struct mbuf *ext_prevref;
    > +	struct mbuf *ext_refm;		/* mbuf with ext_refs */
    > +	u_int	ext_refs;		/* number of refs via ext_refm */
    >  #ifdef DEBUG
    >  	const char *ext_ofile;
    >  	const char *ext_nfile;
    > @@ -282,13 +283,23 @@ struct mbuf {
    >  #define MCLREFDEBUGO(m, file, line)
    >  #endif
    >  
    > -#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
    > +static inline int
    > +m_extreferenced(struct mbuf *m)
    > +{
    > +	struct mbuf *mr;
    > +
    > +	mr = m->m_ext.ext_refm;
    > +	if (mr == NULL)
    > +		return (0);
    >  
    > -#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
    > +	return (atomic_load_int(&mr->m_ext.ext_refs) > 1);
    > +}
    > +
    > +#define	MCLISREFERENCED(m)	m_extreferenced(m)
    >  
    >  #define	MCLINITREFERENCE(m)	do {					\
    > -		(m)->m_ext.ext_prevref = (m);				\
    > -		(m)->m_ext.ext_nextref = (m);				\
    > +		(m)->m_ext.ext_refm = NULL;				\
    > +		(m)->m_ext.ext_refs = 0;				\
    >  		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
    >  		MCLREFDEBUGN((m), NULL, 0);				\
    >  	} while (/* CONSTCOND */ 0)
    > @@ -546,8 +557,6 @@ unsigned int		ml_hdatalen(struct mbuf_li
    >  /*
    >   * mbuf queues
    >   */
    > -
    > -#include <sys/atomic.h>
    >  
    >  #define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \
    >      { MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 }
    > 
    
    
  • Alexander Bluhm:

    mbuf cluster m_extref_mtx contention