Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
Alexander Bluhm <alexander.bluhm@gmx.net>
Cc:
tech@openbsd.org
Date:
Wed, 11 Feb 2026 12:26:15 +1000

Download raw body.

Thread
On Tue, Feb 10, 2026 at 07:21:56PM -0500, Alexander Bluhm wrote:
> On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
> > the approach freebsd appears to have taken here is reference counting
> > the shared cluster. one of the mbufs is picked to contain a reference
> > counter, and the other mbufs pointed at it and use it to coordinate
> > when the memory can be freed.
> > 
> > the diff below implements this idea. of course, most of the complexity
> > is in handling the deferred free of the "leader" mbuf. it appears
> > to work, but i haven't pushed it very hard.
> 
> Works great for me.  The socket splicing test that triggers the
> problem goes from 30 to 45 GBit/sec throughput.  The contention on
> m_extfree is gone.
> 
> before:
> http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> after:
> http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> 
> > id argue m_zero() has an existing bug where it fiddles with another
> > mbufs m_flags, but not all m_flags operations are serialised with that
> > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster
> > as needing zeroing instead.
> 
> Good catch.
> 
> > the other option i considered was to allocate something on demand to
> > hold the cluster reference count, and have it proxy the ext_free_fn
> > call.
> 
> Allocating the hot path is not a good idea.  We should avoid that.

unless it's cheap enough...

> 
> Diff below is OK bluhm.  But I am still testing it on more machines.
> 
> bluhm

this is the proxy ref thing if you want to give it a go. the code is a
lot simpler at least.

Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
diff -u -p -r1.304 uipc_mbuf.c
--- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
+++ kern/uipc_mbuf.c	11 Feb 2026 01:29:41 -0000
@@ -123,9 +123,21 @@ int max_linkhdr;		/* largest link-level 
 int max_protohdr;		/* largest protocol header */
 int max_hdr;			/* largest link+protocol header */
 
-struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
+struct m_ext_refs {
+	void		*arg;
+	u_int		 free_fn;
+	u_int		 zero;
+	struct refcnt	 refs;
+};
+
+static struct pool	m_ext_refs_pool;
+
+static void		m_extfree_refs(caddr_t, u_int, void *);
+u_int			m_extfree_refs_fn;
+
+static int		m_extref(struct mbuf *, struct mbuf *, int);
+static void		m_extfree(struct mbuf *);
 
-void	m_extfree(struct mbuf *);
 void	m_zero(struct mbuf *);
 
 unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
@@ -174,6 +186,8 @@ mbinit(void)
 
 	pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
 	    IPL_NET, 0, "mtagpl", NULL);
+	pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
+	    IPL_NET, 0, "mextrefs", NULL);
 
 	for (i = 0; i < nitems(mclsizes); i++) {
 		lowbits = mclsizes[i] & ((1 << 10) - 1);
@@ -193,6 +207,7 @@ mbinit(void)
 
 	(void)mextfree_register(m_extfree_pool);
 	KASSERT(num_extfree_fns == 1);
+	m_extfree_refs_fn = mextfree_register(m_extfree_refs);
 }
 
 void
@@ -204,6 +219,7 @@ mbcpuinit(void)
 
 	pool_cache_init(&mbpool);
 	pool_cache_init(&mtagpool);
+	pool_cache_init(&m_ext_refs_pool);
 
 	for (i = 0; i < nitems(mclsizes); i++)
 		pool_cache_init(&mclpools[i]);
@@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, 
 	pool_put(pp, buf);
 }
 
+int
+m_ext_refs_shared(struct mbuf *m)
+{
+	struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+	return (refcnt_shared(&mrefs->refs));
+}
+
+static void
+m_extfree_refs(caddr_t buf, u_int size, void *arg)
+{
+	struct m_ext_refs *mrefs = arg;
+
+	if (refcnt_rele(&mrefs->refs)) {
+		if (mrefs->zero)
+			explicit_bzero(buf, size);
+
+		KASSERT(mrefs->free_fn < num_extfree_fns);
+		KASSERT(mrefs->free_fn != m_extfree_refs_fn);
+
+		mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
+
+		pool_put(&m_ext_refs_pool, mrefs);
+	}
+}
+
 struct mbuf *
 m_free(struct mbuf *m)
 {
@@ -434,44 +476,33 @@ m_free(struct mbuf *m)
 	return (n);
 }
 
-void
-m_extref(struct mbuf *o, struct mbuf *n)
+static int
+m_extref(struct mbuf *m, struct mbuf *n, int how)
 {
-	int refs = MCLISREFERENCED(o);
-
-	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
+	struct m_ext_refs *mrefs;
 
-	if (refs)
-		mtx_enter(&m_extref_mtx);
-	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
-	n->m_ext.ext_prevref = o;
-	o->m_ext.ext_nextref = n;
-	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
-	if (refs)
-		mtx_leave(&m_extref_mtx);
+	if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
+		mrefs = m->m_ext.ext_arg;
+	else {
+		mrefs = pool_get(&m_ext_refs_pool, how);
+		if (mrefs == NULL)
+			return (ENOMEM);
 
-	MCLREFDEBUGN((n), __FILE__, __LINE__);
-}
+		refcnt_init(&mrefs->refs);
+		mrefs->arg = m->m_ext.ext_arg;
+		mrefs->free_fn = m->m_ext.ext_free_fn;
+		mrefs->zero = 0;
 
-static inline u_int
-m_extunref(struct mbuf *m)
-{
-	int refs = 0;
+		m->m_ext.ext_arg = mrefs;
+		m->m_ext.ext_free_fn = m_extfree_refs_fn;
+	}
 
-	if (!MCLISREFERENCED(m))
-		return (0);
+	refcnt_take(&mrefs->refs);
 
-	mtx_enter(&m_extref_mtx);
-	if (MCLISREFERENCED(m)) {
-		m->m_ext.ext_nextref->m_ext.ext_prevref =
-		    m->m_ext.ext_prevref;
-		m->m_ext.ext_prevref->m_ext.ext_nextref =
-		    m->m_ext.ext_nextref;
-		refs = 1;
-	}
-	mtx_leave(&m_extref_mtx);
+	MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
+	    m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
 
-	return (refs);
+	return (0);
 }
 
 /*
@@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
 	return num_extfree_fns++;
 }
 
-void
+static void
 m_extfree(struct mbuf *m)
 {
-	if (m_extunref(m) == 0) {
-		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
-		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
-		    m->m_ext.ext_size, m->m_ext.ext_arg);
-	}
-
+	KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
+	mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
+	    m->m_ext.ext_size, m->m_ext.ext_arg);
+ 
 	m->m_flags &= ~(M_EXT|M_EXTWR);
 }
 
@@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
+			if (m_extref(m, n, wait) != 0)
+				goto nospace;
 			n->m_data = m->m_data + off;
-			n->m_ext = m->m_ext;
-			MCLADDREFERENCE(m, n);
 		} else {
 			n->m_data += m->m_data -
 			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
@@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w
 	if (m == NULL)
 		return (NULL);
 	remain = m->m_len - len;
+	olen = m0->m_pkthdr.len;
 	if (m0->m_flags & M_PKTHDR) {
 		MGETHDR(n, wait, m0->m_type);
 		if (n == NULL)
@@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w
 			return (NULL);
 		}
 		n->m_pkthdr.len -= len0;
-		olen = m0->m_pkthdr.len;
 		m0->m_pkthdr.len = len0;
 		if (remain == 0) {
 			n->m_next = m->m_next;
@@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w
 			return (NULL);
 	}
 	if (m->m_flags & M_EXT) {
-		n->m_ext = m->m_ext;
-		MCLADDREFERENCE(m, n);
+		if (m_extref(m, n, wait) != 0) {
+			m_freem(n);
+			m0->m_pkthdr.len = olen;
+			return (NULL);
+		}
 		n->m_data = m->m_data + len;
 	} else {
 		m_align(n, remain);
@@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off)
 void
 m_zero(struct mbuf *m)
 {
-	if (M_READONLY(m)) {
-		mtx_enter(&m_extref_mtx);
-		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
-			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
-			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
-		}
-		mtx_leave(&m_extref_mtx);
+	if (ISSET(m->m_flags, M_EXT) &&
+	    m->m_ext.ext_free_fn == m_extfree_refs_fn) {
+		struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+		/*
+		 * this variable only transitions in one direction,
+		 * so if there is a race it will be toward the same
+		 * result and therefore there is no loss.
+		 */
+
+		mrefs->zero = 1;
 		return;
 	}
 
@@ -1525,9 +1561,7 @@ m_print(void *v,
 		    m->m_ext.ext_buf, m->m_ext.ext_size);
 		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
 		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
-		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
-		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
-
+		/* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
 	}
 }
 
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.269 mbuf.h
--- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
+++ sys/mbuf.h	11 Feb 2026 01:29:41 -0000
@@ -145,8 +145,6 @@ struct mbuf_ext {
 	void	*ext_arg;
 	u_int	ext_free_fn;		/* index of free function */
 	u_int	ext_size;		/* size of buffer, for ext_free_fn */
-	struct mbuf *ext_nextref;
-	struct mbuf *ext_prevref;
 #ifdef DEBUG
 	const char *ext_ofile;
 	const char *ext_nfile;
@@ -282,13 +280,22 @@ struct mbuf {
 #define MCLREFDEBUGO(m, file, line)
 #endif
 
-#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
+int	m_ext_refs_shared(struct mbuf *);
 
-#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
+static inline int
+m_extreferenced(struct mbuf *m)
+{
+	extern u_int m_extfree_refs_fn;
+
+	if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
+		return (0);
+
+	return (m_ext_refs_shared(m));
+}
+
+#define	MCLISREFERENCED(m)	m_extreferenced(m)
 
 #define	MCLINITREFERENCE(m)	do {					\
-		(m)->m_ext.ext_prevref = (m);				\
-		(m)->m_ext.ext_nextref = (m);				\
 		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
 		MCLREFDEBUGN((m), NULL, 0);				\
 	} while (/* CONSTCOND */ 0)
@@ -439,7 +446,6 @@ int	m_leadingspace(struct mbuf *);
 int	m_trailingspace(struct mbuf *);
 void	m_align(struct mbuf *, int);
 struct mbuf *m_clget(struct mbuf *, int, u_int);
-void	m_extref(struct mbuf *, struct mbuf *);
 void	m_pool_init(struct pool *, u_int, u_int, const char *);
 u_int	m_pool_used(void);
 void	m_extfree_pool(caddr_t, u_int, void *);