Download raw body.
mbuf cluster m_extref_mtx contention
On Tue, Feb 10, 2026 at 07:21:56PM -0500, Alexander Bluhm wrote:
> On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
> > the approach freebsd appears to have taken here is reference counting
> > the shared cluster. one of the mbufs is picked to contain a reference
> > counter, and the other mbufs pointed at it and use it to coordinate
> > when the memory can be freed.
> >
> > the diff below implements this idea. of course, most of the complexity
> > is in handling the deferred free of the "leader" mbuf. it appears
> > to work, but i haven't pushed it very hard.
>
> Works great for me. The socket splicing test that triggers the
> problem goes from 30 to 45 GBit/sec throughput. The contention on
> m_extfree is gone.
>
> before:
> http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> after:
> http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
>
> > id argue m_zero() has an existing bug where it fiddles with another
> > mbufs m_flags, but not all m_flags operations are serialised with that
> > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster
> > as needing zeroing instead.
>
> Good catch.
>
> > the other option i considered was to allocate something on demand to
> > hold the cluster reference count, and have it proxy the ext_free_fn
> > call.
>
> Allocating the hot path is not a good idea. We should avoid that.
unless it's cheap enough...
>
> Diff below is OK bluhm. But I am still testing it on more machines.
>
> bluhm
this is the proxy ref thing if you want to give it a go. the code is a
lot simpler at least.
Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
diff -u -p -r1.304 uipc_mbuf.c
--- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304
+++ kern/uipc_mbuf.c 11 Feb 2026 01:29:41 -0000
@@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level
int max_protohdr; /* largest protocol header */
int max_hdr; /* largest link+protocol header */
-struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
+struct m_ext_refs {
+ void *arg;
+ u_int free_fn;
+ u_int zero;
+ struct refcnt refs;
+};
+
+static struct pool m_ext_refs_pool;
+
+static void m_extfree_refs(caddr_t, u_int, void *);
+u_int m_extfree_refs_fn;
+
+static int m_extref(struct mbuf *, struct mbuf *, int);
+static void m_extfree(struct mbuf *);
-void m_extfree(struct mbuf *);
void m_zero(struct mbuf *);
unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */
@@ -174,6 +186,8 @@ mbinit(void)
pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
IPL_NET, 0, "mtagpl", NULL);
+ pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
+ IPL_NET, 0, "mextrefs", NULL);
for (i = 0; i < nitems(mclsizes); i++) {
lowbits = mclsizes[i] & ((1 << 10) - 1);
@@ -193,6 +207,7 @@ mbinit(void)
(void)mextfree_register(m_extfree_pool);
KASSERT(num_extfree_fns == 1);
+ m_extfree_refs_fn = mextfree_register(m_extfree_refs);
}
void
@@ -204,6 +219,7 @@ mbcpuinit(void)
pool_cache_init(&mbpool);
pool_cache_init(&mtagpool);
+ pool_cache_init(&m_ext_refs_pool);
for (i = 0; i < nitems(mclsizes); i++)
pool_cache_init(&mclpools[i]);
@@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size,
pool_put(pp, buf);
}
+int
+m_ext_refs_shared(struct mbuf *m)
+{
+ struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+ return (refcnt_shared(&mrefs->refs));
+}
+
+static void
+m_extfree_refs(caddr_t buf, u_int size, void *arg)
+{
+ struct m_ext_refs *mrefs = arg;
+
+ if (refcnt_rele(&mrefs->refs)) {
+ if (mrefs->zero)
+ explicit_bzero(buf, size);
+
+ KASSERT(mrefs->free_fn < num_extfree_fns);
+ KASSERT(mrefs->free_fn != m_extfree_refs_fn);
+
+ mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
+
+ pool_put(&m_ext_refs_pool, mrefs);
+ }
+}
+
struct mbuf *
m_free(struct mbuf *m)
{
@@ -434,44 +476,33 @@ m_free(struct mbuf *m)
return (n);
}
-void
-m_extref(struct mbuf *o, struct mbuf *n)
+static int
+m_extref(struct mbuf *m, struct mbuf *n, int how)
{
- int refs = MCLISREFERENCED(o);
-
- n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
+ struct m_ext_refs *mrefs;
- if (refs)
- mtx_enter(&m_extref_mtx);
- n->m_ext.ext_nextref = o->m_ext.ext_nextref;
- n->m_ext.ext_prevref = o;
- o->m_ext.ext_nextref = n;
- n->m_ext.ext_nextref->m_ext.ext_prevref = n;
- if (refs)
- mtx_leave(&m_extref_mtx);
+ if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
+ mrefs = m->m_ext.ext_arg;
+ else {
+ mrefs = pool_get(&m_ext_refs_pool, how);
+ if (mrefs == NULL)
+ return (ENOMEM);
- MCLREFDEBUGN((n), __FILE__, __LINE__);
-}
+ refcnt_init(&mrefs->refs);
+ mrefs->arg = m->m_ext.ext_arg;
+ mrefs->free_fn = m->m_ext.ext_free_fn;
+ mrefs->zero = 0;
-static inline u_int
-m_extunref(struct mbuf *m)
-{
- int refs = 0;
+ m->m_ext.ext_arg = mrefs;
+ m->m_ext.ext_free_fn = m_extfree_refs_fn;
+ }
- if (!MCLISREFERENCED(m))
- return (0);
+ refcnt_take(&mrefs->refs);
- mtx_enter(&m_extref_mtx);
- if (MCLISREFERENCED(m)) {
- m->m_ext.ext_nextref->m_ext.ext_prevref =
- m->m_ext.ext_prevref;
- m->m_ext.ext_prevref->m_ext.ext_nextref =
- m->m_ext.ext_nextref;
- refs = 1;
- }
- mtx_leave(&m_extref_mtx);
+ MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
+ m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
- return (refs);
+ return (0);
}
/*
@@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
return num_extfree_fns++;
}
-void
+static void
m_extfree(struct mbuf *m)
{
- if (m_extunref(m) == 0) {
- KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
- mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
- m->m_ext.ext_size, m->m_ext.ext_arg);
- }
-
+ KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
+ mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
+ m->m_ext.ext_size, m->m_ext.ext_arg);
+
m->m_flags &= ~(M_EXT|M_EXTWR);
}
@@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
}
n->m_len = min(len, m->m_len - off);
if (m->m_flags & M_EXT) {
+ if (m_extref(m, n, wait) != 0)
+ goto nospace;
n->m_data = m->m_data + off;
- n->m_ext = m->m_ext;
- MCLADDREFERENCE(m, n);
} else {
n->m_data += m->m_data -
(m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
@@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w
if (m == NULL)
return (NULL);
remain = m->m_len - len;
+ olen = m0->m_pkthdr.len;
if (m0->m_flags & M_PKTHDR) {
MGETHDR(n, wait, m0->m_type);
if (n == NULL)
@@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w
return (NULL);
}
n->m_pkthdr.len -= len0;
- olen = m0->m_pkthdr.len;
m0->m_pkthdr.len = len0;
if (remain == 0) {
n->m_next = m->m_next;
@@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w
return (NULL);
}
if (m->m_flags & M_EXT) {
- n->m_ext = m->m_ext;
- MCLADDREFERENCE(m, n);
+ if (m_extref(m, n, wait) != 0) {
+ m_freem(n);
+ m0->m_pkthdr.len = olen;
+ return (NULL);
+ }
n->m_data = m->m_data + len;
} else {
m_align(n, remain);
@@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off)
void
m_zero(struct mbuf *m)
{
- if (M_READONLY(m)) {
- mtx_enter(&m_extref_mtx);
- if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
- m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
- m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
- }
- mtx_leave(&m_extref_mtx);
+ if (ISSET(m->m_flags, M_EXT) &&
+ m->m_ext.ext_free_fn == m_extfree_refs_fn) {
+ struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+ /*
+ * this variable only transitions in one direction,
+ * so if there is a race it will be toward the same
+ * result and therefore there is no loss.
+ */
+
+ mrefs->zero = 1;
return;
}
@@ -1525,9 +1561,7 @@ m_print(void *v,
m->m_ext.ext_buf, m->m_ext.ext_size);
(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
m->m_ext.ext_free_fn, m->m_ext.ext_arg);
- (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
- m->m_ext.ext_nextref, m->m_ext.ext_prevref);
-
+ /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
}
}
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.269 mbuf.h
--- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269
+++ sys/mbuf.h 11 Feb 2026 01:29:41 -0000
@@ -145,8 +145,6 @@ struct mbuf_ext {
void *ext_arg;
u_int ext_free_fn; /* index of free function */
u_int ext_size; /* size of buffer, for ext_free_fn */
- struct mbuf *ext_nextref;
- struct mbuf *ext_prevref;
#ifdef DEBUG
const char *ext_ofile;
const char *ext_nfile;
@@ -282,13 +280,22 @@ struct mbuf {
#define MCLREFDEBUGO(m, file, line)
#endif
-#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m))
+int m_ext_refs_shared(struct mbuf *);
-#define MCLADDREFERENCE(o, n) m_extref((o), (n))
+static inline int
+m_extreferenced(struct mbuf *m)
+{
+ extern u_int m_extfree_refs_fn;
+
+ if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
+ return (0);
+
+ return (m_ext_refs_shared(m));
+}
+
+#define MCLISREFERENCED(m) m_extreferenced(m)
#define MCLINITREFERENCE(m) do { \
- (m)->m_ext.ext_prevref = (m); \
- (m)->m_ext.ext_nextref = (m); \
MCLREFDEBUGO((m), __FILE__, __LINE__); \
MCLREFDEBUGN((m), NULL, 0); \
} while (/* CONSTCOND */ 0)
@@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *);
int m_trailingspace(struct mbuf *);
void m_align(struct mbuf *, int);
struct mbuf *m_clget(struct mbuf *, int, u_int);
-void m_extref(struct mbuf *, struct mbuf *);
void m_pool_init(struct pool *, u_int, u_int, const char *);
u_int m_pool_used(void);
void m_extfree_pool(caddr_t, u_int, void *);
mbuf cluster m_extref_mtx contention