mbuf cluster m_extref_mtx contention
On Mon, Feb 09, 2026 at 08:56:58AM +0100, Claudio Jeker wrote:
> Can we please just remove this stupid mutex and replace this linked circle
> queue with a lock free implementation? We really don't need yet another
> cache of zombie mbufs.
>
> There are many ways to do that, all of which will perform much better.
> There is very little contention on the actual mbuf headers so some CAS
> based loop should perform very well.
it's not that simple. the mutex also makes it safe to dereference and
modify the memory in sibling mbufs in the queue. concurrent m_frees of
mbufs sharing a cluster aren't safe unless you provide a mechanism
to account for the references they hold against each other. currently,
participating mbufs have to take the mutex to remove themselves,
which effectively acts as a barrier before an m_free operation.
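for reference, this is the part the mutex currently serialises in
m_extunref (abridged from the code the diff below removes); without it,
concurrent frees of two siblings would be rewriting each other's
next/prev pointers:

	mtx_enter(&m_extref_mtx);
	if (MCLISREFERENCED(m)) {
		/*
		 * unlink m from the ring of mbufs sharing the cluster,
		 * which touches the siblings on either side of it.
		 */
		m->m_ext.ext_nextref->m_ext.ext_prevref =
		    m->m_ext.ext_prevref;
		m->m_ext.ext_prevref->m_ext.ext_nextref =
		    m->m_ext.ext_nextref;
		refs = 1;
	}
	mtx_leave(&m_extref_mtx);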
>
> On Sun, Feb 08, 2026 at 10:32:29AM -0500, Alexander Bluhm wrote:
> > Hi,
> >
> > In some of my tests, especially with socket splicing, I see contention
> > on the m_extref_mtx mutex. It is a global lock to list mbufs that
> > point to the same external cluster.
> >
> > This happens when freeing mbuf chains in the TX path of the driver
> > while the TCP stack still has a reference to the cluster for possible
> > retransmits. This diff avoids grabbing the mutex for each m_freem().
> > Instead the mbufs are cached in a per CPU mbuf list. When 50 mbufs
> > have been collected, the mutex is taken once to free all of them.
> >
> > The number 50 is arbitrary, it means at most 4 MB of memory per CPU
> > may be wasted. Should I add an #ifdef MULTIPROCESSOR ?
>
> 4MB per CPU on a system with 80 cpus is 320MB. That is a lot of memory.
> Especially since there is the problem that when the mbuf memory usage
> reaches around 90% the system will fail various socket calls.
>
> > opinions? ok?
i agree that another layer of caching is a bad idea.
the approach freebsd appears to have taken here is reference counting
the shared cluster. one of the mbufs is picked to contain a reference
counter, and the other mbufs point at it and use it to coordinate
when the memory can be freed.
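ignoring the deferred free handling for a moment, the core of the
scheme is roughly this (the helper names here are made up, the field
names are as in the diff below):

static void
m_ext_ref_take(struct mbuf *m, struct mbuf *n)
{
	struct mbuf *mr = m->m_ext.ext_refm;

	if (mr == NULL) {
		/* first share: m becomes the leader, counting m and n */
		mr = m;
		m->m_ext.ext_refs = 2;
		m->m_ext.ext_refm = mr;
	} else
		atomic_inc_int(&mr->m_ext.ext_refs);

	n->m_ext.ext_refm = mr;
}

static int
m_ext_ref_rele(struct mbuf *m)
{
	struct mbuf *mr = m->m_ext.ext_refm;

	if (mr == NULL)
		return (1);	/* never shared, caller frees the cluster */

	/* zero means we just dropped the last reference to the cluster */
	return (atomic_dec_int_nv(&mr->m_ext.ext_refs) == 0);
}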
the diff below implements this idea. of course, most of the complexity
is in handling the deferred free of the "leader" mbuf. it appears
to work, but i haven't pushed it very hard.
i'd argue m_zero() has an existing bug where it fiddles with another
mbuf's m_flags, but not all m_flags operations are serialised with that
mutex. this diff (ab)uses spare bits in ext_free_fn to mark a shared cluster
as needing zeroing instead.
the other option i considered was to allocate something on demand to
hold the cluster reference count, and have it proxy the ext_free_fn
call.
struct m_ext_ref {
	void *arg;
	u_int free_fn;
	u_int zero;
	struct refcnt refs;
};
void
m_ext_ref_free(caddr_t buf, u_int size, void *arg)
{
	struct m_ext_ref *mref = arg;

	if (refcnt_rele(&mref->refs)) {
		if (mref->zero)
			explicit_bzero(buf, size);
		mextfree_fns[mref->free_fn](buf, size, mref->arg);
		pool_put(&mextrefs, mref);
	}
}
m_extref would do something like this:
int
m_extref(struct mbuf *m, struct mbuf *n, int how)
{
	struct m_ext_ref *mref;

	if (m->m_ext.ext_free_fn == m_ext_ref_free_fn)
		mref = m->m_ext.ext_arg;
	else {
		mref = pool_get(&mextrefs, how);
		if (mref == NULL)
			return (ENOMEM);

		refcnt_init(&mref->refs);
		mref->zero = 0;
		mref->arg = m->m_ext.ext_arg;
		mref->free_fn = m->m_ext.ext_free_fn;

		m->m_ext.ext_arg = mref;
		m->m_ext.ext_free_fn = m_ext_ref_free_fn;
	}

	refcnt_take(&mref->refs);

	MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
	    m->m_flags & M_EXTWR, m_ext_ref_free_fn, mref);

	return (0);
}
of course the mextrefs pool would have to have per cpu caches enabled
and align entries to cachelines to get the benefit.
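the setup would look something like this. it's only a sketch:
m_ext_ref_free_fn is just a variable holding the index handed back by
mextfree_register, and the alignment and ipl values are guesses:

struct pool mextrefs;
u_int m_ext_ref_free_fn;

void
m_ext_ref_init(void)
{
	/*
	 * cacheline-aligned items so leaders on different cpus don't
	 * false-share, and per-cpu caches so pool_get/pool_put stay
	 * off the global pool mutex.
	 */
	pool_init(&mextrefs, sizeof(struct m_ext_ref), CACHELINESIZE,
	    IPL_NET, 0, "mextref", NULL);
	pool_cache_init(&mextrefs);

	m_ext_ref_free_fn = mextfree_register(m_ext_ref_free);
}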
Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
diff -u -p -r1.304 uipc_mbuf.c
--- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304
+++ kern/uipc_mbuf.c 10 Feb 2026 06:34:04 -0000
@@ -123,9 +123,8 @@ int max_linkhdr; /* largest link-level
int max_protohdr; /* largest protocol header */
int max_hdr; /* largest link+protocol header */
-struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
+static int m_extfree(struct mbuf *);
-void m_extfree(struct mbuf *);
void m_zero(struct mbuf *);
unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */
@@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator =
static void (*mextfree_fns[4])(caddr_t, u_int, void *);
static u_int num_extfree_fns;
+#define MEXT_FREE_ZERO 0x80000000
+
#define M_DATABUF(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \
(m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat)
#define M_SIZE(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \
@@ -403,6 +404,7 @@ struct mbuf *
m_free(struct mbuf *m)
{
struct mbuf *n;
+ int refs = 0;
int s;
if (m == NULL)
@@ -427,51 +429,47 @@ m_free(struct mbuf *m)
#endif /* NPF > 0 */
}
if (m->m_flags & M_EXT)
- m_extfree(m);
+ refs = m_extfree(m);
- pool_put(&mbpool, m);
+ if (refs == 0)
+ pool_put(&mbpool, m);
return (n);
}
void
-m_extref(struct mbuf *o, struct mbuf *n)
+m_extref(struct mbuf *m, struct mbuf *n)
{
- int refs = MCLISREFERENCED(o);
-
- n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
+ struct mbuf *mr;
- if (refs)
- mtx_enter(&m_extref_mtx);
- n->m_ext.ext_nextref = o->m_ext.ext_nextref;
- n->m_ext.ext_prevref = o;
- o->m_ext.ext_nextref = n;
- n->m_ext.ext_nextref->m_ext.ext_prevref = n;
- if (refs)
- mtx_leave(&m_extref_mtx);
-
- MCLREFDEBUGN((n), __FILE__, __LINE__);
-}
+ n->m_flags |= m->m_flags & (M_EXT|M_EXTWR);
+ n->m_ext = m->m_ext;
-static inline u_int
-m_extunref(struct mbuf *m)
-{
- int refs = 0;
+ /*
+ * point all the mbufs sharing a cluster at one of them which is
+ * the "leader" via the ext_refm pointer. an mbuf can
+ * identify itself as the leader if ext_refm points at itself.
+ *
+ * the default state for mbufs with clusters is to not set
+ * ext_refm so it can avoid doing the reference counting.
+ *
+ * if an mbuf becomes a leader, it can't be freed until all
+ * the other mbufs referring to it have been freed. similarly,
+ * the cluster can't be removed from a leader until the other
+ * references are dropped.
+ */
- if (!MCLISREFERENCED(m))
- return (0);
+ mr = m->m_ext.ext_refm;
+ if (mr == NULL) {
+ mr = m;
+ m->m_ext.ext_refs = 2;
+ m->m_ext.ext_refm = mr; /* mark m as the leader */
+ } else
+ atomic_inc_int(&mr->m_ext.ext_refs);
- mtx_enter(&m_extref_mtx);
- if (MCLISREFERENCED(m)) {
- m->m_ext.ext_nextref->m_ext.ext_prevref =
- m->m_ext.ext_prevref;
- m->m_ext.ext_prevref->m_ext.ext_nextref =
- m->m_ext.ext_nextref;
- refs = 1;
- }
- mtx_leave(&m_extref_mtx);
+ n->m_ext.ext_refm = mr; /* point n at the leader */
- return (refs);
+ MCLREFDEBUGN((n), __FILE__, __LINE__);
}
/*
@@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_
return num_extfree_fns++;
}
-void
+static int
m_extfree(struct mbuf *m)
{
- if (m_extunref(m) == 0) {
- KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
- mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
- m->m_ext.ext_size, m->m_ext.ext_arg);
+ struct mbuf *mr;
+ unsigned int free_fn;
+ unsigned int zero = 0;
+
+ mr = m->m_ext.ext_refm;
+ if (mr == NULL)
+ mr = m;
+ else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) {
+ /*
+ * the cluster is still referenced by another mbuf.
+ * the caller can't free this mbuf if it is the
+ * leader of a shared cluster.
+ */
+ return (m == mr);
}
- m->m_flags &= ~(M_EXT|M_EXTWR);
+ free_fn = mr->m_ext.ext_free_fn;
+ if (ISSET(free_fn, MEXT_FREE_ZERO)) {
+ zero = 1;
+ CLR(free_fn, MEXT_FREE_ZERO);
+ }
+
+ KASSERTMSG(free_fn < num_extfree_fns,
+ "mbuf %p ext_free_fn %u >= num_extfree_fns %u",
+ mr, free_fn, num_extfree_fns);
+
+ if (zero)
+ explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size);
+
+ mextfree_fns[free_fn](mr->m_ext.ext_buf,
+ mr->m_ext.ext_size, m->m_ext.ext_arg);
+
+ if (mr != m)
+ pool_put(&mbpool, mr);
+
+ return (0);
}
struct mbuf *
@@ -524,9 +551,8 @@ m_purge(struct mbuf *m)
}
/*
- * mbuf chain defragmenter. This function uses some evil tricks to defragment
- * an mbuf chain into a single buffer without changing the mbuf pointer.
- * This needs to know a lot of the mbuf internals to make this work.
+ * mbuf chain defragmenter. This function moves the data to single mbuf
+ * without changing the mbuf pointer.
* The resulting mbuf is not aligned to IP header to assist DMA transfers.
*/
int
@@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how)
KASSERT(m->m_flags & M_PKTHDR);
mbstat_inc(mbs_defrag_alloc);
- if ((m0 = m_gethdr(how, m->m_type)) == NULL)
+
+ if ((m0 = m_get(how, m->m_type)) == NULL)
return (ENOBUFS);
- if (m->m_pkthdr.len > MHLEN) {
+ if (m->m_pkthdr.len > MLEN) {
MCLGETL(m0, how, m->m_pkthdr.len);
if (!(m0->m_flags & M_EXT)) {
m_free(m0);
@@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how)
}
}
m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t));
- m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len;
+ m0->m_len = m->m_pkthdr.len;
- /* free chain behind and possible ext buf on the first mbuf */
m_freem(m->m_next);
- m->m_next = NULL;
- if (m->m_flags & M_EXT)
- m_extfree(m);
-
- /*
- * Bounce copy mbuf over to the original mbuf and set everything up.
- * This needs to reset or clear all pointers that may go into the
- * original mbuf chain.
- */
- if (m0->m_flags & M_EXT) {
- memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext));
- MCLINITREFERENCE(m);
- m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR);
- m->m_data = m->m_ext.ext_buf;
- } else {
- m->m_data = m->m_pktdat;
- memcpy(m->m_data, m0->m_data, m0->m_len);
- }
- m->m_pkthdr.len = m->m_len = m0->m_len;
-
- m0->m_flags &= ~(M_EXT|M_EXTWR); /* cluster is gone */
- m_free(m0);
+ m->m_data = NULL;
+ m->m_len = 0;
+ m->m_next = m0;
return (0);
}
@@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le
}
n->m_len = min(len, m->m_len - off);
if (m->m_flags & M_EXT) {
+ m_extref(m, n);
n->m_data = m->m_data + off;
- n->m_ext = m->m_ext;
- MCLADDREFERENCE(m, n);
} else {
n->m_data += m->m_data -
(m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
@@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w
return (NULL);
}
if (m->m_flags & M_EXT) {
- n->m_ext = m->m_ext;
- MCLADDREFERENCE(m, n);
+ m_extref(m, n);
n->m_data = m->m_data + len;
} else {
m_align(n, remain);
@@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off)
void
m_zero(struct mbuf *m)
{
- if (M_READONLY(m)) {
- mtx_enter(&m_extref_mtx);
- if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
- m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
- m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
- }
- mtx_leave(&m_extref_mtx);
+ struct mbuf *mr;
+
+ if (ISSET(m->m_flags, M_EXT) &&
+ (mr = m->m_ext.ext_refm) != NULL) {
+ /*
+ * this variable only transitions in one direction,
+ * so if there is a race it will be toward the same
+ * result and therefore there is no loss.
+ */
+ mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO;
return;
}
@@ -1525,9 +1533,8 @@ m_print(void *v,
m->m_ext.ext_buf, m->m_ext.ext_size);
(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
m->m_ext.ext_free_fn, m->m_ext.ext_arg);
- (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
- m->m_ext.ext_nextref, m->m_ext.ext_prevref);
-
+ (*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n",
+ m->m_ext.ext_refm, m->m_ext.ext_refs);
}
}
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.269 mbuf.h
--- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269
+++ sys/mbuf.h 10 Feb 2026 06:34:04 -0000
@@ -36,6 +36,7 @@
#define _SYS_MBUF_H_
#include <sys/queue.h>
+#include <sys/atomic.h>
/*
* Constants related to network buffer management.
@@ -145,8 +146,8 @@ struct mbuf_ext {
void *ext_arg;
u_int ext_free_fn; /* index of free function */
u_int ext_size; /* size of buffer, for ext_free_fn */
- struct mbuf *ext_nextref;
- struct mbuf *ext_prevref;
+ struct mbuf *ext_refm; /* mbuf with ext_refs */
+ u_int ext_refs; /* number of refs via ext_refm */
#ifdef DEBUG
const char *ext_ofile;
const char *ext_nfile;
@@ -282,13 +283,23 @@ struct mbuf {
#define MCLREFDEBUGO(m, file, line)
#endif
-#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m))
+static inline int
+m_extreferenced(struct mbuf *m)
+{
+ struct mbuf *mr;
+
+ mr = m->m_ext.ext_refm;
+ if (mr == NULL)
+ return (0);
-#define MCLADDREFERENCE(o, n) m_extref((o), (n))
+ return (atomic_load_int(&mr->m_ext.ext_refs) > 1);
+}
+
+#define MCLISREFERENCED(m) m_extreferenced(m)
#define MCLINITREFERENCE(m) do { \
- (m)->m_ext.ext_prevref = (m); \
- (m)->m_ext.ext_nextref = (m); \
+ (m)->m_ext.ext_refm = NULL; \
+ (m)->m_ext.ext_refs = 0; \
MCLREFDEBUGO((m), __FILE__, __LINE__); \
MCLREFDEBUGN((m), NULL, 0); \
} while (/* CONSTCOND */ 0)
@@ -546,8 +557,6 @@ unsigned int ml_hdatalen(struct mbuf_li
/*
* mbuf queues
*/
-
-#include <sys/atomic.h>
#define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \
{ MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 }