mbuf cluster m_extref_mtx contention
On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
> the approach freebsd appears to have taken here is reference counting
> the shared cluster. one of the mbufs is picked to contain a reference
> counter, and the other mbufs point at it and use it to coordinate
> when the memory can be freed.
>
> the diff below implements this idea. of course, most of the complexity
> is in handling the deferred free of the "leader" mbuf. it appears
> to work, but i haven't pushed it very hard.
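
(To make the leader scheme concrete, here is a minimal standalone C
model of the idea. The xmbuf type and the xm_extref()/xm_free()
helpers are made up for illustration, and C11 atomics stand in for
the kernel's atomic_inc_int()/atomic_dec_int_nv(); this is a sketch
of the technique under those assumptions, not the kernel code.)

#include <stdatomic.h>
#include <stdlib.h>

/* simplified stand-in for an mbuf with external storage */
struct xmbuf {
	char		*buf;	/* the shared cluster */
	struct xmbuf	*refm;	/* the "leader", or NULL when unshared */
	atomic_uint	 refs;	/* reference count, valid on the leader */
};

/* share m's cluster with n; the first share elects m as leader */
static void
xm_extref(struct xmbuf *m, struct xmbuf *n)
{
	struct xmbuf *mr = m->refm;

	if (mr == NULL) {
		mr = m;
		atomic_init(&m->refs, 2);	/* m itself plus n */
		m->refm = m;	/* points at itself: m is the leader */
	} else
		atomic_fetch_add(&mr->refs, 1);

	n->buf = m->buf;
	n->refm = mr;		/* n points at the leader */
}

/* drop m's reference; freeing the leader is deferred until the
 * last sharer is gone, like the deferred pool_put() in the diff */
static void
xm_free(struct xmbuf *m)
{
	struct xmbuf *mr = m->refm;

	if (mr != NULL && atomic_fetch_sub(&mr->refs, 1) > 1) {
		if (m != mr)
			free(m);	/* a non-leader can go now */
		return;			/* the leader has to wait */
	}

	free(m->buf);		/* last reference: free the cluster */
	if (mr != NULL && mr != m)
		free(mr);	/* deferred free of the leader */
	free(m);
}

int
main(void)
{
	struct xmbuf *a = calloc(1, sizeof(*a));
	struct xmbuf *b = calloc(1, sizeof(*b));

	a->buf = malloc(2048);	/* stand-in cluster */
	xm_extref(a, b);	/* share it; a becomes the leader */
	xm_free(a);		/* a's free is deferred: b still refers */
	xm_free(b);		/* frees the cluster, then the leader a */
	return (0);
}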
Works great for me. Throughput in the socket splicing test that
triggers the problem goes from 30 to 45 GBit/sec. The contention
in m_extfree() is gone.
before:
http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
after:
http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> i'd argue m_zero() has an existing bug where it fiddles with another
> mbuf's m_flags, but not all m_flags operations are serialised with that
> mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster
> as needing zeroing instead.
Good catch.
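
(The spare bits exist because ext_free_fn only ever holds a small
index into the mextfree_fns table, so the top bit is unused. A tiny
standalone sketch of the mark-and-decode follows; the mark_zero()
helper and the index value are made up, only MEXT_FREE_ZERO is from
the diff.)

#include <stdio.h>

#define MEXT_FREE_ZERO	0x80000000U	/* spare high bit of the index */

/* stand-in for the leader's ext_free_fn field; real values are tiny
 * table indexes, far below the flag bit */
static unsigned int ext_free_fn = 1;

/* hypothetical marker: the bit only transitions 0 -> 1, and nothing
 * else writes the field once the cluster is shared, so racing
 * markers converge on the same value */
static void
mark_zero(void)
{
	ext_free_fn |= MEXT_FREE_ZERO;
}

int
main(void)
{
	unsigned int free_fn;

	mark_zero();

	free_fn = ext_free_fn;
	if (free_fn & MEXT_FREE_ZERO) {
		free_fn &= ~MEXT_FREE_ZERO;	/* recover the index */
		printf("zero the cluster, then call free fn %u\n", free_fn);
	}
	return (0);
}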
> the other option i considered was to allocate something on demand to
> hold the cluster reference count, and have it proxy the ext_free_fn
> call.
Allocating in the hot path is not a good idea. We should avoid that.
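
(For reference, the rejected alternative would look roughly like the
sketch below. The ext_proxy type and proxy_create()/proxy_free() are
entirely hypothetical; the point is the malloc() that the first
m_copym() or m_split() of every shared cluster would have to pay.)

#include <stdatomic.h>
#include <stdlib.h>

/* hypothetical proxy owning the refcount and the original free fn */
struct ext_proxy {
	atomic_uint	  refs;
	void		(*orig_free)(char *, unsigned int, void *);
	void		 *orig_arg;
};

/* the allocation here is the hot-path cost: it would run on the
 * first copy of every shared cluster */
static struct ext_proxy *
proxy_create(void (*fn)(char *, unsigned int, void *), void *arg)
{
	struct ext_proxy *p;

	p = malloc(sizeof(*p));
	if (p == NULL)
		return (NULL);
	atomic_init(&p->refs, 2);
	p->orig_free = fn;
	p->orig_arg = arg;
	return (p);
}

/* installed in place of the original free function; forwards the
 * call once the last reference is dropped */
static void
proxy_free(char *buf, unsigned int size, void *arg)
{
	struct ext_proxy *p = arg;

	if (atomic_fetch_sub(&p->refs, 1) > 1)
		return;		/* cluster still shared */
	p->orig_free(buf, size, p->orig_arg);
	free(p);
}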
Diff below is OK bluhm. But I am still testing it on more machines.
bluhm
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.304 uipc_mbuf.c
> --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304
> +++ kern/uipc_mbuf.c 10 Feb 2026 06:34:04 -0000
> @@ -123,9 +123,8 @@ int max_linkhdr; /* largest link-level
> int max_protohdr; /* largest protocol header */
> int max_hdr; /* largest link+protocol header */
>
> -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +static int m_extfree(struct mbuf *);
>
> -void m_extfree(struct mbuf *);
> void m_zero(struct mbuf *);
>
> unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */
> @@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator =
> static void (*mextfree_fns[4])(caddr_t, u_int, void *);
> static u_int num_extfree_fns;
>
> +#define MEXT_FREE_ZERO 0x80000000
> +
> #define M_DATABUF(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \
> (m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat)
> #define M_SIZE(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \
> @@ -403,6 +404,7 @@ struct mbuf *
> m_free(struct mbuf *m)
> {
> struct mbuf *n;
> + int refs = 0;
> int s;
>
> if (m == NULL)
> @@ -427,51 +429,47 @@ m_free(struct mbuf *m)
> #endif /* NPF > 0 */
> }
> if (m->m_flags & M_EXT)
> - m_extfree(m);
> + refs = m_extfree(m);
>
> - pool_put(&mbpool, m);
> + if (refs == 0)
> + pool_put(&mbpool, m);
>
> return (n);
> }
>
> void
> -m_extref(struct mbuf *o, struct mbuf *n)
> +m_extref(struct mbuf *m, struct mbuf *n)
> {
> - int refs = MCLISREFERENCED(o);
> -
> - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
> + struct mbuf *mr;
>
> - if (refs)
> - mtx_enter(&m_extref_mtx);
> - n->m_ext.ext_nextref = o->m_ext.ext_nextref;
> - n->m_ext.ext_prevref = o;
> - o->m_ext.ext_nextref = n;
> - n->m_ext.ext_nextref->m_ext.ext_prevref = n;
> - if (refs)
> - mtx_leave(&m_extref_mtx);
> -
> - MCLREFDEBUGN((n), __FILE__, __LINE__);
> -}
> + n->m_flags |= m->m_flags & (M_EXT|M_EXTWR);
> + n->m_ext = m->m_ext;
>
> -static inline u_int
> -m_extunref(struct mbuf *m)
> -{
> - int refs = 0;
> + /*
> + * point all the mbufs sharing a cluster at one of them which is
> + * the "leader" via the ext_refm pointer. an mbuf can
> + * identify itself as the leader if ext_refm points at itself.
> + *
> + * the default state for mbufs with clusters is to not set
> + * ext_refm so it can avoid doing the reference counting.
> + *
> + * if an mbuf becomes a leader, it can't be freed until all
> + * the other mbufs referring to it have been freed. similarly,
> + * the cluster can't be removed from a leader until the other
> + * references are dropped.
> + */
>
> - if (!MCLISREFERENCED(m))
> - return (0);
> + mr = m->m_ext.ext_refm;
> + if (mr == NULL) {
> + mr = m;
> + m->m_ext.ext_refs = 2;
> + m->m_ext.ext_refm = mr; /* mark m as the leader */
> + } else
> + atomic_inc_int(&mr->m_ext.ext_refs);
>
> - mtx_enter(&m_extref_mtx);
> - if (MCLISREFERENCED(m)) {
> - m->m_ext.ext_nextref->m_ext.ext_prevref =
> - m->m_ext.ext_prevref;
> - m->m_ext.ext_prevref->m_ext.ext_nextref =
> - m->m_ext.ext_nextref;
> - refs = 1;
> - }
> - mtx_leave(&m_extref_mtx);
> + n->m_ext.ext_refm = mr; /* point n at the leader */
>
> - return (refs);
> + MCLREFDEBUGN((n), __FILE__, __LINE__);
> }
>
> /*
> @@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_
> return num_extfree_fns++;
> }
>
> -void
> +static int
> m_extfree(struct mbuf *m)
> {
> - if (m_extunref(m) == 0) {
> - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> - m->m_ext.ext_size, m->m_ext.ext_arg);
> + struct mbuf *mr;
> + unsigned int free_fn;
> + unsigned int zero = 0;
> +
> + mr = m->m_ext.ext_refm;
> + if (mr == NULL)
> + mr = m;
> + else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) {
> + /*
> + * the cluster is still referenced by another mbuf.
> + * the caller can't free this mbuf if it is the
> + * leader of a shared cluster.
> + */
> + return (m == mr);
> }
>
> - m->m_flags &= ~(M_EXT|M_EXTWR);
> + free_fn = mr->m_ext.ext_free_fn;
> + if (ISSET(free_fn, MEXT_FREE_ZERO)) {
> + zero = 1;
> + CLR(free_fn, MEXT_FREE_ZERO);
> + }
> +
> + KASSERTMSG(free_fn < num_extfree_fns,
> + "mbuf %p ext_free_fn %u >= num_extfree_fns %u",
> + mr, free_fn, num_extfree_fns);
> +
> + if (zero)
> + explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size);
> +
> + mextfree_fns[free_fn](mr->m_ext.ext_buf,
> + mr->m_ext.ext_size, m->m_ext.ext_arg);
> +
> + if (mr != m)
> + pool_put(&mbpool, mr);
> +
> + return (0);
> }
>
> struct mbuf *
> @@ -524,9 +551,8 @@ m_purge(struct mbuf *m)
> }
>
> /*
> - * mbuf chain defragmenter. This function uses some evil tricks to defragment
> - * an mbuf chain into a single buffer without changing the mbuf pointer.
> - * This needs to know a lot of the mbuf internals to make this work.
> + * mbuf chain defragmenter. This function moves the data to a single
> + * mbuf without changing the mbuf pointer.
> * The resulting mbuf is not aligned to IP header to assist DMA transfers.
> */
> int
> @@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how)
> KASSERT(m->m_flags & M_PKTHDR);
>
> mbstat_inc(mbs_defrag_alloc);
> - if ((m0 = m_gethdr(how, m->m_type)) == NULL)
> +
> + if ((m0 = m_get(how, m->m_type)) == NULL)
> return (ENOBUFS);
> - if (m->m_pkthdr.len > MHLEN) {
> + if (m->m_pkthdr.len > MLEN) {
> MCLGETL(m0, how, m->m_pkthdr.len);
> if (!(m0->m_flags & M_EXT)) {
> m_free(m0);
> @@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how)
> }
> }
> m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t));
> - m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len;
> + m0->m_len = m->m_pkthdr.len;
>
> - /* free chain behind and possible ext buf on the first mbuf */
> m_freem(m->m_next);
> - m->m_next = NULL;
> - if (m->m_flags & M_EXT)
> - m_extfree(m);
> -
> - /*
> - * Bounce copy mbuf over to the original mbuf and set everything up.
> - * This needs to reset or clear all pointers that may go into the
> - * original mbuf chain.
> - */
> - if (m0->m_flags & M_EXT) {
> - memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext));
> - MCLINITREFERENCE(m);
> - m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR);
> - m->m_data = m->m_ext.ext_buf;
> - } else {
> - m->m_data = m->m_pktdat;
> - memcpy(m->m_data, m0->m_data, m0->m_len);
> - }
> - m->m_pkthdr.len = m->m_len = m0->m_len;
> -
> - m0->m_flags &= ~(M_EXT|M_EXTWR); /* cluster is gone */
> - m_free(m0);
> + m->m_data = NULL;
> + m->m_len = 0;
> + m->m_next = m0;
>
> return (0);
> }
> @@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le
> }
> n->m_len = min(len, m->m_len - off);
> if (m->m_flags & M_EXT) {
> + m_extref(m, n);
> n->m_data = m->m_data + off;
> - n->m_ext = m->m_ext;
> - MCLADDREFERENCE(m, n);
> } else {
> n->m_data += m->m_data -
> (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
> @@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w
> return (NULL);
> }
> if (m->m_flags & M_EXT) {
> - n->m_ext = m->m_ext;
> - MCLADDREFERENCE(m, n);
> + m_extref(m, n);
> n->m_data = m->m_data + len;
> } else {
> m_align(n, remain);
> @@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off)
> void
> m_zero(struct mbuf *m)
> {
> - if (M_READONLY(m)) {
> - mtx_enter(&m_extref_mtx);
> - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
> - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
> - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
> - }
> - mtx_leave(&m_extref_mtx);
> + struct mbuf *mr;
> +
> + if (ISSET(m->m_flags, M_EXT) &&
> + (mr = m->m_ext.ext_refm) != NULL) {
> + /*
> + * this variable only transitions in one direction,
> + * so if there is a race it will be toward the same
> + * result and therefore there is no loss.
> + */
> + mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO;
> return;
> }
>
> @@ -1525,9 +1533,8 @@ m_print(void *v,
> m->m_ext.ext_buf, m->m_ext.ext_size);
> (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
> m->m_ext.ext_free_fn, m->m_ext.ext_arg);
> - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
> - m->m_ext.ext_nextref, m->m_ext.ext_prevref);
> -
> + (*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n",
> + m->m_ext.ext_refm, m->m_ext.ext_refs);
> }
> }
>
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.269 mbuf.h
> --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269
> +++ sys/mbuf.h 10 Feb 2026 06:34:04 -0000
> @@ -36,6 +36,7 @@
> #define _SYS_MBUF_H_
>
> #include <sys/queue.h>
> +#include <sys/atomic.h>
>
> /*
> * Constants related to network buffer management.
> @@ -145,8 +146,8 @@ struct mbuf_ext {
> void *ext_arg;
> u_int ext_free_fn; /* index of free function */
> u_int ext_size; /* size of buffer, for ext_free_fn */
> - struct mbuf *ext_nextref;
> - struct mbuf *ext_prevref;
> + struct mbuf *ext_refm; /* mbuf with ext_refs */
> + u_int ext_refs; /* number of refs via ext_refm */
> #ifdef DEBUG
> const char *ext_ofile;
> const char *ext_nfile;
> @@ -282,13 +283,23 @@ struct mbuf {
> #define MCLREFDEBUGO(m, file, line)
> #endif
>
> -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m))
> +static inline int
> +m_extreferenced(struct mbuf *m)
> +{
> + struct mbuf *mr;
> +
> + mr = m->m_ext.ext_refm;
> + if (mr == NULL)
> + return (0);
>
> -#define MCLADDREFERENCE(o, n) m_extref((o), (n))
> + return (atomic_load_int(&mr->m_ext.ext_refs) > 1);
> +}
> +
> +#define MCLISREFERENCED(m) m_extreferenced(m)
>
> #define MCLINITREFERENCE(m) do { \
> - (m)->m_ext.ext_prevref = (m); \
> - (m)->m_ext.ext_nextref = (m); \
> + (m)->m_ext.ext_refm = NULL; \
> + (m)->m_ext.ext_refs = 0; \
> MCLREFDEBUGO((m), __FILE__, __LINE__); \
> MCLREFDEBUGN((m), NULL, 0); \
> } while (/* CONSTCOND */ 0)
> @@ -546,8 +557,6 @@ unsigned int ml_hdatalen(struct mbuf_li
> /*
> * mbuf queues
> */
> -
> -#include <sys/atomic.h>
>
> #define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \
> { MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 }
>
>