Download raw body.
mbuf cluster m_extref_mtx contention
On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote:
> On Tue, Feb 10, 2026 at 07:21:56PM -0500, Alexander Bluhm wrote:
> > On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote:
> > > the approach freebsd appears to have taken here is reference counting
> > > the shared cluster. one of the mbufs is picked to contain a reference
> > > counter, and the other mbufs pointed at it and use it to coordinate
> > > when the memory can be freed.
> > >
> > > the diff below implements this idea. of course, most of the complexity
> > > is in handling the deferred free of the "leader" mbuf. it appears
> > > to work, but i haven't pushed it very hard.
> >
> > Works great for me. The socket splicing test that triggers the
> > problem goes from 30 to 45 GBit/sec throughput. The contention on
> > m_extfree is gone.
> >
> > before:
> > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> > after:
> > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree
> >
> > > id argue m_zero() has an existing bug where it fiddles with another
> > > mbufs m_flags, but not all m_flags operations are serialised with that
> > > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster
> > > as needing zeroing instead.
> >
> > Good catch.
> >
> > > the other option i considered was to allocate something on demand to
> > > hold the cluster reference count, and have it proxy the ext_free_fn
> > > call.
> >
> > Allocating the hot path is not a good idea. We should avoid that.
>
> unless it's cheap enough...
>
> >
> > Diff below is OK bluhm. But I am still testing it on more machines.
> >
> > bluhm
>
> this is the proxy ref thing if you want to give it a go. the code is a
> lot simpler at least.
It also works and performance is simmilar. But I like the other
diff without extra allocations much more.
All numbers are here, but the page is a bit messy.
http://bluhm.genua.de/netlink/results/2026-02-10T01:48:41Z/netlink.html
bluhm
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.304 uipc_mbuf.c
> --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304
> +++ kern/uipc_mbuf.c 11 Feb 2026 01:29:41 -0000
> @@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level
> int max_protohdr; /* largest protocol header */
> int max_hdr; /* largest link+protocol header */
>
> -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +struct m_ext_refs {
> + void *arg;
> + u_int free_fn;
> + u_int zero;
> + struct refcnt refs;
> +};
> +
> +static struct pool m_ext_refs_pool;
> +
> +static void m_extfree_refs(caddr_t, u_int, void *);
> +u_int m_extfree_refs_fn;
> +
> +static int m_extref(struct mbuf *, struct mbuf *, int);
> +static void m_extfree(struct mbuf *);
>
> -void m_extfree(struct mbuf *);
> void m_zero(struct mbuf *);
>
> unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */
> @@ -174,6 +186,8 @@ mbinit(void)
>
> pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
> IPL_NET, 0, "mtagpl", NULL);
> + pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
> + IPL_NET, 0, "mextrefs", NULL);
>
> for (i = 0; i < nitems(mclsizes); i++) {
> lowbits = mclsizes[i] & ((1 << 10) - 1);
> @@ -193,6 +207,7 @@ mbinit(void)
>
> (void)mextfree_register(m_extfree_pool);
> KASSERT(num_extfree_fns == 1);
> + m_extfree_refs_fn = mextfree_register(m_extfree_refs);
> }
>
> void
> @@ -204,6 +219,7 @@ mbcpuinit(void)
>
> pool_cache_init(&mbpool);
> pool_cache_init(&mtagpool);
> + pool_cache_init(&m_ext_refs_pool);
>
> for (i = 0; i < nitems(mclsizes); i++)
> pool_cache_init(&mclpools[i]);
> @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size,
> pool_put(pp, buf);
> }
>
> +int
> +m_ext_refs_shared(struct mbuf *m)
> +{
> + struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> + return (refcnt_shared(&mrefs->refs));
> +}
> +
> +static void
> +m_extfree_refs(caddr_t buf, u_int size, void *arg)
> +{
> + struct m_ext_refs *mrefs = arg;
> +
> + if (refcnt_rele(&mrefs->refs)) {
> + if (mrefs->zero)
> + explicit_bzero(buf, size);
> +
> + KASSERT(mrefs->free_fn < num_extfree_fns);
> + KASSERT(mrefs->free_fn != m_extfree_refs_fn);
> +
> + mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
> +
> + pool_put(&m_ext_refs_pool, mrefs);
> + }
> +}
> +
> struct mbuf *
> m_free(struct mbuf *m)
> {
> @@ -434,44 +476,33 @@ m_free(struct mbuf *m)
> return (n);
> }
>
> -void
> -m_extref(struct mbuf *o, struct mbuf *n)
> +static int
> +m_extref(struct mbuf *m, struct mbuf *n, int how)
> {
> - int refs = MCLISREFERENCED(o);
> -
> - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
> + struct m_ext_refs *mrefs;
>
> - if (refs)
> - mtx_enter(&m_extref_mtx);
> - n->m_ext.ext_nextref = o->m_ext.ext_nextref;
> - n->m_ext.ext_prevref = o;
> - o->m_ext.ext_nextref = n;
> - n->m_ext.ext_nextref->m_ext.ext_prevref = n;
> - if (refs)
> - mtx_leave(&m_extref_mtx);
> + if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
> + mrefs = m->m_ext.ext_arg;
> + else {
> + mrefs = pool_get(&m_ext_refs_pool, how);
> + if (mrefs == NULL)
> + return (ENOMEM);
>
> - MCLREFDEBUGN((n), __FILE__, __LINE__);
> -}
> + refcnt_init(&mrefs->refs);
> + mrefs->arg = m->m_ext.ext_arg;
> + mrefs->free_fn = m->m_ext.ext_free_fn;
> + mrefs->zero = 0;
>
> -static inline u_int
> -m_extunref(struct mbuf *m)
> -{
> - int refs = 0;
> + m->m_ext.ext_arg = mrefs;
> + m->m_ext.ext_free_fn = m_extfree_refs_fn;
> + }
>
> - if (!MCLISREFERENCED(m))
> - return (0);
> + refcnt_take(&mrefs->refs);
>
> - mtx_enter(&m_extref_mtx);
> - if (MCLISREFERENCED(m)) {
> - m->m_ext.ext_nextref->m_ext.ext_prevref =
> - m->m_ext.ext_prevref;
> - m->m_ext.ext_prevref->m_ext.ext_nextref =
> - m->m_ext.ext_nextref;
> - refs = 1;
> - }
> - mtx_leave(&m_extref_mtx);
> + MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
> + m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
>
> - return (refs);
> + return (0);
> }
>
> /*
> @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
> return num_extfree_fns++;
> }
>
> -void
> +static void
> m_extfree(struct mbuf *m)
> {
> - if (m_extunref(m) == 0) {
> - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> - m->m_ext.ext_size, m->m_ext.ext_arg);
> - }
> -
> + KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
> + mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
> + m->m_ext.ext_size, m->m_ext.ext_arg);
> +
> m->m_flags &= ~(M_EXT|M_EXTWR);
> }
>
> @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
> }
> n->m_len = min(len, m->m_len - off);
> if (m->m_flags & M_EXT) {
> + if (m_extref(m, n, wait) != 0)
> + goto nospace;
> n->m_data = m->m_data + off;
> - n->m_ext = m->m_ext;
> - MCLADDREFERENCE(m, n);
> } else {
> n->m_data += m->m_data -
> (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
> @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w
> if (m == NULL)
> return (NULL);
> remain = m->m_len - len;
> + olen = m0->m_pkthdr.len;
> if (m0->m_flags & M_PKTHDR) {
> MGETHDR(n, wait, m0->m_type);
> if (n == NULL)
> @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w
> return (NULL);
> }
> n->m_pkthdr.len -= len0;
> - olen = m0->m_pkthdr.len;
> m0->m_pkthdr.len = len0;
> if (remain == 0) {
> n->m_next = m->m_next;
> @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w
> return (NULL);
> }
> if (m->m_flags & M_EXT) {
> - n->m_ext = m->m_ext;
> - MCLADDREFERENCE(m, n);
> + if (m_extref(m, n, wait) != 0) {
> + m_freem(n);
> + m0->m_pkthdr.len = olen;
> + return (NULL);
> + }
> n->m_data = m->m_data + len;
> } else {
> m_align(n, remain);
> @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off)
> void
> m_zero(struct mbuf *m)
> {
> - if (M_READONLY(m)) {
> - mtx_enter(&m_extref_mtx);
> - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
> - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
> - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
> - }
> - mtx_leave(&m_extref_mtx);
> + if (ISSET(m->m_flags, M_EXT) &&
> + m->m_ext.ext_free_fn == m_extfree_refs_fn) {
> + struct m_ext_refs *mrefs = m->m_ext.ext_arg;
> +
> + /*
> + * this variable only transitions in one direction,
> + * so if there is a race it will be toward the same
> + * result and therefore there is no loss.
> + */
> +
> + mrefs->zero = 1;
> return;
> }
>
> @@ -1525,9 +1561,7 @@ m_print(void *v,
> m->m_ext.ext_buf, m->m_ext.ext_size);
> (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
> m->m_ext.ext_free_fn, m->m_ext.ext_arg);
> - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
> - m->m_ext.ext_nextref, m->m_ext.ext_prevref);
> -
> + /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
> }
> }
>
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> diff -u -p -r1.269 mbuf.h
> --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269
> +++ sys/mbuf.h 11 Feb 2026 01:29:41 -0000
> @@ -145,8 +145,6 @@ struct mbuf_ext {
> void *ext_arg;
> u_int ext_free_fn; /* index of free function */
> u_int ext_size; /* size of buffer, for ext_free_fn */
> - struct mbuf *ext_nextref;
> - struct mbuf *ext_prevref;
> #ifdef DEBUG
> const char *ext_ofile;
> const char *ext_nfile;
> @@ -282,13 +280,22 @@ struct mbuf {
> #define MCLREFDEBUGO(m, file, line)
> #endif
>
> -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m))
> +int m_ext_refs_shared(struct mbuf *);
>
> -#define MCLADDREFERENCE(o, n) m_extref((o), (n))
> +static inline int
> +m_extreferenced(struct mbuf *m)
> +{
> + extern u_int m_extfree_refs_fn;
> +
> + if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
> + return (0);
> +
> + return (m_ext_refs_shared(m));
> +}
> +
> +#define MCLISREFERENCED(m) m_extreferenced(m)
>
> #define MCLINITREFERENCE(m) do { \
> - (m)->m_ext.ext_prevref = (m); \
> - (m)->m_ext.ext_nextref = (m); \
> MCLREFDEBUGO((m), __FILE__, __LINE__); \
> MCLREFDEBUGN((m), NULL, 0); \
> } while (/* CONSTCOND */ 0)
> @@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *);
> int m_trailingspace(struct mbuf *);
> void m_align(struct mbuf *, int);
> struct mbuf *m_clget(struct mbuf *, int, u_int);
> -void m_extref(struct mbuf *, struct mbuf *);
> void m_pool_init(struct pool *, u_int, u_int, const char *);
> u_int m_pool_used(void);
> void m_extfree_pool(caddr_t, u_int, void *);
>
>
mbuf cluster m_extref_mtx contention