From: Alexander Bluhm Subject: Re: mbuf cluster m_extref_mtx contention To: David Gwynne Cc: tech@openbsd.org Date: Tue, 10 Feb 2026 19:21:56 -0500 On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote: > the approach freebsd appears to have taken here is reference counting > the shared cluster. one of the mbufs is picked to contain a reference > counter, and the other mbufs pointed at it and use it to coordinate > when the memory can be freed. > > the diff below implements this idea. of course, most of the complexity > is in handling the deferred free of the "leader" mbuf. it appears > to work, but i haven't pushed it very hard. Works great for me. The socket splicing test that triggers the problem goes from 30 to 45 GBit/sec throughput. The contention on m_extfree is gone. before: http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree after: http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree > id argue m_zero() has an existing bug where it fiddles with another > mbufs m_flags, but not all m_flags operations are serialised with that > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster > as needing zeroing instead. Good catch. > the other option i considered was to allocate something on demand to > hold the cluster reference count, and have it proxy the ext_free_fn > call. Allocating the hot path is not a good idea. We should avoid that. Diff below is OK bluhm. But I am still testing it on more machines. bluhm > Index: kern/uipc_mbuf.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v > diff -u -p -r1.304 uipc_mbuf.c > --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304 > +++ kern/uipc_mbuf.c 10 Feb 2026 06:34:04 -0000 > @@ -123,9 +123,8 @@ int max_linkhdr; /* largest link-level > int max_protohdr; /* largest protocol header */ > int max_hdr; /* largest link+protocol header */ > > -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); > +static int m_extfree(struct mbuf *); > > -void m_extfree(struct mbuf *); > void m_zero(struct mbuf *); > > unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ > @@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator = > static void (*mextfree_fns[4])(caddr_t, u_int, void *); > static u_int num_extfree_fns; > > +#define MEXT_FREE_ZERO 0x80000000 > + > #define M_DATABUF(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \ > (m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat) > #define M_SIZE(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_size : \ > @@ -403,6 +404,7 @@ struct mbuf * > m_free(struct mbuf *m) > { > struct mbuf *n; > + int refs = 0; > int s; > > if (m == NULL) > @@ -427,51 +429,47 @@ m_free(struct mbuf *m) > #endif /* NPF > 0 */ > } > if (m->m_flags & M_EXT) > - m_extfree(m); > + refs = m_extfree(m); > > - pool_put(&mbpool, m); > + if (refs == 0) > + pool_put(&mbpool, m); > > return (n); > } > > void > -m_extref(struct mbuf *o, struct mbuf *n) > +m_extref(struct mbuf *m, struct mbuf *n) > { > - int refs = MCLISREFERENCED(o); > - > - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR); > + struct mbuf *mr; > > - if (refs) > - mtx_enter(&m_extref_mtx); > - n->m_ext.ext_nextref = o->m_ext.ext_nextref; > - n->m_ext.ext_prevref = o; > - o->m_ext.ext_nextref = n; > - n->m_ext.ext_nextref->m_ext.ext_prevref = n; > - if (refs) > - mtx_leave(&m_extref_mtx); > - > - MCLREFDEBUGN((n), __FILE__, __LINE__); > -} > + n->m_flags |= m->m_flags & (M_EXT|M_EXTWR); > + n->m_ext = m->m_ext; > > -static inline u_int > -m_extunref(struct mbuf *m) > -{ > - int refs = 0; > + /* > + * point all the mbufs sharing a cluster at one of them which is > + * the "leader" via the ext_refm pointer. an mbuf can > + * identify itself as the leader if ext_refm points at itself. > + * > + * the default state for mbufs with clusters is to not set > + * ext_refm so it can avoid doing the reference counting. > + * > + * if an mbuf becomes a leader, it can't be freed until all > + * the other mbufs referring to it have been freed. similarly, > + * the cluster can't be removed from a leader until the other > + * references are dropped. > + */ > > - if (!MCLISREFERENCED(m)) > - return (0); > + mr = m->m_ext.ext_refm; > + if (mr == NULL) { > + mr = m; > + m->m_ext.ext_refs = 2; > + m->m_ext.ext_refm = mr; /* mark m as the leader */ > + } else > + atomic_inc_int(&mr->m_ext.ext_refs); > > - mtx_enter(&m_extref_mtx); > - if (MCLISREFERENCED(m)) { > - m->m_ext.ext_nextref->m_ext.ext_prevref = > - m->m_ext.ext_prevref; > - m->m_ext.ext_prevref->m_ext.ext_nextref = > - m->m_ext.ext_nextref; > - refs = 1; > - } > - mtx_leave(&m_extref_mtx); > + n->m_ext.ext_refm = mr; /* point n at the leader */ > > - return (refs); > + MCLREFDEBUGN((n), __FILE__, __LINE__); > } > > /* > @@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_ > return num_extfree_fns++; > } > > -void > +static int > m_extfree(struct mbuf *m) > { > - if (m_extunref(m) == 0) { > - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); > - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, > - m->m_ext.ext_size, m->m_ext.ext_arg); > + struct mbuf *mr; > + unsigned int free_fn; > + unsigned int zero = 0; > + > + mr = m->m_ext.ext_refm; > + if (mr == NULL) > + mr = m; > + else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) { > + /* > + * the cluster is still referenced by another mbuf. > + * the caller can't free this mbuf if it is the > + * leader of a shared cluster. > + */ > + return (m == mr); > } > > - m->m_flags &= ~(M_EXT|M_EXTWR); > + free_fn = mr->m_ext.ext_free_fn; > + if (ISSET(free_fn, MEXT_FREE_ZERO)) { > + zero = 1; > + CLR(free_fn, MEXT_FREE_ZERO); > + } > + > + KASSERTMSG(free_fn < num_extfree_fns, > + "mbuf %p ext_free_fn %u >= num_extfree_fns %u", > + mr, free_fn, num_extfree_fns); > + > + if (zero) > + explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size); > + > + mextfree_fns[free_fn](mr->m_ext.ext_buf, > + mr->m_ext.ext_size, m->m_ext.ext_arg); > + > + if (mr != m) > + pool_put(&mbpool, mr); > + > + return (0); > } > > struct mbuf * > @@ -524,9 +551,8 @@ m_purge(struct mbuf *m) > } > > /* > - * mbuf chain defragmenter. This function uses some evil tricks to defragment > - * an mbuf chain into a single buffer without changing the mbuf pointer. > - * This needs to know a lot of the mbuf internals to make this work. > + * mbuf chain defragmenter. This function moves the data to single mbuf > + * without changing the mbuf pointer. > * The resulting mbuf is not aligned to IP header to assist DMA transfers. > */ > int > @@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how) > KASSERT(m->m_flags & M_PKTHDR); > > mbstat_inc(mbs_defrag_alloc); > - if ((m0 = m_gethdr(how, m->m_type)) == NULL) > + > + if ((m0 = m_get(how, m->m_type)) == NULL) > return (ENOBUFS); > - if (m->m_pkthdr.len > MHLEN) { > + if (m->m_pkthdr.len > MLEN) { > MCLGETL(m0, how, m->m_pkthdr.len); > if (!(m0->m_flags & M_EXT)) { > m_free(m0); > @@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how) > } > } > m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t)); > - m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len; > + m0->m_len = m->m_pkthdr.len; > > - /* free chain behind and possible ext buf on the first mbuf */ > m_freem(m->m_next); > - m->m_next = NULL; > - if (m->m_flags & M_EXT) > - m_extfree(m); > - > - /* > - * Bounce copy mbuf over to the original mbuf and set everything up. > - * This needs to reset or clear all pointers that may go into the > - * original mbuf chain. > - */ > - if (m0->m_flags & M_EXT) { > - memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext)); > - MCLINITREFERENCE(m); > - m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR); > - m->m_data = m->m_ext.ext_buf; > - } else { > - m->m_data = m->m_pktdat; > - memcpy(m->m_data, m0->m_data, m0->m_len); > - } > - m->m_pkthdr.len = m->m_len = m0->m_len; > - > - m0->m_flags &= ~(M_EXT|M_EXTWR); /* cluster is gone */ > - m_free(m0); > + m->m_data = NULL; > + m->m_len = 0; > + m->m_next = m0; > > return (0); > } > @@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le > } > n->m_len = min(len, m->m_len - off); > if (m->m_flags & M_EXT) { > + m_extref(m, n); > n->m_data = m->m_data + off; > - n->m_ext = m->m_ext; > - MCLADDREFERENCE(m, n); > } else { > n->m_data += m->m_data - > (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat); > @@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w > return (NULL); > } > if (m->m_flags & M_EXT) { > - n->m_ext = m->m_ext; > - MCLADDREFERENCE(m, n); > + m_extref(m, n); > n->m_data = m->m_data + len; > } else { > m_align(n, remain); > @@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off) > void > m_zero(struct mbuf *m) > { > - if (M_READONLY(m)) { > - mtx_enter(&m_extref_mtx); > - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) { > - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE; > - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE; > - } > - mtx_leave(&m_extref_mtx); > + struct mbuf *mr; > + > + if (ISSET(m->m_flags, M_EXT) && > + (mr = m->m_ext.ext_refm) != NULL) { > + /* > + * this variable only transitions in one direction, > + * so if there is a race it will be toward the same > + * result and therefore there is no loss. > + */ > + mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO; > return; > } > > @@ -1525,9 +1533,8 @@ m_print(void *v, > m->m_ext.ext_buf, m->m_ext.ext_size); > (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n", > m->m_ext.ext_free_fn, m->m_ext.ext_arg); > - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n", > - m->m_ext.ext_nextref, m->m_ext.ext_prevref); > - > + (*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n", > + m->m_ext.ext_refm, m->m_ext.ext_refs); > } > } > > Index: sys/mbuf.h > =================================================================== > RCS file: /cvs/src/sys/sys/mbuf.h,v > diff -u -p -r1.269 mbuf.h > --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269 > +++ sys/mbuf.h 10 Feb 2026 06:34:04 -0000 > @@ -36,6 +36,7 @@ > #define _SYS_MBUF_H_ > > #include > +#include > > /* > * Constants related to network buffer management. > @@ -145,8 +146,8 @@ struct mbuf_ext { > void *ext_arg; > u_int ext_free_fn; /* index of free function */ > u_int ext_size; /* size of buffer, for ext_free_fn */ > - struct mbuf *ext_nextref; > - struct mbuf *ext_prevref; > + struct mbuf *ext_refm; /* mbuf with ext_refs */ > + u_int ext_refs; /* number of refs via ext_refm */ > #ifdef DEBUG > const char *ext_ofile; > const char *ext_nfile; > @@ -282,13 +283,23 @@ struct mbuf { > #define MCLREFDEBUGO(m, file, line) > #endif > > -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m)) > +static inline int > +m_extreferenced(struct mbuf *m) > +{ > + struct mbuf *mr; > + > + mr = m->m_ext.ext_refm; > + if (mr == NULL) > + return (0); > > -#define MCLADDREFERENCE(o, n) m_extref((o), (n)) > + return (atomic_load_int(&mr->m_ext.ext_refs) > 1); > +} > + > +#define MCLISREFERENCED(m) m_extreferenced(m) > > #define MCLINITREFERENCE(m) do { \ > - (m)->m_ext.ext_prevref = (m); \ > - (m)->m_ext.ext_nextref = (m); \ > + (m)->m_ext.ext_refm = NULL; \ > + (m)->m_ext.ext_refs = 0; \ > MCLREFDEBUGO((m), __FILE__, __LINE__); \ > MCLREFDEBUGN((m), NULL, 0); \ > } while (/* CONSTCOND */ 0) > @@ -546,8 +557,6 @@ unsigned int ml_hdatalen(struct mbuf_li > /* > * mbuf queues > */ > - > -#include > > #define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \ > { MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 } > >