From: Alexander Bluhm Subject: Re: mbuf cluster m_extref_mtx contention To: David Gwynne Cc: tech@openbsd.org Date: Wed, 11 Feb 2026 17:42:39 -0500 On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote: > On Tue, Feb 10, 2026 at 07:21:56PM -0500, Alexander Bluhm wrote: > > On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote: > > > the approach freebsd appears to have taken here is reference counting > > > the shared cluster. one of the mbufs is picked to contain a reference > > > counter, and the other mbufs pointed at it and use it to coordinate > > > when the memory can be freed. > > > > > > the diff below implements this idea. of course, most of the complexity > > > is in handling the deferred free of the "leader" mbuf. it appears > > > to work, but i haven't pushed it very hard. > > > > Works great for me. The socket splicing test that triggers the > > problem goes from 30 to 45 GBit/sec throughput. The contention on > > m_extfree is gone. > > > > before: > > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree > > after: > > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree > > > > > id argue m_zero() has an existing bug where it fiddles with another > > > mbufs m_flags, but not all m_flags operations are serialised with that > > > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster > > > as needing zeroing instead. > > > > Good catch. > > > > > the other option i considered was to allocate something on demand to > > > hold the cluster reference count, and have it proxy the ext_free_fn > > > call. > > > > Allocating the hot path is not a good idea. We should avoid that. > > unless it's cheap enough... > > > > > Diff below is OK bluhm. But I am still testing it on more machines. > > > > bluhm > > this is the proxy ref thing if you want to give it a go. the code is a > lot simpler at least. It also works and performance is simmilar. But I like the other diff without extra allocations much more. All numbers are here, but the page is a bit messy. http://bluhm.genua.de/netlink/results/2026-02-10T01:48:41Z/netlink.html bluhm > Index: kern/uipc_mbuf.c > =================================================================== > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v > diff -u -p -r1.304 uipc_mbuf.c > --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304 > +++ kern/uipc_mbuf.c 11 Feb 2026 01:29:41 -0000 > @@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level > int max_protohdr; /* largest protocol header */ > int max_hdr; /* largest link+protocol header */ > > -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); > +struct m_ext_refs { > + void *arg; > + u_int free_fn; > + u_int zero; > + struct refcnt refs; > +}; > + > +static struct pool m_ext_refs_pool; > + > +static void m_extfree_refs(caddr_t, u_int, void *); > +u_int m_extfree_refs_fn; > + > +static int m_extref(struct mbuf *, struct mbuf *, int); > +static void m_extfree(struct mbuf *); > > -void m_extfree(struct mbuf *); > void m_zero(struct mbuf *); > > unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ > @@ -174,6 +186,8 @@ mbinit(void) > > pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0, > IPL_NET, 0, "mtagpl", NULL); > + pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE, > + IPL_NET, 0, "mextrefs", NULL); > > for (i = 0; i < nitems(mclsizes); i++) { > lowbits = mclsizes[i] & ((1 << 10) - 1); > @@ -193,6 +207,7 @@ mbinit(void) > > (void)mextfree_register(m_extfree_pool); > KASSERT(num_extfree_fns == 1); > + m_extfree_refs_fn = mextfree_register(m_extfree_refs); > } > > void > @@ -204,6 +219,7 @@ mbcpuinit(void) > > pool_cache_init(&mbpool); > pool_cache_init(&mtagpool); > + pool_cache_init(&m_ext_refs_pool); > > for (i = 0; i < nitems(mclsizes); i++) > pool_cache_init(&mclpools[i]); > @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, > pool_put(pp, buf); > } > > +int > +m_ext_refs_shared(struct mbuf *m) > +{ > + struct m_ext_refs *mrefs = m->m_ext.ext_arg; > + > + return (refcnt_shared(&mrefs->refs)); > +} > + > +static void > +m_extfree_refs(caddr_t buf, u_int size, void *arg) > +{ > + struct m_ext_refs *mrefs = arg; > + > + if (refcnt_rele(&mrefs->refs)) { > + if (mrefs->zero) > + explicit_bzero(buf, size); > + > + KASSERT(mrefs->free_fn < num_extfree_fns); > + KASSERT(mrefs->free_fn != m_extfree_refs_fn); > + > + mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg); > + > + pool_put(&m_ext_refs_pool, mrefs); > + } > +} > + > struct mbuf * > m_free(struct mbuf *m) > { > @@ -434,44 +476,33 @@ m_free(struct mbuf *m) > return (n); > } > > -void > -m_extref(struct mbuf *o, struct mbuf *n) > +static int > +m_extref(struct mbuf *m, struct mbuf *n, int how) > { > - int refs = MCLISREFERENCED(o); > - > - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR); > + struct m_ext_refs *mrefs; > > - if (refs) > - mtx_enter(&m_extref_mtx); > - n->m_ext.ext_nextref = o->m_ext.ext_nextref; > - n->m_ext.ext_prevref = o; > - o->m_ext.ext_nextref = n; > - n->m_ext.ext_nextref->m_ext.ext_prevref = n; > - if (refs) > - mtx_leave(&m_extref_mtx); > + if (m->m_ext.ext_free_fn == m_extfree_refs_fn) > + mrefs = m->m_ext.ext_arg; > + else { > + mrefs = pool_get(&m_ext_refs_pool, how); > + if (mrefs == NULL) > + return (ENOMEM); > > - MCLREFDEBUGN((n), __FILE__, __LINE__); > -} > + refcnt_init(&mrefs->refs); > + mrefs->arg = m->m_ext.ext_arg; > + mrefs->free_fn = m->m_ext.ext_free_fn; > + mrefs->zero = 0; > > -static inline u_int > -m_extunref(struct mbuf *m) > -{ > - int refs = 0; > + m->m_ext.ext_arg = mrefs; > + m->m_ext.ext_free_fn = m_extfree_refs_fn; > + } > > - if (!MCLISREFERENCED(m)) > - return (0); > + refcnt_take(&mrefs->refs); > > - mtx_enter(&m_extref_mtx); > - if (MCLISREFERENCED(m)) { > - m->m_ext.ext_nextref->m_ext.ext_prevref = > - m->m_ext.ext_prevref; > - m->m_ext.ext_prevref->m_ext.ext_nextref = > - m->m_ext.ext_nextref; > - refs = 1; > - } > - mtx_leave(&m_extref_mtx); > + MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size, > + m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs); > > - return (refs); > + return (0); > } > > /* > @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_ > return num_extfree_fns++; > } > > -void > +static void > m_extfree(struct mbuf *m) > { > - if (m_extunref(m) == 0) { > - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); > - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, > - m->m_ext.ext_size, m->m_ext.ext_arg); > - } > - > + KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); > + mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, > + m->m_ext.ext_size, m->m_ext.ext_arg); > + > m->m_flags &= ~(M_EXT|M_EXTWR); > } > > @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le > } > n->m_len = min(len, m->m_len - off); > if (m->m_flags & M_EXT) { > + if (m_extref(m, n, wait) != 0) > + goto nospace; > n->m_data = m->m_data + off; > - n->m_ext = m->m_ext; > - MCLADDREFERENCE(m, n); > } else { > n->m_data += m->m_data - > (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat); > @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w > if (m == NULL) > return (NULL); > remain = m->m_len - len; > + olen = m0->m_pkthdr.len; > if (m0->m_flags & M_PKTHDR) { > MGETHDR(n, wait, m0->m_type); > if (n == NULL) > @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w > return (NULL); > } > n->m_pkthdr.len -= len0; > - olen = m0->m_pkthdr.len; > m0->m_pkthdr.len = len0; > if (remain == 0) { > n->m_next = m->m_next; > @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w > return (NULL); > } > if (m->m_flags & M_EXT) { > - n->m_ext = m->m_ext; > - MCLADDREFERENCE(m, n); > + if (m_extref(m, n, wait) != 0) { > + m_freem(n); > + m0->m_pkthdr.len = olen; > + return (NULL); > + } > n->m_data = m->m_data + len; > } else { > m_align(n, remain); > @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off) > void > m_zero(struct mbuf *m) > { > - if (M_READONLY(m)) { > - mtx_enter(&m_extref_mtx); > - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) { > - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE; > - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE; > - } > - mtx_leave(&m_extref_mtx); > + if (ISSET(m->m_flags, M_EXT) && > + m->m_ext.ext_free_fn == m_extfree_refs_fn) { > + struct m_ext_refs *mrefs = m->m_ext.ext_arg; > + > + /* > + * this variable only transitions in one direction, > + * so if there is a race it will be toward the same > + * result and therefore there is no loss. > + */ > + > + mrefs->zero = 1; > return; > } > > @@ -1525,9 +1561,7 @@ m_print(void *v, > m->m_ext.ext_buf, m->m_ext.ext_size); > (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n", > m->m_ext.ext_free_fn, m->m_ext.ext_arg); > - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n", > - m->m_ext.ext_nextref, m->m_ext.ext_prevref); > - > + /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */ > } > } > > Index: sys/mbuf.h > =================================================================== > RCS file: /cvs/src/sys/sys/mbuf.h,v > diff -u -p -r1.269 mbuf.h > --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269 > +++ sys/mbuf.h 11 Feb 2026 01:29:41 -0000 > @@ -145,8 +145,6 @@ struct mbuf_ext { > void *ext_arg; > u_int ext_free_fn; /* index of free function */ > u_int ext_size; /* size of buffer, for ext_free_fn */ > - struct mbuf *ext_nextref; > - struct mbuf *ext_prevref; > #ifdef DEBUG > const char *ext_ofile; > const char *ext_nfile; > @@ -282,13 +280,22 @@ struct mbuf { > #define MCLREFDEBUGO(m, file, line) > #endif > > -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m)) > +int m_ext_refs_shared(struct mbuf *); > > -#define MCLADDREFERENCE(o, n) m_extref((o), (n)) > +static inline int > +m_extreferenced(struct mbuf *m) > +{ > + extern u_int m_extfree_refs_fn; > + > + if (m->m_ext.ext_free_fn != m_extfree_refs_fn) > + return (0); > + > + return (m_ext_refs_shared(m)); > +} > + > +#define MCLISREFERENCED(m) m_extreferenced(m) > > #define MCLINITREFERENCE(m) do { \ > - (m)->m_ext.ext_prevref = (m); \ > - (m)->m_ext.ext_nextref = (m); \ > MCLREFDEBUGO((m), __FILE__, __LINE__); \ > MCLREFDEBUGN((m), NULL, 0); \ > } while (/* CONSTCOND */ 0) > @@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *); > int m_trailingspace(struct mbuf *); > void m_align(struct mbuf *, int); > struct mbuf *m_clget(struct mbuf *, int, u_int); > -void m_extref(struct mbuf *, struct mbuf *); > void m_pool_init(struct pool *, u_int, u_int, const char *); > u_int m_pool_used(void); > void m_extfree_pool(caddr_t, u_int, void *); > >