From: David Gwynne Subject: Re: mbuf cluster m_extref_mtx contention To: Alexander Bluhm Cc: tech@openbsd.org Date: Sat, 14 Mar 2026 10:25:33 +1000 On Wed, Feb 25, 2026 at 11:02:59PM +0100, Alexander Bluhm wrote: > Hi David, > > I have tested your diff again and throughput in my splicing test > goes from 35 GBit/sec to 65 GBit/sec. Could you please commit it, > so I can work on the next bottleneck. > > OK bluhm@ i'm much more comfortable with the proxy version instead of the refcount on a leader version. the leader version adds a constraint where the leader can't detach a cluster until all references are dropped. the consequence of this is immediately demonstrated in m_defrag, and it required auditing the rest of the kernel to make sure that nowhere else removed or replaced a cluster on an mbuf. it feels like a big footgun that's going to hurt someone in the future. delaying the actual free of the leader mbuf is ugly too. i'd prefer to give up a little bit of performance in exchange for less dangerous and ugly code. 
> > On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote: > > Index: kern/uipc_mbuf.c > > =================================================================== > > RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v > > diff -u -p -r1.304 uipc_mbuf.c > > --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304 > > +++ kern/uipc_mbuf.c 10 Feb 2026 06:34:04 -0000 > > @@ -123,9 +123,8 @@ int max_linkhdr; /* largest link-level > > int max_protohdr; /* largest protocol header */ > > int max_hdr; /* largest link+protocol header */ > > > > -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); > > +static int m_extfree(struct mbuf *); > > > > -void m_extfree(struct mbuf *); > > void m_zero(struct mbuf *); > > > > unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ > > @@ -143,6 +142,8 @@ struct pool_allocator m_pool_allocator = > > static void (*mextfree_fns[4])(caddr_t, u_int, void *); > > static u_int num_extfree_fns; > > > > +#define MEXT_FREE_ZERO 0x80000000 > > + > > #define M_DATABUF(m) ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf : \ > > (m)->m_flags & M_PKTHDR ? (m)->m_pktdat : (m)->m_dat) > > #define M_SIZE(m) ((m)->m_flags & M_EXT ? 
(m)->m_ext.ext_size : \ > > @@ -403,6 +404,7 @@ struct mbuf * > > m_free(struct mbuf *m) > > { > > struct mbuf *n; > > + int refs = 0; > > int s; > > > > if (m == NULL) > > @@ -427,51 +429,47 @@ m_free(struct mbuf *m) > > #endif /* NPF > 0 */ > > } > > if (m->m_flags & M_EXT) > > - m_extfree(m); > > + refs = m_extfree(m); > > > > - pool_put(&mbpool, m); > > + if (refs == 0) > > + pool_put(&mbpool, m); > > > > return (n); > > } > > > > void > > -m_extref(struct mbuf *o, struct mbuf *n) > > +m_extref(struct mbuf *m, struct mbuf *n) > > { > > - int refs = MCLISREFERENCED(o); > > - > > - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR); > > + struct mbuf *mr; > > > > - if (refs) > > - mtx_enter(&m_extref_mtx); > > - n->m_ext.ext_nextref = o->m_ext.ext_nextref; > > - n->m_ext.ext_prevref = o; > > - o->m_ext.ext_nextref = n; > > - n->m_ext.ext_nextref->m_ext.ext_prevref = n; > > - if (refs) > > - mtx_leave(&m_extref_mtx); > > - > > - MCLREFDEBUGN((n), __FILE__, __LINE__); > > -} > > + n->m_flags |= m->m_flags & (M_EXT|M_EXTWR); > > + n->m_ext = m->m_ext; > > > > -static inline u_int > > -m_extunref(struct mbuf *m) > > -{ > > - int refs = 0; > > + /* > > + * point all the mbufs sharing a cluster at one of them which is > > + * the "leader" via the ext_refm pointer. an mbuf can > > + * identify itself as the leader if ext_refm points at itself. > > + * > > + * the default state for mbufs with clusters is to not set > > + * ext_refm so it can avoid doing the reference counting. > > + * > > + * if an mbuf becomes a leader, it can't be freed until all > > + * the other mbufs referring to it have been freed. similarly, > > + * the cluster can't be removed from a leader until the other > > + * references are dropped. 
> > + */ > > > > - if (!MCLISREFERENCED(m)) > > - return (0); > > + mr = m->m_ext.ext_refm; > > + if (mr == NULL) { > > + mr = m; > > + m->m_ext.ext_refs = 2; > > + m->m_ext.ext_refm = mr; /* mark m as the leader */ > > + } else > > + atomic_inc_int(&mr->m_ext.ext_refs); > > > > - mtx_enter(&m_extref_mtx); > > - if (MCLISREFERENCED(m)) { > > - m->m_ext.ext_nextref->m_ext.ext_prevref = > > - m->m_ext.ext_prevref; > > - m->m_ext.ext_prevref->m_ext.ext_nextref = > > - m->m_ext.ext_nextref; > > - refs = 1; > > - } > > - mtx_leave(&m_extref_mtx); > > + n->m_ext.ext_refm = mr; /* point n at the leader */ > > > > - return (refs); > > + MCLREFDEBUGN((n), __FILE__, __LINE__); > > } > > > > /* > > @@ -487,16 +485,45 @@ mextfree_register(void (*fn)(caddr_t, u_ > > return num_extfree_fns++; > > } > > > > -void > > +static int > > m_extfree(struct mbuf *m) > > { > > - if (m_extunref(m) == 0) { > > - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); > > - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, > > - m->m_ext.ext_size, m->m_ext.ext_arg); > > + struct mbuf *mr; > > + unsigned int free_fn; > > + unsigned int zero = 0; > > + > > + mr = m->m_ext.ext_refm; > > + if (mr == NULL) > > + mr = m; > > + else if (atomic_dec_int_nv(&mr->m_ext.ext_refs) > 0) { > > + /* > > + * the cluster is still referenced by another mbuf. > > + * the caller can't free this mbuf if it is the > > + * leader of a shared cluster. 
> > + */ > > + return (m == mr); > > } > > > > - m->m_flags &= ~(M_EXT|M_EXTWR); > > + free_fn = mr->m_ext.ext_free_fn; > > + if (ISSET(free_fn, MEXT_FREE_ZERO)) { > > + zero = 1; > > + CLR(free_fn, MEXT_FREE_ZERO); > > + } > > + > > + KASSERTMSG(free_fn < num_extfree_fns, > > + "mbuf %p ext_free_fn %u >= num_extfree_fns %u", > > + mr, free_fn, num_extfree_fns); > > + > > + if (zero) > > + explicit_bzero(mr->m_ext.ext_buf, mr->m_ext.ext_size); > > + > > + mextfree_fns[free_fn](mr->m_ext.ext_buf, > > + mr->m_ext.ext_size, m->m_ext.ext_arg); > > + > > + if (mr != m) > > + pool_put(&mbpool, mr); > > + > > + return (0); > > } > > > > struct mbuf * > > @@ -524,9 +551,8 @@ m_purge(struct mbuf *m) > > } > > > > /* > > - * mbuf chain defragmenter. This function uses some evil tricks to defragment > > - * an mbuf chain into a single buffer without changing the mbuf pointer. > > - * This needs to know a lot of the mbuf internals to make this work. > > + * mbuf chain defragmenter. This function moves the data to single mbuf > > + * without changing the mbuf pointer. > > * The resulting mbuf is not aligned to IP header to assist DMA transfers. 
> > */ > > int > > @@ -540,9 +566,10 @@ m_defrag(struct mbuf *m, int how) > > KASSERT(m->m_flags & M_PKTHDR); > > > > mbstat_inc(mbs_defrag_alloc); > > - if ((m0 = m_gethdr(how, m->m_type)) == NULL) > > + > > + if ((m0 = m_get(how, m->m_type)) == NULL) > > return (ENOBUFS); > > - if (m->m_pkthdr.len > MHLEN) { > > + if (m->m_pkthdr.len > MLEN) { > > MCLGETL(m0, how, m->m_pkthdr.len); > > if (!(m0->m_flags & M_EXT)) { > > m_free(m0); > > @@ -550,32 +577,12 @@ m_defrag(struct mbuf *m, int how) > > } > > } > > m_copydata(m, 0, m->m_pkthdr.len, mtod(m0, caddr_t)); > > - m0->m_pkthdr.len = m0->m_len = m->m_pkthdr.len; > > + m0->m_len = m->m_pkthdr.len; > > > > - /* free chain behind and possible ext buf on the first mbuf */ > > m_freem(m->m_next); > > - m->m_next = NULL; > > - if (m->m_flags & M_EXT) > > - m_extfree(m); > > - > > - /* > > - * Bounce copy mbuf over to the original mbuf and set everything up. > > - * This needs to reset or clear all pointers that may go into the > > - * original mbuf chain. > > - */ > > - if (m0->m_flags & M_EXT) { > > - memcpy(&m->m_ext, &m0->m_ext, sizeof(struct mbuf_ext)); > > - MCLINITREFERENCE(m); > > - m->m_flags |= m0->m_flags & (M_EXT|M_EXTWR); > > - m->m_data = m->m_ext.ext_buf; > > - } else { > > - m->m_data = m->m_pktdat; > > - memcpy(m->m_data, m0->m_data, m0->m_len); > > - } > > - m->m_pkthdr.len = m->m_len = m0->m_len; > > - > > - m0->m_flags &= ~(M_EXT|M_EXTWR); /* cluster is gone */ > > - m_free(m0); > > + m->m_data = NULL; > > + m->m_len = 0; > > + m->m_next = m0; > > > > return (0); > > } > > @@ -656,9 +663,8 @@ m_copym(struct mbuf *m0, int off, int le > > } > > n->m_len = min(len, m->m_len - off); > > if (m->m_flags & M_EXT) { > > + m_extref(m, n); > > n->m_data = m->m_data + off; > > - n->m_ext = m->m_ext; > > - MCLADDREFERENCE(m, n); > > } else { > > n->m_data += m->m_data - > > (m->m_flags & M_PKTHDR ? 
m->m_pktdat : m->m_dat); > > @@ -1089,8 +1095,7 @@ m_split(struct mbuf *m0, int len0, int w > > return (NULL); > > } > > if (m->m_flags & M_EXT) { > > - n->m_ext = m->m_ext; > > - MCLADDREFERENCE(m, n); > > + m_extref(m, n); > > n->m_data = m->m_data + len; > > } else { > > m_align(n, remain); > > @@ -1271,13 +1276,16 @@ m_devget(char *buf, int totlen, int off) > > void > > m_zero(struct mbuf *m) > > { > > - if (M_READONLY(m)) { > > - mtx_enter(&m_extref_mtx); > > - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) { > > - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE; > > - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE; > > - } > > - mtx_leave(&m_extref_mtx); > > + struct mbuf *mr; > > + > > + if (ISSET(m->m_flags, M_EXT) && > > + (mr = m->m_ext.ext_refm) != NULL) { > > + /* > > + * this variable only transitions in one direction, > > + * so if there is a race it will be toward the same > > + * result and therefore there is no loss. > > + */ > > + mr->m_ext.ext_free_fn |= MEXT_FREE_ZERO; > > return; > > } > > > > @@ -1525,9 +1533,8 @@ m_print(void *v, > > m->m_ext.ext_buf, m->m_ext.ext_size); > > (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n", > > m->m_ext.ext_free_fn, m->m_ext.ext_arg); > > - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n", > > - m->m_ext.ext_nextref, m->m_ext.ext_prevref); > > - > > + (*pr)("m_ext.ext_refm: %p\tm_ext.ext_refs: %u\n", > > + m->m_ext.ext_refm, m->m_ext.ext_refs); > > } > > } > > > > Index: sys/mbuf.h > > =================================================================== > > RCS file: /cvs/src/sys/sys/mbuf.h,v > > diff -u -p -r1.269 mbuf.h > > --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269 > > +++ sys/mbuf.h 10 Feb 2026 06:34:04 -0000 > > @@ -36,6 +36,7 @@ > > #define _SYS_MBUF_H_ > > > > #include > > +#include > > > > /* > > * Constants related to network buffer management. 
> > @@ -145,8 +146,8 @@ struct mbuf_ext { > > void *ext_arg; > > u_int ext_free_fn; /* index of free function */ > > u_int ext_size; /* size of buffer, for ext_free_fn */ > > - struct mbuf *ext_nextref; > > - struct mbuf *ext_prevref; > > + struct mbuf *ext_refm; /* mbuf with ext_refs */ > > + u_int ext_refs; /* number of refs via ext_refm */ > > #ifdef DEBUG > > const char *ext_ofile; > > const char *ext_nfile; > > @@ -282,13 +283,23 @@ struct mbuf { > > #define MCLREFDEBUGO(m, file, line) > > #endif > > > > -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m)) > > +static inline int > > +m_extreferenced(struct mbuf *m) > > +{ > > + struct mbuf *mr; > > + > > + mr = m->m_ext.ext_refm; > > + if (mr == NULL) > > + return (0); > > > > -#define MCLADDREFERENCE(o, n) m_extref((o), (n)) > > + return (atomic_load_int(&mr->m_ext.ext_refs) > 1); > > +} > > + > > +#define MCLISREFERENCED(m) m_extreferenced(m) > > > > #define MCLINITREFERENCE(m) do { \ > > - (m)->m_ext.ext_prevref = (m); \ > > - (m)->m_ext.ext_nextref = (m); \ > > + (m)->m_ext.ext_refm = NULL; \ > > + (m)->m_ext.ext_refs = 0; \ > > MCLREFDEBUGO((m), __FILE__, __LINE__); \ > > MCLREFDEBUGN((m), NULL, 0); \ > > } while (/* CONSTCOND */ 0) > > @@ -546,8 +557,6 @@ unsigned int ml_hdatalen(struct mbuf_li > > /* > > * mbuf queues > > */ > > - > > -#include > > > > #define MBUF_QUEUE_INITIALIZER(_maxlen, _ipl) \ > > { MUTEX_INITIALIZER(_ipl), MBUF_LIST_INITIALIZER(), (_maxlen), 0 } > >