From: David Gwynne Subject: Re: mbuf cluster m_extref_mtx contention To: Alexander Bluhm Cc: tech@openbsd.org Date: Wed, 11 Feb 2026 12:26:15 +1000 On Tue, Feb 10, 2026 at 07:21:56PM -0500, Alexander Bluhm wrote: > On Tue, Feb 10, 2026 at 05:11:14PM +1000, David Gwynne wrote: > > the approach freebsd appears to have taken here is reference counting > > the shared cluster. one of the mbufs is picked to contain a reference > > counter, and the other mbufs pointed at it and use it to coordinate > > when the memory can be freed. > > > > the diff below implements this idea. of course, most of the complexity > > is in handling the deferred free of the "leader" mbuf. it appears > > to work, but i haven't pushed it very hard. > > Works great for me. The socket splicing test that triggers the > problem goes from 30 to 45 GBit/sec throughput. The contention on > m_extfree is gone. > > before: > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree > after: > http://bluhm.genua.de/netlink/results/2026-02-10T01%3A48%3A41Z/patch-dlg-mbuf-ext-refs.1/modify-none/iface-ice0/pseudo-none/btrace-kstack.0/btrace/ssh_root%40lt40_iperf3_-6_-cfdd7%3Ae83e%3A66bd%3A1011%3A%3A2_-w200k_-P15_-t10-btrace-kstack.svg?s=extfree > > > id argue m_zero() has an existing bug where it fiddles with another > > mbufs m_flags, but not all m_flags operations are serialised with that > > mutex. this (ab)uses spare bits in ext_free_fn to mark a shared cluster > > as needing zeroing instead. > > Good catch. > > > the other option i considered was to allocate something on demand to > > hold the cluster reference count, and have it proxy the ext_free_fn > > call. > > Allocating the hot path is not a good idea. We should avoid that. unless it's cheap enough... > > Diff below is OK bluhm. But I am still testing it on more machines. > > bluhm this is the proxy ref thing if you want to give it a go. the code is a lot simpler at least. Index: kern/uipc_mbuf.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v diff -u -p -r1.304 uipc_mbuf.c --- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304 +++ kern/uipc_mbuf.c 11 Feb 2026 01:29:41 -0000 @@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level int max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ -struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); +struct m_ext_refs { + void *arg; + u_int free_fn; + u_int zero; + struct refcnt refs; +}; + +static struct pool m_ext_refs_pool; + +static void m_extfree_refs(caddr_t, u_int, void *); +u_int m_extfree_refs_fn; + +static int m_extref(struct mbuf *, struct mbuf *, int); +static void m_extfree(struct mbuf *); -void m_extfree(struct mbuf *); void m_zero(struct mbuf *); unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ @@ -174,6 +186,8 @@ mbinit(void) pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0, IPL_NET, 0, "mtagpl", NULL); + pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE, + IPL_NET, 0, "mextrefs", NULL); for (i = 0; i < nitems(mclsizes); i++) { lowbits = mclsizes[i] & ((1 << 10) - 1); @@ -193,6 +207,7 @@ mbinit(void) (void)mextfree_register(m_extfree_pool); KASSERT(num_extfree_fns == 1); + m_extfree_refs_fn = mextfree_register(m_extfree_refs); } void @@ -204,6 +219,7 @@ mbcpuinit(void) pool_cache_init(&mbpool); pool_cache_init(&mtagpool); + pool_cache_init(&m_ext_refs_pool); for (i = 0; i < nitems(mclsizes); i++) pool_cache_init(&mclpools[i]); @@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, pool_put(pp, buf); } +int +m_ext_refs_shared(struct mbuf *m) +{ + struct m_ext_refs *mrefs = m->m_ext.ext_arg; + + return (refcnt_shared(&mrefs->refs)); +} + +static void +m_extfree_refs(caddr_t buf, u_int size, void *arg) +{ + struct m_ext_refs *mrefs = arg; + + if (refcnt_rele(&mrefs->refs)) { + if (mrefs->zero) + explicit_bzero(buf, size); + + KASSERT(mrefs->free_fn < num_extfree_fns); + KASSERT(mrefs->free_fn != m_extfree_refs_fn); + + mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg); + + pool_put(&m_ext_refs_pool, mrefs); + } +} + struct mbuf * m_free(struct mbuf *m) { @@ -434,44 +476,33 @@ m_free(struct mbuf *m) return (n); } -void -m_extref(struct mbuf *o, struct mbuf *n) +static int +m_extref(struct mbuf *m, struct mbuf *n, int how) { - int refs = MCLISREFERENCED(o); - - n->m_flags |= o->m_flags & (M_EXT|M_EXTWR); + struct m_ext_refs *mrefs; - if (refs) - mtx_enter(&m_extref_mtx); - n->m_ext.ext_nextref = o->m_ext.ext_nextref; - n->m_ext.ext_prevref = o; - o->m_ext.ext_nextref = n; - n->m_ext.ext_nextref->m_ext.ext_prevref = n; - if (refs) - mtx_leave(&m_extref_mtx); + if (m->m_ext.ext_free_fn == m_extfree_refs_fn) + mrefs = m->m_ext.ext_arg; + else { + mrefs = pool_get(&m_ext_refs_pool, how); + if (mrefs == NULL) + return (ENOMEM); - MCLREFDEBUGN((n), __FILE__, __LINE__); -} + refcnt_init(&mrefs->refs); + mrefs->arg = m->m_ext.ext_arg; + mrefs->free_fn = m->m_ext.ext_free_fn; + mrefs->zero = 0; -static inline u_int -m_extunref(struct mbuf *m) -{ - int refs = 0; + m->m_ext.ext_arg = mrefs; + m->m_ext.ext_free_fn = m_extfree_refs_fn; + } - if (!MCLISREFERENCED(m)) - return (0); + refcnt_take(&mrefs->refs); - mtx_enter(&m_extref_mtx); - if (MCLISREFERENCED(m)) { - m->m_ext.ext_nextref->m_ext.ext_prevref = - m->m_ext.ext_prevref; - m->m_ext.ext_prevref->m_ext.ext_nextref = - m->m_ext.ext_nextref; - refs = 1; - } - mtx_leave(&m_extref_mtx); + MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size, + m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs); - return (refs); + return (0); } /* @@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_ return num_extfree_fns++; } -void +static void m_extfree(struct mbuf *m) { - if (m_extunref(m) == 0) { - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); - mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, - m->m_ext.ext_size, m->m_ext.ext_arg); - } - + KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); + mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, + m->m_ext.ext_size, m->m_ext.ext_arg); + m->m_flags &= ~(M_EXT|M_EXTWR); } @@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { + if (m_extref(m, n, wait) != 0) + goto nospace; n->m_data = m->m_data + off; - n->m_ext = m->m_ext; - MCLADDREFERENCE(m, n); } else { n->m_data += m->m_data - (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat); @@ -1049,6 +1078,7 @@ m_split(struct mbuf *m0, int len0, int w if (m == NULL) return (NULL); remain = m->m_len - len; + olen = m0->m_pkthdr.len; if (m0->m_flags & M_PKTHDR) { MGETHDR(n, wait, m0->m_type); if (n == NULL) @@ -1058,7 +1088,6 @@ m_split(struct mbuf *m0, int len0, int w return (NULL); } n->m_pkthdr.len -= len0; - olen = m0->m_pkthdr.len; m0->m_pkthdr.len = len0; if (remain == 0) { n->m_next = m->m_next; @@ -1089,8 +1118,11 @@ m_split(struct mbuf *m0, int len0, int w return (NULL); } if (m->m_flags & M_EXT) { - n->m_ext = m->m_ext; - MCLADDREFERENCE(m, n); + if (m_extref(m, n, wait) != 0) { + m_freem(n); + m0->m_pkthdr.len = olen; + return (NULL); + } n->m_data = m->m_data + len; } else { m_align(n, remain); @@ -1271,13 +1303,17 @@ m_devget(char *buf, int totlen, int off) void m_zero(struct mbuf *m) { - if (M_READONLY(m)) { - mtx_enter(&m_extref_mtx); - if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) { - m->m_ext.ext_nextref->m_flags |= M_ZEROIZE; - m->m_ext.ext_prevref->m_flags |= M_ZEROIZE; - } - mtx_leave(&m_extref_mtx); + if (ISSET(m->m_flags, M_EXT) && + m->m_ext.ext_free_fn == m_extfree_refs_fn) { + struct m_ext_refs *mrefs = m->m_ext.ext_arg; + + /* + * this variable only transitions in one direction, + * so if there is a race it will be toward the same + * result and therefore there is no loss. + */ + + mrefs->zero = 1; return; } @@ -1525,9 +1561,7 @@ m_print(void *v, m->m_ext.ext_buf, m->m_ext.ext_size); (*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n", m->m_ext.ext_free_fn, m->m_ext.ext_arg); - (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n", - m->m_ext.ext_nextref, m->m_ext.ext_prevref); - + /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */ } } Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v diff -u -p -r1.269 mbuf.h --- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269 +++ sys/mbuf.h 11 Feb 2026 01:29:41 -0000 @@ -145,8 +145,6 @@ struct mbuf_ext { void *ext_arg; u_int ext_free_fn; /* index of free function */ u_int ext_size; /* size of buffer, for ext_free_fn */ - struct mbuf *ext_nextref; - struct mbuf *ext_prevref; #ifdef DEBUG const char *ext_ofile; const char *ext_nfile; @@ -282,13 +280,22 @@ struct mbuf { #define MCLREFDEBUGO(m, file, line) #endif -#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m)) +int m_ext_refs_shared(struct mbuf *); -#define MCLADDREFERENCE(o, n) m_extref((o), (n)) +static inline int +m_extreferenced(struct mbuf *m) +{ + extern u_int m_extfree_refs_fn; + + if (m->m_ext.ext_free_fn != m_extfree_refs_fn) + return (0); + + return (m_ext_refs_shared(m)); +} + +#define MCLISREFERENCED(m) m_extreferenced(m) #define MCLINITREFERENCE(m) do { \ - (m)->m_ext.ext_prevref = (m); \ - (m)->m_ext.ext_nextref = (m); \ MCLREFDEBUGO((m), __FILE__, __LINE__); \ MCLREFDEBUGN((m), NULL, 0); \ } while (/* CONSTCOND */ 0) @@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *); int m_trailingspace(struct mbuf *); void m_align(struct mbuf *, int); struct mbuf *m_clget(struct mbuf *, int, u_int); -void m_extref(struct mbuf *, struct mbuf *); void m_pool_init(struct pool *, u_int, u_int, const char *); u_int m_pool_used(void); void m_extfree_pool(caddr_t, u_int, void *);