Index | Thread | Search

From:
David Gwynne <david@gwynne.id.au>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Wed, 22 Apr 2026 16:21:42 +1000

Download raw body.

Thread
On Mon, Mar 30, 2026 at 01:56:38PM +0200, Alexander Bluhm wrote:
> On Tue, Mar 17, 2026 at 02:24:01PM +0100, Alexander Bluhm wrote:
> > On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote:
> > > this is the proxy ref thing if you want to give it a go. the code is a
> > > lot simpler at least.
> > 
> > I have tested both variants of the diff again and with this one I
> > got a panic.  It was the sixth repetition of my test runs, this
> > time with btrace profiling turned on.
> 
> It is reproduceable and it is triggered by btrace.

i havent spent as much time on this as i would have liked because i
havent been able to reproduce this, but there's a chance that this was
related to the fix committed in src/sys/net/pf_norm.c r1.237 for the pf
frag rb tree topology.

i also found a bug i introduced in m_split, but i havent figured out if
it could cause a crash like this. that bug is fixed in the diff below.

however, i did go through and audit the callers of mbuf functions
that create shared cluster refs, and they all appear to be doing
the right thing and propagate failure. i expected this though, cos the
affected functions can fail because of mbuf allocation issues anyway, so
the callers would already have to be robust.

> *cpu1: uvm_fault(0xffffffff82bb0cb8, 0x2414c5fe0, 0, 1) -> e
> 
> *58677  355342      0      0  7     0x14200                softnet0
> 
> ddb{1}> trace
> x86_ipi_db(ffff80005a5eaff0) at x86_ipi_db+0x16
> x86_ipi_handler() at x86_ipi_handler+0x80
> Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
> x86_bus_space_io_write_1(3f8,0,20) at x86_bus_space_io_write_1+0x1d
> comcnputc(800,20) at comcnputc+0xdc
> cnputc(20) at cnputc+0x47
> db_putchar(70) at db_putchar+0x42a
> kprintf() at kprintf+0x12ff
> db_printf(ffffffff82662513) at db_printf+0x6d
> db_ktrap(6,0,ffff8000632447a0) at db_ktrap+0x205
> kerntrap(ffff8000632447a0) at kerntrap+0xe2
> alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
> refcnt_rele(fffffd8f371b6d50) at refcnt_rele+0x78
> m_extfree_refs(fffffd8003fc7540,840,fffffd8f371b6d40) at m_extfree_refs+0x31
> m_free(fffffd808accec00) at m_free+0x170
> m_freem(fffffd8089550e00) at m_freem+0x38
> ip6_forward(fffffd8089d77900,0,2) at ip6_forward+0x9bf
> pf_refragment6(ffff800063244c00,fffffd8f080a7010,0,0,0) at pf_refragment6+0x19a
> pf_test(18,3,ffff800000393048,ffff800063244e48) at pf_test+0x83f
> ip6_forward(fffffd8089550e00,ffffffff82b77608,2) at ip6_forward+0x3ce
> ip6_input_if(ffff800063244fd8,ffff800063244fe4,29,0,ffff800000279048,ffffffff82b775d8) at ip6_input_if+0x87d
> ipv6_input(ffff800000279048,fffffd808accec00,ffffffff82b775d8) at ipv6_input+0x42
> if_input_process(ffff800000279048,ffff800063245098,0) at if_input_process+0x20a
> ifiq_process(ffff800000279480) at ifiq_process+0xa1
> taskq_thread(ffff80000002c000) at taskq_thread+0x129
> end trace frame: 0x0, count: -25
> 
> ddb{1}> x/i refcnt_rele+0x78
> refcnt_rele+0x78:       movq    0(%rsi,%rcx,8),%rsi
> 
> /home/bluhm/openbsd/cvs/src/sys/kern/kern_synch.c:933
>     1cc3:       48 63 4f 04             movslq 0x4(%rdi),%rcx
>     1cc7:       48 85 c9                test   %rcx,%rcx
>     1cca:       7e c7                   jle    1c93 <refcnt_rele+0x33>
>     1ccc:       48 8b 35 00 00 00 00    mov    0(%rip),%rsi        # 1cd3 <refcnt_rele+0x73>
>     1cd3:       48 85 f6                test   %rsi,%rsi
>     1cd6:       74 bb                   je     1c93 <refcnt_rele+0x33>
>     1cd8:       48 8b 34 ce             mov    (%rsi,%rcx,8),%rsi
> *   1cdc:       83 7e 2c 00             cmpl   $0x0,0x2c(%rsi)
>     1ce0:       74 b1                   je     1c93 <refcnt_rele+0x33>
>     1ce2:       48 8b 7e 10             mov    0x10(%rsi),%rdi
>     1ce6:       4c 8b 5f 18             mov    0x18(%rdi),%r11
>     1cea:       89 c1                   mov    %eax,%ecx
>     1cec:       41 b8 ff ff ff ff       mov    $0xffffffff,%r8d
>     1cf2:       41 89 c6                mov    %eax,%r14d
>     1cf5:       31 c0                   xor    %eax,%eax
>     1cf7:       e8 00 00 00 00          callq  1cfc <refcnt_rele+0x9c>
>     1cfc:       48 c7 44 24 f8 00 00    movq   $0x0,0xfffffffffffffff8(%rsp)
>     1d03:       00 00
>     1d05:       44 89 f0                mov    %r14d,%eax
>     1d08:       eb 89                   jmp    1c93 <refcnt_rele+0x33>
> /home/bluhm/openbsd/cvs/src/sys/kern/kern_synch.c:932
> 
>    925  int
>    926  refcnt_rele(struct refcnt *r)
>    927  {
>    928          u_int refs;
>    929
>    930          membar_exit_before_atomic();
>    931          refs = atomic_dec_int_nv(&r->r_refs);
>    932          KASSERT(refs != ~0);
> *  933          TRACEINDEX(refcnt, r->r_traceidx, r, refs + 1, -1);
>    934          if (refs == 0) {
>    935                  membar_enter_after_atomic();
>    936                  return (1);
>    937          }
>    938          return (0);
>    939  }
> 
>    330  #define DT_INDEX_ENTER(func, index, args...) do {
>    331          extern struct dt_probe **_DT_INDEX_P(func);
>    332
>    333          if (__predict_false(dt_tracing) &&
>    334              __predict_false(index > 0) &&
>    335              __predict_true(_DT_INDEX_P(func) != NULL)) {
> *  336                  struct dt_probe *dtp = _DT_INDEX_P(func)[index];
>    337
>    338                  if(__predict_false(dtp->dtp_recording)) {
>    339                          struct dt_provider *dtpv = dtp->dtp_prov;
>    340
>    341                          dtpv->dtpv_enter(dtpv, dtp, args);
>    342                  }
>    343          }
>    344  } while (0)
> 
> r->r_traceidx is invalid so it crashed when dt(4) uses it.
> 
> I think this is not a btrace problem, but a use after free.  btrace
> just makes it visible.  When refcnt_rele() is run, refcnt r must
> be valid memory.
> 
> I still prefer the other refcounting diff.  It works fine for me
> and maybe a bit faster.

from another perspective, if the use after free is the result
mishandling mbufs, then the other diff is more forgiving of a buggy
behaviour.

Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
diff -u -p -r1.304 uipc_mbuf.c
--- kern/uipc_mbuf.c	5 Feb 2026 03:26:00 -0000	1.304
+++ kern/uipc_mbuf.c	22 Apr 2026 06:04:19 -0000
@@ -123,9 +123,21 @@ int max_linkhdr;		/* largest link-level 
 int max_protohdr;		/* largest protocol header */
 int max_hdr;			/* largest link+protocol header */
 
-struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
+struct m_ext_refs {
+	void		*arg;
+	u_int		 free_fn;
+	u_int		 zero;
+	struct refcnt	 refs;
+};
+
+static struct pool	m_ext_refs_pool;
+
+static void		m_extfree_refs(caddr_t, u_int, void *);
+u_int			m_extfree_refs_fn;
+
+static int		m_extref(struct mbuf *, struct mbuf *, int);
+static void		m_extfree(struct mbuf *);
 
-void	m_extfree(struct mbuf *);
 void	m_zero(struct mbuf *);
 
 unsigned long mbuf_mem_limit;	/* [a] how much memory can be allocated */
@@ -174,6 +186,8 @@ mbinit(void)
 
 	pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
 	    IPL_NET, 0, "mtagpl", NULL);
+	pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
+	    IPL_NET, 0, "mextrefs", NULL);
 
 	for (i = 0; i < nitems(mclsizes); i++) {
 		lowbits = mclsizes[i] & ((1 << 10) - 1);
@@ -193,6 +207,7 @@ mbinit(void)
 
 	(void)mextfree_register(m_extfree_pool);
 	KASSERT(num_extfree_fns == 1);
+	m_extfree_refs_fn = mextfree_register(m_extfree_refs);
 }
 
 void
@@ -204,6 +219,7 @@ mbcpuinit(void)
 
 	pool_cache_init(&mbpool);
 	pool_cache_init(&mtagpool);
+	pool_cache_init(&m_ext_refs_pool);
 
 	for (i = 0; i < nitems(mclsizes); i++)
 		pool_cache_init(&mclpools[i]);
@@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size, 
 	pool_put(pp, buf);
 }
 
+int
+m_ext_refs_shared(struct mbuf *m)
+{
+	struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+	return (refcnt_shared(&mrefs->refs));
+}
+
+static void
+m_extfree_refs(caddr_t buf, u_int size, void *arg)
+{
+	struct m_ext_refs *mrefs = arg;
+
+	if (refcnt_rele(&mrefs->refs)) {
+		if (mrefs->zero)
+			explicit_bzero(buf, size);
+
+		KASSERT(mrefs->free_fn < num_extfree_fns);
+		KASSERT(mrefs->free_fn != m_extfree_refs_fn);
+
+		mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
+
+		pool_put(&m_ext_refs_pool, mrefs);
+	}
+}
+
 struct mbuf *
 m_free(struct mbuf *m)
 {
@@ -434,44 +476,33 @@ m_free(struct mbuf *m)
 	return (n);
 }
 
-void
-m_extref(struct mbuf *o, struct mbuf *n)
+static int
+m_extref(struct mbuf *m, struct mbuf *n, int how)
 {
-	int refs = MCLISREFERENCED(o);
+	struct m_ext_refs *mrefs;
 
-	n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
+	if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
+		mrefs = m->m_ext.ext_arg;
+	else {
+		mrefs = pool_get(&m_ext_refs_pool, how);
+		if (mrefs == NULL)
+			return (ENOMEM);
 
-	if (refs)
-		mtx_enter(&m_extref_mtx);
-	n->m_ext.ext_nextref = o->m_ext.ext_nextref;
-	n->m_ext.ext_prevref = o;
-	o->m_ext.ext_nextref = n;
-	n->m_ext.ext_nextref->m_ext.ext_prevref = n;
-	if (refs)
-		mtx_leave(&m_extref_mtx);
-
-	MCLREFDEBUGN((n), __FILE__, __LINE__);
-}
+		refcnt_init(&mrefs->refs);
+		mrefs->arg = m->m_ext.ext_arg;
+		mrefs->free_fn = m->m_ext.ext_free_fn;
+		mrefs->zero = 0;
 
-static inline u_int
-m_extunref(struct mbuf *m)
-{
-	int refs = 0;
+		m->m_ext.ext_arg = mrefs;
+		m->m_ext.ext_free_fn = m_extfree_refs_fn;
+	}
 
-	if (!MCLISREFERENCED(m))
-		return (0);
+	refcnt_take(&mrefs->refs);
 
-	mtx_enter(&m_extref_mtx);
-	if (MCLISREFERENCED(m)) {
-		m->m_ext.ext_nextref->m_ext.ext_prevref =
-		    m->m_ext.ext_prevref;
-		m->m_ext.ext_prevref->m_ext.ext_nextref =
-		    m->m_ext.ext_nextref;
-		refs = 1;
-	}
-	mtx_leave(&m_extref_mtx);
+	MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
+	    m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
 
-	return (refs);
+	return (0);
 }
 
 /*
@@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
 	return num_extfree_fns++;
 }
 
-void
+static void
 m_extfree(struct mbuf *m)
 {
-	if (m_extunref(m) == 0) {
-		KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
-		mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
-		    m->m_ext.ext_size, m->m_ext.ext_arg);
-	}
-
+	KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
+	mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
+	    m->m_ext.ext_size, m->m_ext.ext_arg);
+ 
 	m->m_flags &= ~(M_EXT|M_EXTWR);
 }
 
@@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
 		}
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
+			if (m_extref(m, n, wait) != 0)
+				goto nospace;
 			n->m_data = m->m_data + off;
-			n->m_ext = m->m_ext;
-			MCLADDREFERENCE(m, n);
 		} else {
 			n->m_data += m->m_data -
 			    (m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
@@ -1089,8 +1118,12 @@ m_split(struct mbuf *m0, int len0, int w
 			return (NULL);
 	}
 	if (m->m_flags & M_EXT) {
-		n->m_ext = m->m_ext;
-		MCLADDREFERENCE(m, n);
+		if (m_extref(m, n, wait) != 0) {
+			m_freem(n);
+			if (m0->m_flags & M_PKTHDR)
+				m0->m_pkthdr.len = olen;
+			return (NULL);
+		}
 		n->m_data = m->m_data + len;
 	} else {
 		m_align(n, remain);
@@ -1271,13 +1304,17 @@ m_devget(char *buf, int totlen, int off)
 void
 m_zero(struct mbuf *m)
 {
-	if (M_READONLY(m)) {
-		mtx_enter(&m_extref_mtx);
-		if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
-			m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
-			m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
-		}
-		mtx_leave(&m_extref_mtx);
+	if (ISSET(m->m_flags, M_EXT) &&
+	    m->m_ext.ext_free_fn == m_extfree_refs_fn) {
+		struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+		/*
+		 * this variable only transitions in one direction,
+		 * so if there is a race it will be toward the same
+		 * result and therefore there is no loss.
+		 */
+
+		mrefs->zero = 1;
 		return;
 	}
 
@@ -1525,9 +1562,7 @@ m_print(void *v,
 		    m->m_ext.ext_buf, m->m_ext.ext_size);
 		(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
 		    m->m_ext.ext_free_fn, m->m_ext.ext_arg);
-		(*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
-		    m->m_ext.ext_nextref, m->m_ext.ext_prevref);
-
+		/* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
 	}
 }
 
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.269 mbuf.h
--- sys/mbuf.h	5 Feb 2026 03:26:00 -0000	1.269
+++ sys/mbuf.h	22 Apr 2026 06:04:19 -0000
@@ -145,8 +145,6 @@ struct mbuf_ext {
 	void	*ext_arg;
 	u_int	ext_free_fn;		/* index of free function */
 	u_int	ext_size;		/* size of buffer, for ext_free_fn */
-	struct mbuf *ext_nextref;
-	struct mbuf *ext_prevref;
 #ifdef DEBUG
 	const char *ext_ofile;
 	const char *ext_nfile;
@@ -282,13 +280,22 @@ struct mbuf {
 #define MCLREFDEBUGO(m, file, line)
 #endif
 
-#define	MCLISREFERENCED(m)	((m)->m_ext.ext_nextref != (m))
+int	m_ext_refs_shared(struct mbuf *);
 
-#define	MCLADDREFERENCE(o, n)	m_extref((o), (n))
+static inline int
+m_extreferenced(struct mbuf *m)
+{
+	extern u_int m_extfree_refs_fn;
+
+	if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
+		return (0);
+
+	return (m_ext_refs_shared(m));
+}
+
+#define	MCLISREFERENCED(m)	m_extreferenced(m)
 
 #define	MCLINITREFERENCE(m)	do {					\
-		(m)->m_ext.ext_prevref = (m);				\
-		(m)->m_ext.ext_nextref = (m);				\
 		MCLREFDEBUGO((m), __FILE__, __LINE__);			\
 		MCLREFDEBUGN((m), NULL, 0);				\
 	} while (/* CONSTCOND */ 0)
@@ -439,7 +446,6 @@ int	m_leadingspace(struct mbuf *);
 int	m_trailingspace(struct mbuf *);
 void	m_align(struct mbuf *, int);
 struct mbuf *m_clget(struct mbuf *, int, u_int);
-void	m_extref(struct mbuf *, struct mbuf *);
 void	m_pool_init(struct pool *, u_int, u_int, const char *);
 u_int	m_pool_used(void);
 void	m_extfree_pool(caddr_t, u_int, void *);