Download raw body.
mbuf cluster m_extref_mtx contention
On Mon, Mar 30, 2026 at 01:56:38PM +0200, Alexander Bluhm wrote:
> On Tue, Mar 17, 2026 at 02:24:01PM +0100, Alexander Bluhm wrote:
> > On Wed, Feb 11, 2026 at 12:26:15PM +1000, David Gwynne wrote:
> > > this is the proxy ref thing if you want to give it a go. the code is a
> > > lot simpler at least.
> >
> > I have tested both variants of the diff again and with this one I
> > got a panic. It was the sixth repetition of my test runs, this
> > time with btrace profiling turned on.
>
> It is reproduceable and it is triggered by btrace.
i havent spent as much time on this as i would have liked because i
havent been able to reproduce this, but there's a chance that this was
related to the fix committed in src/sys/net/pf_norm.c r1.237 for the pf
frag rb tree topology.
i also found a bug i introduced in m_split, but i havent figured out if
it could cause a crash like this. that bug is fixed in the diff below.
however, i did go through and audit the callers of mbuf functions
that create shared cluster refs, and they all appear to be doing
the right thing and propagate failure. i expected this though, cos the
affected functions can fail because of mbuf allocation issues anyway, so
the callers would already have to be robust.
> *cpu1: uvm_fault(0xffffffff82bb0cb8, 0x2414c5fe0, 0, 1) -> e
>
> *58677 355342 0 0 7 0x14200 softnet0
>
> ddb{1}> trace
> x86_ipi_db(ffff80005a5eaff0) at x86_ipi_db+0x16
> x86_ipi_handler() at x86_ipi_handler+0x80
> Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
> x86_bus_space_io_write_1(3f8,0,20) at x86_bus_space_io_write_1+0x1d
> comcnputc(800,20) at comcnputc+0xdc
> cnputc(20) at cnputc+0x47
> db_putchar(70) at db_putchar+0x42a
> kprintf() at kprintf+0x12ff
> db_printf(ffffffff82662513) at db_printf+0x6d
> db_ktrap(6,0,ffff8000632447a0) at db_ktrap+0x205
> kerntrap(ffff8000632447a0) at kerntrap+0xe2
> alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
> refcnt_rele(fffffd8f371b6d50) at refcnt_rele+0x78
> m_extfree_refs(fffffd8003fc7540,840,fffffd8f371b6d40) at m_extfree_refs+0x31
> m_free(fffffd808accec00) at m_free+0x170
> m_freem(fffffd8089550e00) at m_freem+0x38
> ip6_forward(fffffd8089d77900,0,2) at ip6_forward+0x9bf
> pf_refragment6(ffff800063244c00,fffffd8f080a7010,0,0,0) at pf_refragment6+0x19a
> pf_test(18,3,ffff800000393048,ffff800063244e48) at pf_test+0x83f
> ip6_forward(fffffd8089550e00,ffffffff82b77608,2) at ip6_forward+0x3ce
> ip6_input_if(ffff800063244fd8,ffff800063244fe4,29,0,ffff800000279048,ffffffff82b775d8) at ip6_input_if+0x87d
> ipv6_input(ffff800000279048,fffffd808accec00,ffffffff82b775d8) at ipv6_input+0x42
> if_input_process(ffff800000279048,ffff800063245098,0) at if_input_process+0x20a
> ifiq_process(ffff800000279480) at ifiq_process+0xa1
> taskq_thread(ffff80000002c000) at taskq_thread+0x129
> end trace frame: 0x0, count: -25
>
> ddb{1}> x/i refcnt_rele+0x78
> refcnt_rele+0x78: movq 0(%rsi,%rcx,8),%rsi
>
> /home/bluhm/openbsd/cvs/src/sys/kern/kern_synch.c:933
> 1cc3: 48 63 4f 04 movslq 0x4(%rdi),%rcx
> 1cc7: 48 85 c9 test %rcx,%rcx
> 1cca: 7e c7 jle 1c93 <refcnt_rele+0x33>
> 1ccc: 48 8b 35 00 00 00 00 mov 0(%rip),%rsi # 1cd3 <refcnt_rele+0x73>
> 1cd3: 48 85 f6 test %rsi,%rsi
> 1cd6: 74 bb je 1c93 <refcnt_rele+0x33>
> 1cd8: 48 8b 34 ce mov (%rsi,%rcx,8),%rsi
> * 1cdc: 83 7e 2c 00 cmpl $0x0,0x2c(%rsi)
> 1ce0: 74 b1 je 1c93 <refcnt_rele+0x33>
> 1ce2: 48 8b 7e 10 mov 0x10(%rsi),%rdi
> 1ce6: 4c 8b 5f 18 mov 0x18(%rdi),%r11
> 1cea: 89 c1 mov %eax,%ecx
> 1cec: 41 b8 ff ff ff ff mov $0xffffffff,%r8d
> 1cf2: 41 89 c6 mov %eax,%r14d
> 1cf5: 31 c0 xor %eax,%eax
> 1cf7: e8 00 00 00 00 callq 1cfc <refcnt_rele+0x9c>
> 1cfc: 48 c7 44 24 f8 00 00 movq $0x0,0xfffffffffffffff8(%rsp)
> 1d03: 00 00
> 1d05: 44 89 f0 mov %r14d,%eax
> 1d08: eb 89 jmp 1c93 <refcnt_rele+0x33>
> /home/bluhm/openbsd/cvs/src/sys/kern/kern_synch.c:932
>
> 925 int
> 926 refcnt_rele(struct refcnt *r)
> 927 {
> 928 u_int refs;
> 929
> 930 membar_exit_before_atomic();
> 931 refs = atomic_dec_int_nv(&r->r_refs);
> 932 KASSERT(refs != ~0);
> * 933 TRACEINDEX(refcnt, r->r_traceidx, r, refs + 1, -1);
> 934 if (refs == 0) {
> 935 membar_enter_after_atomic();
> 936 return (1);
> 937 }
> 938 return (0);
> 939 }
>
> 330 #define DT_INDEX_ENTER(func, index, args...) do {
> 331 extern struct dt_probe **_DT_INDEX_P(func);
> 332
> 333 if (__predict_false(dt_tracing) &&
> 334 __predict_false(index > 0) &&
> 335 __predict_true(_DT_INDEX_P(func) != NULL)) {
> * 336 struct dt_probe *dtp = _DT_INDEX_P(func)[index];
> 337
> 338 if(__predict_false(dtp->dtp_recording)) {
> 339 struct dt_provider *dtpv = dtp->dtp_prov;
> 340
> 341 dtpv->dtpv_enter(dtpv, dtp, args);
> 342 }
> 343 }
> 344 } while (0)
>
> r->r_traceidx is invalid so it crashed when dt(4) uses it.
>
> I think this is not a btrace problem, but a use after free. btrace
> just makes it visible. When refcnt_rele() is run, refcnt r must
> be valid memory.
>
> I still prefer the other refcounting diff. It works fine for me
> and maybe a bit faster.
from another perspective, if the use after free is the result
mishandling mbufs, then the other diff is more forgiving of a buggy
behaviour.
Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
diff -u -p -r1.304 uipc_mbuf.c
--- kern/uipc_mbuf.c 5 Feb 2026 03:26:00 -0000 1.304
+++ kern/uipc_mbuf.c 22 Apr 2026 06:04:19 -0000
@@ -123,9 +123,21 @@ int max_linkhdr; /* largest link-level
int max_protohdr; /* largest protocol header */
int max_hdr; /* largest link+protocol header */
-struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
+struct m_ext_refs {
+ void *arg;
+ u_int free_fn;
+ u_int zero;
+ struct refcnt refs;
+};
+
+static struct pool m_ext_refs_pool;
+
+static void m_extfree_refs(caddr_t, u_int, void *);
+u_int m_extfree_refs_fn;
+
+static int m_extref(struct mbuf *, struct mbuf *, int);
+static void m_extfree(struct mbuf *);
-void m_extfree(struct mbuf *);
void m_zero(struct mbuf *);
unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */
@@ -174,6 +186,8 @@ mbinit(void)
pool_init(&mtagpool, PACKET_TAG_MAXSIZE + sizeof(struct m_tag), 0,
IPL_NET, 0, "mtagpl", NULL);
+ pool_init(&m_ext_refs_pool, sizeof(struct m_ext_refs), CACHELINESIZE,
+ IPL_NET, 0, "mextrefs", NULL);
for (i = 0; i < nitems(mclsizes); i++) {
lowbits = mclsizes[i] & ((1 << 10) - 1);
@@ -193,6 +207,7 @@ mbinit(void)
(void)mextfree_register(m_extfree_pool);
KASSERT(num_extfree_fns == 1);
+ m_extfree_refs_fn = mextfree_register(m_extfree_refs);
}
void
@@ -204,6 +219,7 @@ mbcpuinit(void)
pool_cache_init(&mbpool);
pool_cache_init(&mtagpool);
+ pool_cache_init(&m_ext_refs_pool);
for (i = 0; i < nitems(mclsizes); i++)
pool_cache_init(&mclpools[i]);
@@ -399,6 +415,32 @@ m_extfree_pool(caddr_t buf, u_int size,
pool_put(pp, buf);
}
+int
+m_ext_refs_shared(struct mbuf *m)
+{
+ struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+ return (refcnt_shared(&mrefs->refs));
+}
+
+static void
+m_extfree_refs(caddr_t buf, u_int size, void *arg)
+{
+ struct m_ext_refs *mrefs = arg;
+
+ if (refcnt_rele(&mrefs->refs)) {
+ if (mrefs->zero)
+ explicit_bzero(buf, size);
+
+ KASSERT(mrefs->free_fn < num_extfree_fns);
+ KASSERT(mrefs->free_fn != m_extfree_refs_fn);
+
+ mextfree_fns[mrefs->free_fn](buf, size, mrefs->arg);
+
+ pool_put(&m_ext_refs_pool, mrefs);
+ }
+}
+
struct mbuf *
m_free(struct mbuf *m)
{
@@ -434,44 +476,33 @@ m_free(struct mbuf *m)
return (n);
}
-void
-m_extref(struct mbuf *o, struct mbuf *n)
+static int
+m_extref(struct mbuf *m, struct mbuf *n, int how)
{
- int refs = MCLISREFERENCED(o);
+ struct m_ext_refs *mrefs;
- n->m_flags |= o->m_flags & (M_EXT|M_EXTWR);
+ if (m->m_ext.ext_free_fn == m_extfree_refs_fn)
+ mrefs = m->m_ext.ext_arg;
+ else {
+ mrefs = pool_get(&m_ext_refs_pool, how);
+ if (mrefs == NULL)
+ return (ENOMEM);
- if (refs)
- mtx_enter(&m_extref_mtx);
- n->m_ext.ext_nextref = o->m_ext.ext_nextref;
- n->m_ext.ext_prevref = o;
- o->m_ext.ext_nextref = n;
- n->m_ext.ext_nextref->m_ext.ext_prevref = n;
- if (refs)
- mtx_leave(&m_extref_mtx);
-
- MCLREFDEBUGN((n), __FILE__, __LINE__);
-}
+ refcnt_init(&mrefs->refs);
+ mrefs->arg = m->m_ext.ext_arg;
+ mrefs->free_fn = m->m_ext.ext_free_fn;
+ mrefs->zero = 0;
-static inline u_int
-m_extunref(struct mbuf *m)
-{
- int refs = 0;
+ m->m_ext.ext_arg = mrefs;
+ m->m_ext.ext_free_fn = m_extfree_refs_fn;
+ }
- if (!MCLISREFERENCED(m))
- return (0);
+ refcnt_take(&mrefs->refs);
- mtx_enter(&m_extref_mtx);
- if (MCLISREFERENCED(m)) {
- m->m_ext.ext_nextref->m_ext.ext_prevref =
- m->m_ext.ext_prevref;
- m->m_ext.ext_prevref->m_ext.ext_nextref =
- m->m_ext.ext_nextref;
- refs = 1;
- }
- mtx_leave(&m_extref_mtx);
+ MEXTADD(n, m->m_ext.ext_buf, m->m_ext.ext_size,
+ m->m_flags & M_EXTWR, m_extfree_refs_fn, mrefs);
- return (refs);
+ return (0);
}
/*
@@ -487,15 +518,13 @@ mextfree_register(void (*fn)(caddr_t, u_
return num_extfree_fns++;
}
-void
+static void
m_extfree(struct mbuf *m)
{
- if (m_extunref(m) == 0) {
- KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
- mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
- m->m_ext.ext_size, m->m_ext.ext_arg);
- }
-
+ KASSERT(m->m_ext.ext_free_fn < num_extfree_fns);
+ mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf,
+ m->m_ext.ext_size, m->m_ext.ext_arg);
+
m->m_flags &= ~(M_EXT|M_EXTWR);
}
@@ -656,9 +685,9 @@ m_copym(struct mbuf *m0, int off, int le
}
n->m_len = min(len, m->m_len - off);
if (m->m_flags & M_EXT) {
+ if (m_extref(m, n, wait) != 0)
+ goto nospace;
n->m_data = m->m_data + off;
- n->m_ext = m->m_ext;
- MCLADDREFERENCE(m, n);
} else {
n->m_data += m->m_data -
(m->m_flags & M_PKTHDR ? m->m_pktdat : m->m_dat);
@@ -1089,8 +1118,12 @@ m_split(struct mbuf *m0, int len0, int w
return (NULL);
}
if (m->m_flags & M_EXT) {
- n->m_ext = m->m_ext;
- MCLADDREFERENCE(m, n);
+ if (m_extref(m, n, wait) != 0) {
+ m_freem(n);
+ if (m0->m_flags & M_PKTHDR)
+ m0->m_pkthdr.len = olen;
+ return (NULL);
+ }
n->m_data = m->m_data + len;
} else {
m_align(n, remain);
@@ -1271,13 +1304,17 @@ m_devget(char *buf, int totlen, int off)
void
m_zero(struct mbuf *m)
{
- if (M_READONLY(m)) {
- mtx_enter(&m_extref_mtx);
- if ((m->m_flags & M_EXT) && MCLISREFERENCED(m)) {
- m->m_ext.ext_nextref->m_flags |= M_ZEROIZE;
- m->m_ext.ext_prevref->m_flags |= M_ZEROIZE;
- }
- mtx_leave(&m_extref_mtx);
+ if (ISSET(m->m_flags, M_EXT) &&
+ m->m_ext.ext_free_fn == m_extfree_refs_fn) {
+ struct m_ext_refs *mrefs = m->m_ext.ext_arg;
+
+ /*
+ * this variable only transitions in one direction,
+ * so if there is a race it will be toward the same
+ * result and therefore there is no loss.
+ */
+
+ mrefs->zero = 1;
return;
}
@@ -1525,9 +1562,7 @@ m_print(void *v,
m->m_ext.ext_buf, m->m_ext.ext_size);
(*pr)("m_ext.ext_free_fn: %u\tm_ext.ext_arg: %p\n",
m->m_ext.ext_free_fn, m->m_ext.ext_arg);
- (*pr)("m_ext.ext_nextref: %p\tm_ext.ext_prevref: %p\n",
- m->m_ext.ext_nextref, m->m_ext.ext_prevref);
-
+ /* if m_ext.ext_free_fn == m_extfree_refs_fn ? */
}
}
Index: sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.269 mbuf.h
--- sys/mbuf.h 5 Feb 2026 03:26:00 -0000 1.269
+++ sys/mbuf.h 22 Apr 2026 06:04:19 -0000
@@ -145,8 +145,6 @@ struct mbuf_ext {
void *ext_arg;
u_int ext_free_fn; /* index of free function */
u_int ext_size; /* size of buffer, for ext_free_fn */
- struct mbuf *ext_nextref;
- struct mbuf *ext_prevref;
#ifdef DEBUG
const char *ext_ofile;
const char *ext_nfile;
@@ -282,13 +280,22 @@ struct mbuf {
#define MCLREFDEBUGO(m, file, line)
#endif
-#define MCLISREFERENCED(m) ((m)->m_ext.ext_nextref != (m))
+int m_ext_refs_shared(struct mbuf *);
-#define MCLADDREFERENCE(o, n) m_extref((o), (n))
+static inline int
+m_extreferenced(struct mbuf *m)
+{
+ extern u_int m_extfree_refs_fn;
+
+ if (m->m_ext.ext_free_fn != m_extfree_refs_fn)
+ return (0);
+
+ return (m_ext_refs_shared(m));
+}
+
+#define MCLISREFERENCED(m) m_extreferenced(m)
#define MCLINITREFERENCE(m) do { \
- (m)->m_ext.ext_prevref = (m); \
- (m)->m_ext.ext_nextref = (m); \
MCLREFDEBUGO((m), __FILE__, __LINE__); \
MCLREFDEBUGN((m), NULL, 0); \
} while (/* CONSTCOND */ 0)
@@ -439,7 +446,6 @@ int m_leadingspace(struct mbuf *);
int m_trailingspace(struct mbuf *);
void m_align(struct mbuf *, int);
struct mbuf *m_clget(struct mbuf *, int, u_int);
-void m_extref(struct mbuf *, struct mbuf *);
void m_pool_init(struct pool *, u_int, u_int, const char *);
u_int m_pool_used(void);
void m_extfree_pool(caddr_t, u_int, void *);
mbuf cluster m_extref_mtx contention