Index | Thread | Search

From:
Claudio Jeker <cjeker@diehard.n-r-g.com>
Subject:
Re: mbuf cluster m_extref_mtx contention
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Mon, 9 Feb 2026 08:56:58 +0100

Download raw body.

Thread
Can we please just remove this stupid mutex and replace this linked circle
queue with a lock free implementation?  We really don't need yet another
cache of zombie mbufs.

There are many ways how to do that that all will perform much better.
There is very little contention on the actuall mbuf headers so some CAS
based loop should perform very well.

On Sun, Feb 08, 2026 at 10:32:29AM -0500, Alexander Bluhm wrote:
> Hi,
> 
> I some of my tests, especially with socket splicing, I see contention
> on the m_extref_mtx mutex.  It is a global lock to list mbufs that
> point to the same external cluster.
> 
> This happens when freeing mbuf chains in the TX path of the driver
> while the TCP stack still has a reference to the cluster for possible
> retransmits.  This diff avoids to grab the mutex for each m_freem().
> Instead the mbufs are cached in a per CPU mbuf list.  When 50 mbufs
> have been collected, the mutex is taken once to free all of them.
> 
> The number 50 is arbitrary, it means at most 4 MB of memory per CPU
> may be wasted.  Should I add an #ifdef MULTIPROCESSOR ?

4MB per CPU on a system with 80 cpus is 320MB. That is a lot of memory.
Especially since there is this problem that when the mbuf memory usage
reaches around 90% of usage the system will fail various socket calls.
 
> opinions?  ok?
> 
> bluhm
> 
> Index: kern/uipc_mbuf.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v
> diff -u -p -r1.302 uipc_mbuf.c
> --- kern/uipc_mbuf.c	6 Aug 2025 14:00:33 -0000	1.302
> +++ kern/uipc_mbuf.c	30 Dec 2025 22:26:06 -0000
> @@ -124,6 +124,7 @@ int max_protohdr;		/* largest protocol h
>  int max_hdr;			/* largest link+protocol header */
>  
>  struct	mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
> +struct	cpumem *m_extfree_cpu;
>  
>  void	m_extfree(struct mbuf *);
>  void	m_zero(struct mbuf *);
> @@ -207,6 +208,8 @@ mbcpuinit(void)
>  
>  	for (i = 0; i < nitems(mclsizes); i++)
>  		pool_cache_init(&mclpools[i]);
> +
> +	m_extfree_cpu = cpumem_malloc(sizeof(struct mbuf_list), M_MBUF);
>  }
>  
>  int
> @@ -426,9 +429,28 @@ m_free(struct mbuf *m)
>  		pf_mbuf_unlink_inpcb(m);
>  #endif	/* NPF > 0 */
>  	}
> -	if (m->m_flags & M_EXT)
> -		m_extfree(m);
> +	if (m->m_flags & M_EXT) {
> +		if (MCLISREFERENCED(m)) {
> +			struct mbuf_list *ml;
> +
> +			ml = cpumem_enter(m_extfree_cpu);
> +			s = splnet();
> +			ml_enqueue(ml, m);
> +			if (ml_len(ml) > 50) {
> +				mtx_enter(&m_extref_mtx);
> +				MBUF_LIST_FOREACH(ml, m)
> +					m_extfree(m);
> +				mtx_leave(&m_extref_mtx);
> +				while ((m = ml_dequeue(ml)) != NULL)
> +					pool_put(&mbpool, m);
> +			}
> +			splx(s);
> +			cpumem_leave(m_extfree_cpu, ml);
>  
> +			return (n);
> +		}
> +		m_extfree(m);
> +	}
>  	pool_put(&mbpool, m);
>  
>  	return (n);
> @@ -456,22 +478,15 @@ m_extref(struct mbuf *o, struct mbuf *n)
>  static inline u_int
>  m_extunref(struct mbuf *m)
>  {
> -	int refs = 0;
> -
>  	if (!MCLISREFERENCED(m))
>  		return (0);
>  
> -	mtx_enter(&m_extref_mtx);
> -	if (MCLISREFERENCED(m)) {
> -		m->m_ext.ext_nextref->m_ext.ext_prevref =
> -		    m->m_ext.ext_prevref;
> -		m->m_ext.ext_prevref->m_ext.ext_nextref =
> -		    m->m_ext.ext_nextref;
> -		refs = 1;
> -	}
> -	mtx_leave(&m_extref_mtx);
> +	MUTEX_ASSERT_LOCKED(&m_extref_mtx);
> +
> +	m->m_ext.ext_nextref->m_ext.ext_prevref = m->m_ext.ext_prevref;
> +	m->m_ext.ext_prevref->m_ext.ext_nextref = m->m_ext.ext_nextref;
>  
> -	return (refs);
> +	return (1);
>  }
>  
>  /*
> @@ -555,8 +570,15 @@ m_defrag(struct mbuf *m, int how)
>  	/* free chain behind and possible ext buf on the first mbuf */
>  	m_freem(m->m_next);
>  	m->m_next = NULL;
> -	if (m->m_flags & M_EXT)
> +	if (m->m_flags & M_EXT) {
> +		int lock = MCLISREFERENCED(m);
> +
> +		if (lock)
> +			mtx_enter(&m_extref_mtx);
>  		m_extfree(m);
> +		if (lock)
> +			mtx_leave(&m_extref_mtx);
> +	}
>  
>  	/*
>  	 * Bounce copy mbuf over to the original mbuf and set everything up.
> Index: sys/malloc.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/malloc.h,v
> diff -u -p -r1.127 malloc.h
> --- sys/malloc.h	5 Feb 2025 18:29:17 -0000	1.127
> +++ sys/malloc.h	30 Dec 2025 21:47:58 -0000
> @@ -65,7 +65,7 @@
>  #define	M_FREE		0	/* should be on free list */
>  /* 1 - free */
>  #define	M_DEVBUF	2	/* device driver memory */
> -/* 3 - free */
> +#define M_MBUF		3	/* mbuf */
>  #define	M_PCB		4	/* protocol control blocks */
>  #define	M_RTABLE	5	/* routing tables */
>  #define	M_PF		6	/* packet filter structures */
> 

-- 
:wq Claudio