
From: Alexander Bluhm <bluhm@openbsd.org>
Subject: Re: use softnet for socket splicing
To: tech@openbsd.org
Date: Fri, 19 Sep 2025 00:22:21 +0200

    On Thu, Sep 18, 2025 at 02:49:46PM +0200, Alexander Bluhm wrote:
    > On Fri, Jul 25, 2025 at 02:07:57PM +0200, Alexander Bluhm wrote:
    > > Hi,
    > > 
    > > Currently socket splicing runs on one dedicated kernel thread.  This
    > > design is from a time when softnet was still a soft interrupt.
    > > 
    > > Now with multiple softnet threads, I want to retire the sosplice
    > > thread.  Instead, call sotask() on the softnet task queue.  For
    > > that I have to pass the queue down in struct netstack.  Basically
    > > sorwakeup() and sowwakeup() get an additional argument.  If netstack
    > > and softnet are available, I use this specific task queue.  Otherwise
    > > softnet thread 0 is sufficient.  The hot path receiving packets
    > > will distribute them over all softnet threads.
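
    A minimal sketch of that earlier interface, with simplified locking
    and hypothetical signatures; the updated diff below takes the flowid
    route instead:

	/*
	 * Sketch: the wakeup path receives the softnet task queue
	 * handed down via struct netstack.  Without a queue, fall
	 * back to softnet thread 0.
	 */
	void
	sorwakeup(struct socket *so, struct taskq *tq)
	{
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(tq != NULL ? tq : net_tq(0),
			    &so->so_splicetask);
		sowakeup(so, &so->so_rcv);
	}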
    > > 
    > > Keeping the same softnet thread means that we take a bunch of
    > > packets from the network driver, do input processing, and store
    > > them in socket buffers.  Then the same thread handles the splicing
    > > task, calls somove(), and does output processing.  There is no
    > > concurrent locking or scheduling; ideally packets stay on the same
    > > CPU.  Before, I had a yield() in sotask() to allow accumulation
    > > of packets.  With the new design this is no longer necessary.
    > > 
    > > As we run on the softnet task queue and add splice tasks there, a
    > > taskq barrier causes a deadlock.  I replaced the barriers with a
    > > reference count in task_add(), task_del(), and sotask().
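
    For illustration, a sketch of the reference count idea, assuming a
    hypothetical ssp_refcnt field in struct sosplice; the updated diff
    below drains with taskq_barrier() from the close path instead:

	/* task_add()/task_del() return 1 only when they change state. */
	void
	sosplice_task_add(struct socket *so)
	{
		if (task_add(so->so_splicequeue, &so->so_splicetask))
			refcnt_take(&so->so_sp->ssp_refcnt);
	}

	void
	sosplice_task_del(struct socket *so)
	{
		if (task_del(so->so_splicequeue, &so->so_splicetask))
			refcnt_rele_wake(&so->so_sp->ssp_refcnt);
	}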
    > > 
    > > Depending on the test, it improves TCP throughput between 3% and
    > > 20%.  For UDP I see a 30% to 75% increase.
    > 
    > I have a new diff that avoids using netstack in sowakeup().  When
    > splicing, each PCB must have a flowid.  We can use that to find a
    > suitable softnet thread for the splicing task.
    > 
    > In my setup throughput increases from 30 Gbit to 40 Gbit per second.
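
    Concretely, a sketch of that lookup with a hypothetical helper;
    pru_flowid() resolves to in_flowid() for TCP and UDP in the diff
    below:

	/*
	 * net_tq() maps the flowid onto one of the softnet task
	 * queues, so wakeups for the same PCB always land on the
	 * same softnet thread.
	 */
	static struct taskq *
	splice_taskq(struct socket *so)
	{
		return (net_tq(pru_flowid(so)));
	}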
    
    A NULL check was missing in in_flowid().  Updated diff.
    
    ok?
    
    bluhm
    
    Index: kern/uipc_socket.c
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
    diff -u -p -r1.385 uipc_socket.c
    --- kern/uipc_socket.c	25 Jul 2025 08:58:44 -0000	1.385
    +++ kern/uipc_socket.c	17 Sep 2025 22:21:19 -0000
    @@ -52,6 +52,9 @@
     #include <sys/time.h>
     #include <sys/refcnt.h>
     
    +#include <net/if.h>
    +#include <net/if_var.h>
    +
     #ifdef DDB
     #include <machine/db_machdep.h>
     #endif
    @@ -117,14 +120,13 @@ int	sominconn = SOMINCONN;
     struct pool socket_pool;
     #ifdef SOCKET_SPLICE
     struct pool sosplice_pool;
    -struct taskq *sosplice_taskq;
    -struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
     
     #define so_splicelen	so_sp->ssp_len
     #define so_splicemax	so_sp->ssp_max
     #define so_spliceidletv	so_sp->ssp_idletv
     #define so_spliceidleto	so_sp->ssp_idleto
     #define so_splicetask	so_sp->ssp_task
    +#define so_splicequeue	so_sp->ssp_queue
     #endif
     
     void
    @@ -433,6 +435,7 @@ discard:
     #ifdef SOCKET_SPLICE
     	if (so->so_sp) {
     		struct socket *soback;
    +		struct taskq *spq, *spqback;
     
     		sounlock_shared(so);
     		/*
    @@ -461,7 +464,6 @@ discard:
     			sounsplice(so->so_sp->ssp_soback, so, freeing);
     		}
     		sbunlock(&soback->so_rcv);
    -		sorele(soback);
     
     notsplicedback:
     		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
    @@ -474,9 +476,16 @@ notsplicedback:
     		}
     		sbunlock(&so->so_rcv);
     
    -		timeout_del_barrier(&so->so_spliceidleto);
    -		task_del(sosplice_taskq, &so->so_splicetask);
    -		taskq_barrier(sosplice_taskq);
    +		timeout_barrier(&so->so_spliceidleto);
    +		spq = READ_ONCE(so->so_splicequeue);
    +		if (spq != NULL)
    +			taskq_barrier(spq);
    +		if (soback != NULL) {
    +			spqback = READ_ONCE(soback->so_splicequeue);
    +			if (spqback != NULL && spqback != spq)
    +				taskq_barrier(spqback);
    +			sorele(soback);
    +		}
     
     		solock_shared(so);
     	}
    @@ -1290,7 +1299,6 @@ sosplice(struct socket *so, int fd, off_
     {
     	struct file	*fp;
     	struct socket	*sosp;
    -	struct taskq	*tq;
     	int		 error = 0;
     
     	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
    @@ -1312,25 +1320,6 @@ sosplice(struct socket *so, int fd, off_
     		return (error);
     	}
     
    -	if (sosplice_taskq == NULL) {
    -		rw_enter_write(&sosplice_lock);
    -		if (sosplice_taskq == NULL) {
    -			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
    -			    TASKQ_MPSAFE);
    -			if (tq == NULL) {
    -				rw_exit_write(&sosplice_lock);
    -				return (ENOMEM);
    -			}
    -			/* Ensure the taskq is fully visible to other CPUs. */
    -			membar_producer();
    -			sosplice_taskq = tq;
    -		}
    -		rw_exit_write(&sosplice_lock);
    -	} else {
    -		/* Ensure the taskq is fully visible on this CPU. */
    -		membar_consumer();
    -	}
    -
     	/* Find sosp, the drain socket where data will be spliced into. */
     	if ((error = getsock(curproc, fd, &fp)) != 0)
     		return (error);
    @@ -1435,15 +1424,15 @@ sounsplice(struct socket *so, struct soc
     	mtx_enter(&sosp->so_snd.sb_mtx);
     	so->so_rcv.sb_flags &= ~SB_SPLICE;
     	sosp->so_snd.sb_flags &= ~SB_SPLICE;
    +	timeout_del(&so->so_spliceidleto);
    +	if (so->so_splicequeue != NULL)
    +		task_del(so->so_splicequeue, &so->so_splicetask);
     	KASSERT(so->so_sp->ssp_socket == sosp);
     	KASSERT(sosp->so_sp->ssp_soback == so);
     	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
     	mtx_leave(&sosp->so_snd.sb_mtx);
     	mtx_leave(&so->so_rcv.sb_mtx);
     
    -	task_del(sosplice_taskq, &so->so_splicetask);
    -	timeout_del(&so->so_spliceidleto);
    -
     	/* Do not wakeup a socket that is about to be freed. */
     	if ((freeing & SOSP_FREEING_READ) == 0) {
     		int readable;
    @@ -1484,20 +1473,11 @@ void
     sotask(void *arg)
     {
     	struct socket *so = arg;
    -	int doyield = 0;
     
     	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
    -	if (so->so_rcv.sb_flags & SB_SPLICE) {
    -		if (so->so_proto->pr_flags & PR_WANTRCVD)
    -			doyield = 1;
    +	if (so->so_rcv.sb_flags & SB_SPLICE)
     		somove(so, M_DONTWAIT);
    -	}
     	sbunlock(&so->so_rcv);
    -
    -	if (doyield) {
    -		/* Avoid user land starvation. */
    -		yield();
    -	}
     }
     
     /*
    @@ -1858,8 +1838,11 @@ sorwakeup(struct socket *so)
     #ifdef SOCKET_SPLICE
     	if (so->so_proto->pr_flags & PR_SPLICE) {
     		mtx_enter(&so->so_rcv.sb_mtx);
    -		if (so->so_rcv.sb_flags & SB_SPLICE)
    -			task_add(sosplice_taskq, &so->so_splicetask);
    +		if (so->so_rcv.sb_flags & SB_SPLICE) {
    +			atomic_cas_ptr(&so->so_splicequeue, NULL,
    +			    net_tq(pru_flowid(so)));
    +			task_add(so->so_splicequeue, &so->so_splicetask);
    +		}
     		if (isspliced(so)) {
     			mtx_leave(&so->so_rcv.sb_mtx);
     			return;
    @@ -1878,9 +1861,14 @@ sowwakeup(struct socket *so)
     #ifdef SOCKET_SPLICE
     	if (so->so_proto->pr_flags & PR_SPLICE) {
     		mtx_enter(&so->so_snd.sb_mtx);
    -		if (so->so_snd.sb_flags & SB_SPLICE)
    -			task_add(sosplice_taskq,
    -			    &so->so_sp->ssp_soback->so_splicetask);
    +		if (so->so_snd.sb_flags & SB_SPLICE) {
    +			struct socket *soback = so->so_sp->ssp_soback;
    +
    +			atomic_cas_ptr(&soback->so_splicequeue, NULL,
    +			    net_tq(pru_flowid(soback)));
    +			task_add(soback->so_splicequeue,
    +			    &soback->so_splicetask);
    +		}
     		if (issplicedback(so)) {
     			mtx_leave(&so->so_snd.sb_mtx);
     			return;
    Index: netinet/in_pcb.c
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
    diff -u -p -r1.320 in_pcb.c
    --- netinet/in_pcb.c	14 Jul 2025 21:53:46 -0000	1.320
    +++ netinet/in_pcb.c	18 Sep 2025 16:53:42 -0000
    @@ -779,6 +779,17 @@ in_peeraddr(struct socket *so, struct mb
     	return (0);
     }
     
    +int
    +in_flowid(struct socket *so)
    +{
    +	struct inpcb *inp;
    +
    +	inp = sotoinpcb(so);
    +	if (inp == NULL)
    +		return (0);
    +	return (inp->inp_flowid);
    +}
    +
     /*
      * Pass some notification to all connections of a protocol
      * associated with address dst.  The "usual action" will be
    Index: netinet/in_pcb.h
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
    diff -u -p -r1.171 in_pcb.h
    --- netinet/in_pcb.h	14 Jul 2025 09:01:52 -0000	1.171
    +++ netinet/in_pcb.h	17 Sep 2025 22:02:42 -0000
    @@ -357,6 +357,7 @@ void	 in_setpeeraddr(struct inpcb *, str
     void	 in_setsockaddr(struct inpcb *, struct mbuf *);
     int	 in_sockaddr(struct socket *, struct mbuf *);
     int	 in_peeraddr(struct socket *, struct mbuf *);
    +int	 in_flowid(struct socket *);
     int	 in_baddynamic(u_int16_t, u_int16_t);
     int	 in_rootonly(u_int16_t, u_int16_t);
     int	 in_pcbselsrc(struct in_addr *, const struct sockaddr_in *,
    Index: netinet/tcp_usrreq.c
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
    diff -u -p -r1.252 tcp_usrreq.c
    --- netinet/tcp_usrreq.c	8 Jul 2025 00:47:41 -0000	1.252
    +++ netinet/tcp_usrreq.c	17 Sep 2025 22:02:42 -0000
    @@ -131,6 +131,7 @@ const struct pr_usrreqs tcp_usrreqs = {
     	.pru_control	= in_control,
     	.pru_sockaddr	= tcp_sockaddr,
     	.pru_peeraddr	= tcp_peeraddr,
    +	.pru_flowid	= in_flowid,
     };
     
     #ifdef INET6
    @@ -152,6 +153,7 @@ const struct pr_usrreqs tcp6_usrreqs = {
     	.pru_control	= in6_control,
     	.pru_sockaddr	= tcp_sockaddr,
     	.pru_peeraddr	= tcp_peeraddr,
    +	.pru_flowid	= in_flowid,
     };
     #endif
     
    Index: netinet/udp_usrreq.c
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
    diff -u -p -r1.349 udp_usrreq.c
    --- netinet/udp_usrreq.c	18 Jul 2025 08:39:14 -0000	1.349
    +++ netinet/udp_usrreq.c	17 Sep 2025 22:02:42 -0000
    @@ -136,6 +136,7 @@ const struct pr_usrreqs udp_usrreqs = {
     	.pru_control	= in_control,
     	.pru_sockaddr	= in_sockaddr,
     	.pru_peeraddr	= in_peeraddr,
    +	.pru_flowid	= in_flowid,
     };
     
     #ifdef INET6
    @@ -150,6 +151,7 @@ const struct pr_usrreqs udp6_usrreqs = {
     	.pru_control	= in6_control,
     	.pru_sockaddr	= in6_sockaddr,
     	.pru_peeraddr	= in6_peeraddr,
    +	.pru_flowid	= in_flowid,
     };
     #endif
     
    Index: sys/protosw.h
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
    diff -u -p -r1.72 protosw.h
    --- sys/protosw.h	2 Mar 2025 21:28:32 -0000	1.72
    +++ sys/protosw.h	17 Sep 2025 22:02:42 -0000
    @@ -86,6 +86,7 @@ struct pr_usrreqs {
     		    struct mbuf *);
     	int	(*pru_sockaddr)(struct socket *, struct mbuf *);
     	int	(*pru_peeraddr)(struct socket *, struct mbuf *);
    +	int	(*pru_flowid)(struct socket *);
     	int	(*pru_connect2)(struct socket *, struct socket *);
     };
     
    @@ -392,6 +393,12 @@ static inline int
     pru_peeraddr(struct socket *so, struct mbuf *addr)
     {
     	return (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, addr);
    +}
    +
    +static inline int
    +pru_flowid(struct socket *so)
    +{
    +	return (*so->so_proto->pr_usrreqs->pru_flowid)(so);
     }
     
     static inline int
    Index: sys/socketvar.h
    ===================================================================
    RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
    diff -u -p -r1.159 socketvar.h
    --- sys/socketvar.h	25 Jul 2025 08:58:44 -0000	1.159
    +++ sys/socketvar.h	17 Sep 2025 22:02:42 -0000
    @@ -74,6 +74,7 @@ struct sosplice {
     	struct	timeval ssp_idletv;	/* [I] idle timeout */
     	struct	timeout ssp_idleto;
     	struct	task ssp_task;		/* task for somove */
    +	struct	taskq *ssp_queue;	/* [a] softnet queue where we add */
     };
     
     /*
    
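    A note on the wakeup hunks above: the atomic_cas_ptr() initializes
    so_splicequeue at most once, so the splice task can never sit on two
    different task queues.  The pattern, condensed and commented:

	/*
	 * Only the first caller installs a queue; later callers reuse
	 * whatever the CAS left behind.  in_flowid() returns 0 for a
	 * detached PCB, which net_tq(0) maps to softnet thread 0.
	 */
	atomic_cas_ptr(&so->so_splicequeue, NULL, net_tq(pru_flowid(so)));
	task_add(so->so_splicequeue, &so->so_splicetask);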
    