Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: use softnet for socket splicing
To:
tech@openbsd.org
Date:
Thu, 18 Sep 2025 14:49:46 +0200

Download raw body.

Thread
On Fri, Jul 25, 2025 at 02:07:57PM +0200, Alexander Bluhm wrote:
> Hi,
> 
> Currently socket splicing runs on one dedicated kernel thread.  This
> design is from a time when softnet was still a soft interrupt.
> 
> Now with multiple softnet threads, I want to retire the sosplice
> thread.  Instead call sotask() with the softnet task queue.  For
> that I have to pass the queue down in struct netstack.  Basically
> sorwakeup() and sowwakeup() get an additional argument.  If netstack
> and softnet are available, I use this specific task queue.  Otherwise
> softnet thread 0 is sufficient.  The hot path receiving packets
> will distribute them over all softnet threads.
> 
> Keeping the same softnet means that we take a bunch of packets from
> the network driver, do input processing, store them in socket
> buffers.  Then the same thread handles the splicing task, calls
> somove() and does output processing.  There is no concurrent locking
> or scheduling, ideally packets stay on the same CPU.  Before I had
> a yield() in sotask() to allow accumulation of packets.  With the
> new design this is no longer necessary.
> 
> As we run on softnet task queue and add splice tasks there, task
> barrier causes deadlock.  I replaced them with a reference count in
> task_add(), task_del(), and sotask().
> 
> Depending on the test, it improves TCP throughput between 3% and
> 20%.  For UDP I see 30% to 75% increase.

I have a new diff that avoids using netstack in sowakeup.  When
splicing, each PCB must have a flowid.  We can use that to find a
suitable softnet for the splicing task.

In my setup throughput increases from 30 Gbit to 40 Gbit per second.

ok?

bluhm

Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.385 uipc_socket.c
--- kern/uipc_socket.c	25 Jul 2025 08:58:44 -0000	1.385
+++ kern/uipc_socket.c	17 Sep 2025 22:21:19 -0000
@@ -52,6 +52,9 @@
 #include <sys/time.h>
 #include <sys/refcnt.h>
 
+#include <net/if.h>
+#include <net/if_var.h>
+
 #ifdef DDB
 #include <machine/db_machdep.h>
 #endif
@@ -117,14 +120,13 @@ int	sominconn = SOMINCONN;
 struct pool socket_pool;
 #ifdef SOCKET_SPLICE
 struct pool sosplice_pool;
-struct taskq *sosplice_taskq;
-struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
 
 #define so_splicelen	so_sp->ssp_len
 #define so_splicemax	so_sp->ssp_max
 #define so_spliceidletv	so_sp->ssp_idletv
 #define so_spliceidleto	so_sp->ssp_idleto
 #define so_splicetask	so_sp->ssp_task
+#define so_splicequeue	so_sp->ssp_queue
 #endif
 
 void
@@ -433,6 +435,7 @@ discard:
 #ifdef SOCKET_SPLICE
 	if (so->so_sp) {
 		struct socket *soback;
+		struct taskq *spq, *spqback;
 
 		sounlock_shared(so);
 		/*
@@ -461,7 +464,6 @@ discard:
 			sounsplice(so->so_sp->ssp_soback, so, freeing);
 		}
 		sbunlock(&soback->so_rcv);
-		sorele(soback);
 
 notsplicedback:
 		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
@@ -474,9 +476,16 @@ notsplicedback:
 		}
 		sbunlock(&so->so_rcv);
 
-		timeout_del_barrier(&so->so_spliceidleto);
-		task_del(sosplice_taskq, &so->so_splicetask);
-		taskq_barrier(sosplice_taskq);
+		timeout_barrier(&so->so_spliceidleto);
+		spq = READ_ONCE(so->so_splicequeue);
+		if (spq != NULL)
+			taskq_barrier(spq);
+		if (soback != NULL) {
+			spqback = READ_ONCE(soback->so_splicequeue);
+			if (spqback != NULL && spqback != spq)
+				taskq_barrier(spqback);
+			sorele(soback);
+		}
 
 		solock_shared(so);
 	}
@@ -1290,7 +1299,6 @@ sosplice(struct socket *so, int fd, off_
 {
 	struct file	*fp;
 	struct socket	*sosp;
-	struct taskq	*tq;
 	int		 error = 0;
 
 	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
@@ -1312,25 +1320,6 @@ sosplice(struct socket *so, int fd, off_
 		return (error);
 	}
 
-	if (sosplice_taskq == NULL) {
-		rw_enter_write(&sosplice_lock);
-		if (sosplice_taskq == NULL) {
-			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
-			    TASKQ_MPSAFE);
-			if (tq == NULL) {
-				rw_exit_write(&sosplice_lock);
-				return (ENOMEM);
-			}
-			/* Ensure the taskq is fully visible to other CPUs. */
-			membar_producer();
-			sosplice_taskq = tq;
-		}
-		rw_exit_write(&sosplice_lock);
-	} else {
-		/* Ensure the taskq is fully visible on this CPU. */
-		membar_consumer();
-	}
-
 	/* Find sosp, the drain socket where data will be spliced into. */
 	if ((error = getsock(curproc, fd, &fp)) != 0)
 		return (error);
@@ -1435,15 +1424,15 @@ sounsplice(struct socket *so, struct soc
 	mtx_enter(&sosp->so_snd.sb_mtx);
 	so->so_rcv.sb_flags &= ~SB_SPLICE;
 	sosp->so_snd.sb_flags &= ~SB_SPLICE;
+	timeout_del(&so->so_spliceidleto);
+	if (so->so_splicequeue != NULL)
+		task_del(so->so_splicequeue, &so->so_splicetask);
 	KASSERT(so->so_sp->ssp_socket == sosp);
 	KASSERT(sosp->so_sp->ssp_soback == so);
 	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
 	mtx_leave(&sosp->so_snd.sb_mtx);
 	mtx_leave(&so->so_rcv.sb_mtx);
 
-	task_del(sosplice_taskq, &so->so_splicetask);
-	timeout_del(&so->so_spliceidleto);
-
 	/* Do not wakeup a socket that is about to be freed. */
 	if ((freeing & SOSP_FREEING_READ) == 0) {
 		int readable;
@@ -1484,20 +1473,11 @@ void
 sotask(void *arg)
 {
 	struct socket *so = arg;
-	int doyield = 0;
 
 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
-	if (so->so_rcv.sb_flags & SB_SPLICE) {
-		if (so->so_proto->pr_flags & PR_WANTRCVD)
-			doyield = 1;
+	if (so->so_rcv.sb_flags & SB_SPLICE)
 		somove(so, M_DONTWAIT);
-	}
 	sbunlock(&so->so_rcv);
-
-	if (doyield) {
-		/* Avoid user land starvation. */
-		yield();
-	}
 }
 
 /*
@@ -1858,8 +1838,11 @@ sorwakeup(struct socket *so)
 #ifdef SOCKET_SPLICE
 	if (so->so_proto->pr_flags & PR_SPLICE) {
 		mtx_enter(&so->so_rcv.sb_mtx);
-		if (so->so_rcv.sb_flags & SB_SPLICE)
-			task_add(sosplice_taskq, &so->so_splicetask);
+		if (so->so_rcv.sb_flags & SB_SPLICE) {
+			atomic_cas_ptr(&so->so_splicequeue, NULL,
+			    net_tq(pru_flowid(so)));
+			task_add(so->so_splicequeue, &so->so_splicetask);
+		}
 		if (isspliced(so)) {
 			mtx_leave(&so->so_rcv.sb_mtx);
 			return;
@@ -1878,9 +1861,14 @@ sowwakeup(struct socket *so)
 #ifdef SOCKET_SPLICE
 	if (so->so_proto->pr_flags & PR_SPLICE) {
 		mtx_enter(&so->so_snd.sb_mtx);
-		if (so->so_snd.sb_flags & SB_SPLICE)
-			task_add(sosplice_taskq,
-			    &so->so_sp->ssp_soback->so_splicetask);
+		if (so->so_snd.sb_flags & SB_SPLICE) {
+			struct socket *soback = so->so_sp->ssp_soback;
+
+			atomic_cas_ptr(&soback->so_splicequeue, NULL,
+			    net_tq(pru_flowid(soback)));
+			task_add(soback->so_splicequeue,
+			    &soback->so_splicetask);
+		}
 		if (issplicedback(so)) {
 			mtx_leave(&so->so_snd.sb_mtx);
 			return;
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.320 in_pcb.c
--- netinet/in_pcb.c	14 Jul 2025 21:53:46 -0000	1.320
+++ netinet/in_pcb.c	17 Sep 2025 22:02:42 -0000
@@ -779,6 +779,12 @@ in_peeraddr(struct socket *so, struct mb
 	return (0);
 }
 
+int
+in_flowid(struct socket *so)
+{
+	return (sotoinpcb(so)->inp_flowid);
+}
+
 /*
  * Pass some notification to all connections of a protocol
  * associated with address dst.  The "usual action" will be
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.171 in_pcb.h
--- netinet/in_pcb.h	14 Jul 2025 09:01:52 -0000	1.171
+++ netinet/in_pcb.h	17 Sep 2025 22:02:42 -0000
@@ -357,6 +357,7 @@ void	 in_setpeeraddr(struct inpcb *, str
 void	 in_setsockaddr(struct inpcb *, struct mbuf *);
 int	 in_sockaddr(struct socket *, struct mbuf *);
 int	 in_peeraddr(struct socket *, struct mbuf *);
+int	 in_flowid(struct socket *);
 int	 in_baddynamic(u_int16_t, u_int16_t);
 int	 in_rootonly(u_int16_t, u_int16_t);
 int	 in_pcbselsrc(struct in_addr *, const struct sockaddr_in *,
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
diff -u -p -r1.252 tcp_usrreq.c
--- netinet/tcp_usrreq.c	8 Jul 2025 00:47:41 -0000	1.252
+++ netinet/tcp_usrreq.c	17 Sep 2025 22:02:42 -0000
@@ -131,6 +131,7 @@ const struct pr_usrreqs tcp_usrreqs = {
 	.pru_control	= in_control,
 	.pru_sockaddr	= tcp_sockaddr,
 	.pru_peeraddr	= tcp_peeraddr,
+	.pru_flowid	= in_flowid,
 };
 
 #ifdef INET6
@@ -152,6 +153,7 @@ const struct pr_usrreqs tcp6_usrreqs = {
 	.pru_control	= in6_control,
 	.pru_sockaddr	= tcp_sockaddr,
 	.pru_peeraddr	= tcp_peeraddr,
+	.pru_flowid	= in_flowid,
 };
 #endif
 
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
diff -u -p -r1.349 udp_usrreq.c
--- netinet/udp_usrreq.c	18 Jul 2025 08:39:14 -0000	1.349
+++ netinet/udp_usrreq.c	17 Sep 2025 22:02:42 -0000
@@ -136,6 +136,7 @@ const struct pr_usrreqs udp_usrreqs = {
 	.pru_control	= in_control,
 	.pru_sockaddr	= in_sockaddr,
 	.pru_peeraddr	= in_peeraddr,
+	.pru_flowid	= in_flowid,
 };
 
 #ifdef INET6
@@ -150,6 +151,7 @@ const struct pr_usrreqs udp6_usrreqs = {
 	.pru_control	= in6_control,
 	.pru_sockaddr	= in6_sockaddr,
 	.pru_peeraddr	= in6_peeraddr,
+	.pru_flowid	= in_flowid,
 };
 #endif
 
Index: sys/protosw.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
diff -u -p -r1.72 protosw.h
--- sys/protosw.h	2 Mar 2025 21:28:32 -0000	1.72
+++ sys/protosw.h	17 Sep 2025 22:02:42 -0000
@@ -86,6 +86,7 @@ struct pr_usrreqs {
 		    struct mbuf *);
 	int	(*pru_sockaddr)(struct socket *, struct mbuf *);
 	int	(*pru_peeraddr)(struct socket *, struct mbuf *);
+	int	(*pru_flowid)(struct socket *);
 	int	(*pru_connect2)(struct socket *, struct socket *);
 };
 
@@ -392,6 +393,12 @@ static inline int
 pru_peeraddr(struct socket *so, struct mbuf *addr)
 {
 	return (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, addr);
+}
+
+static inline int
+pru_flowid(struct socket *so)
+{
+	return (*so->so_proto->pr_usrreqs->pru_flowid)(so);
 }
 
 static inline int
Index: sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.159 socketvar.h
--- sys/socketvar.h	25 Jul 2025 08:58:44 -0000	1.159
+++ sys/socketvar.h	17 Sep 2025 22:02:42 -0000
@@ -74,6 +74,7 @@ struct sosplice {
 	struct	timeval ssp_idletv;	/* [I] idle timeout */
 	struct	timeout ssp_idleto;
 	struct	task ssp_task;		/* task for somove */
+	struct	taskq *ssp_queue;	/* [a] softnet queue where we add */
 };
 
 /*