use softnet for socket splicing
On Fri, Jul 25, 2025 at 02:07:57PM +0200, Alexander Bluhm wrote:
> Hi,
>
> Currently socket splicing runs on one dedicated kernel thread. This
> design is from a time when softnet was still a soft interrupt.
>
> Now with multiple softnet threads, I want to retire the sosplice
> thread. Instead call sotask() with the softnet task queue. For
> that I have to pass the queue down in struct netstack. Basically
> sorwakeup() and sowwakeup() get an additional argument. If netstack
> and softnet are available, I use this specific task queue. Otherwise
> softnet thread 0 is sufficient. The hot path receiving packets
> will distribute them over all softnet threads.
>
> Keeping the same softnet means that we take a bunch of packets from
> the network driver, do input processing, and store them in socket
> buffers. Then the same thread handles the splicing task, calls
> somove() and does output processing. There is no concurrent locking
> or scheduling; ideally packets stay on the same CPU. Previously I
> had a yield() in sotask() to allow packets to accumulate. With the
> new design this is no longer necessary.
>
> As we run on the softnet task queue and add splice tasks there, task
> barriers would cause deadlocks. I replaced them with a reference
> count in task_add(), task_del(), and sotask().
>
> Depending on the test, it improves TCP throughput between 3% and
> 20%. For UDP I see a 30% to 75% increase.
I have a new diff that avoids using netstack in sowakeup(). When
splicing, each PCB must have a flowid. We can use that to pick a
suitable softnet task queue for the splicing task.
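For illustration, the selection boils down to mapping the flow ID
onto one of the softnet task queues. A minimal sketch of the idea,
not the actual net_tq() internals; softnet_taskqs[] and
SOFTNET_COUNT are hypothetical stand-ins for its internal state:

/*
 * Sketch only: pick a splicing taskq from the socket's flow ID.
 * net_tq() does the real mapping; softnet_taskqs[] and
 * SOFTNET_COUNT are placeholders for its internals.
 */
struct taskq *
splice_taskq(struct socket *so)
{
	unsigned int flowid = pru_flowid(so);

	return (softnet_taskqs[flowid % SOFTNET_COUNT]);
}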
In my setup throughput increases from 30 Gbit to 40 Gbit per second.
ok?
bluhm
Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.385 uipc_socket.c
--- kern/uipc_socket.c 25 Jul 2025 08:58:44 -0000 1.385
+++ kern/uipc_socket.c 17 Sep 2025 22:21:19 -0000
@@ -52,6 +52,9 @@
#include <sys/time.h>
#include <sys/refcnt.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
#ifdef DDB
#include <machine/db_machdep.h>
#endif
@@ -117,14 +120,13 @@ int sominconn = SOMINCONN;
struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
-struct taskq *sosplice_taskq;
-struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#define so_splicelen so_sp->ssp_len
#define so_splicemax so_sp->ssp_max
#define so_spliceidletv so_sp->ssp_idletv
#define so_spliceidleto so_sp->ssp_idleto
#define so_splicetask so_sp->ssp_task
+#define so_splicequeue so_sp->ssp_queue
#endif
void
@@ -433,6 +435,7 @@ discard:
#ifdef SOCKET_SPLICE
if (so->so_sp) {
struct socket *soback;
+ struct taskq *spq, *spqback;
sounlock_shared(so);
/*
@@ -461,7 +464,6 @@ discard:
sounsplice(so->so_sp->ssp_soback, so, freeing);
}
sbunlock(&soback->so_rcv);
- sorele(soback);
notsplicedback:
sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
@@ -474,9 +476,16 @@ notsplicedback:
}
sbunlock(&so->so_rcv);
- timeout_del_barrier(&so->so_spliceidleto);
- task_del(sosplice_taskq, &so->so_splicetask);
- taskq_barrier(sosplice_taskq);
+ timeout_barrier(&so->so_spliceidleto);
+ spq = READ_ONCE(so->so_splicequeue);
+ if (spq != NULL)
+ taskq_barrier(spq);
+ if (soback != NULL) {
+ spqback = READ_ONCE(soback->so_splicequeue);
+ if (spqback != NULL && spqback != spq)
+ taskq_barrier(spqback);
+ sorele(soback);
+ }
solock_shared(so);
}
@@ -1290,7 +1299,6 @@ sosplice(struct socket *so, int fd, off_
{
struct file *fp;
struct socket *sosp;
- struct taskq *tq;
int error = 0;
if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
@@ -1312,25 +1320,6 @@ sosplice(struct socket *so, int fd, off_
return (error);
}
- if (sosplice_taskq == NULL) {
- rw_enter_write(&sosplice_lock);
- if (sosplice_taskq == NULL) {
- tq = taskq_create("sosplice", 1, IPL_SOFTNET,
- TASKQ_MPSAFE);
- if (tq == NULL) {
- rw_exit_write(&sosplice_lock);
- return (ENOMEM);
- }
- /* Ensure the taskq is fully visible to other CPUs. */
- membar_producer();
- sosplice_taskq = tq;
- }
- rw_exit_write(&sosplice_lock);
- } else {
- /* Ensure the taskq is fully visible on this CPU. */
- membar_consumer();
- }
-
/* Find sosp, the drain socket where data will be spliced into. */
if ((error = getsock(curproc, fd, &fp)) != 0)
return (error);
@@ -1435,15 +1424,15 @@ sounsplice(struct socket *so, struct soc
mtx_enter(&sosp->so_snd.sb_mtx);
so->so_rcv.sb_flags &= ~SB_SPLICE;
sosp->so_snd.sb_flags &= ~SB_SPLICE;
+ timeout_del(&so->so_spliceidleto);
+ if (so->so_splicequeue != NULL)
+ task_del(so->so_splicequeue, &so->so_splicetask);
KASSERT(so->so_sp->ssp_socket == sosp);
KASSERT(sosp->so_sp->ssp_soback == so);
so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
mtx_leave(&sosp->so_snd.sb_mtx);
mtx_leave(&so->so_rcv.sb_mtx);
- task_del(sosplice_taskq, &so->so_splicetask);
- timeout_del(&so->so_spliceidleto);
-
/* Do not wakeup a socket that is about to be freed. */
if ((freeing & SOSP_FREEING_READ) == 0) {
int readable;
@@ -1484,20 +1473,11 @@ void
sotask(void *arg)
{
struct socket *so = arg;
- int doyield = 0;
sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
- if (so->so_rcv.sb_flags & SB_SPLICE) {
- if (so->so_proto->pr_flags & PR_WANTRCVD)
- doyield = 1;
+ if (so->so_rcv.sb_flags & SB_SPLICE)
somove(so, M_DONTWAIT);
- }
sbunlock(&so->so_rcv);
-
- if (doyield) {
- /* Avoid user land starvation. */
- yield();
- }
}
/*
@@ -1858,8 +1838,11 @@ sorwakeup(struct socket *so)
#ifdef SOCKET_SPLICE
if (so->so_proto->pr_flags & PR_SPLICE) {
mtx_enter(&so->so_rcv.sb_mtx);
- if (so->so_rcv.sb_flags & SB_SPLICE)
- task_add(sosplice_taskq, &so->so_splicetask);
+ if (so->so_rcv.sb_flags & SB_SPLICE) {
+ atomic_cas_ptr(&so->so_splicequeue, NULL,
+ net_tq(pru_flowid(so)));
+ task_add(so->so_splicequeue, &so->so_splicetask);
+ }
if (isspliced(so)) {
mtx_leave(&so->so_rcv.sb_mtx);
return;
@@ -1878,9 +1861,14 @@ sowwakeup(struct socket *so)
#ifdef SOCKET_SPLICE
if (so->so_proto->pr_flags & PR_SPLICE) {
mtx_enter(&so->so_snd.sb_mtx);
- if (so->so_snd.sb_flags & SB_SPLICE)
- task_add(sosplice_taskq,
- &so->so_sp->ssp_soback->so_splicetask);
+ if (so->so_snd.sb_flags & SB_SPLICE) {
+ struct socket *soback = so->so_sp->ssp_soback;
+
+ atomic_cas_ptr(&soback->so_splicequeue, NULL,
+ net_tq(pru_flowid(soback)));
+ task_add(soback->so_splicequeue,
+ &soback->so_splicetask);
+ }
if (issplicedback(so)) {
mtx_leave(&so->so_snd.sb_mtx);
return;
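The wakeup paths above bind so_splicequeue lazily: the first wakeup
publishes a queue with a compare-and-swap, later wakeups reuse it,
and close/unsplice only has to barrier the queue that was actually
used. A standalone sketch of that set-once pattern; the helper
splice_bind_queue() is hypothetical, the diff inlines this logic in
sorwakeup() and sowwakeup():

/*
 * Set-once binding: the CAS only succeeds while the pointer is
 * still NULL, so concurrent wakeups agree on a single queue.
 */
static struct taskq *
splice_bind_queue(struct socket *so)
{
	atomic_cas_ptr(&so->so_splicequeue, NULL,
	    net_tq(pru_flowid(so)));
	return (READ_ONCE(so->so_splicequeue));
}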
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.320 in_pcb.c
--- netinet/in_pcb.c 14 Jul 2025 21:53:46 -0000 1.320
+++ netinet/in_pcb.c 17 Sep 2025 22:02:42 -0000
@@ -779,6 +779,12 @@ in_peeraddr(struct socket *so, struct mb
return (0);
}
+int
+in_flowid(struct socket *so)
+{
+ return (sotoinpcb(so)->inp_flowid);
+}
+
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The "usual action" will be
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.171 in_pcb.h
--- netinet/in_pcb.h 14 Jul 2025 09:01:52 -0000 1.171
+++ netinet/in_pcb.h 17 Sep 2025 22:02:42 -0000
@@ -357,6 +357,7 @@ void in_setpeeraddr(struct inpcb *, str
void in_setsockaddr(struct inpcb *, struct mbuf *);
int in_sockaddr(struct socket *, struct mbuf *);
int in_peeraddr(struct socket *, struct mbuf *);
+int in_flowid(struct socket *);
int in_baddynamic(u_int16_t, u_int16_t);
int in_rootonly(u_int16_t, u_int16_t);
int in_pcbselsrc(struct in_addr *, const struct sockaddr_in *,
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
diff -u -p -r1.252 tcp_usrreq.c
--- netinet/tcp_usrreq.c 8 Jul 2025 00:47:41 -0000 1.252
+++ netinet/tcp_usrreq.c 17 Sep 2025 22:02:42 -0000
@@ -131,6 +131,7 @@ const struct pr_usrreqs tcp_usrreqs = {
.pru_control = in_control,
.pru_sockaddr = tcp_sockaddr,
.pru_peeraddr = tcp_peeraddr,
+ .pru_flowid = in_flowid,
};
#ifdef INET6
@@ -152,6 +153,7 @@ const struct pr_usrreqs tcp6_usrreqs = {
.pru_control = in6_control,
.pru_sockaddr = tcp_sockaddr,
.pru_peeraddr = tcp_peeraddr,
+ .pru_flowid = in_flowid,
};
#endif
Index: netinet/udp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
diff -u -p -r1.349 udp_usrreq.c
--- netinet/udp_usrreq.c 18 Jul 2025 08:39:14 -0000 1.349
+++ netinet/udp_usrreq.c 17 Sep 2025 22:02:42 -0000
@@ -136,6 +136,7 @@ const struct pr_usrreqs udp_usrreqs = {
.pru_control = in_control,
.pru_sockaddr = in_sockaddr,
.pru_peeraddr = in_peeraddr,
+ .pru_flowid = in_flowid,
};
#ifdef INET6
@@ -150,6 +151,7 @@ const struct pr_usrreqs udp6_usrreqs = {
.pru_control = in6_control,
.pru_sockaddr = in6_sockaddr,
.pru_peeraddr = in6_peeraddr,
+ .pru_flowid = in_flowid,
};
#endif
Index: sys/protosw.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
diff -u -p -r1.72 protosw.h
--- sys/protosw.h 2 Mar 2025 21:28:32 -0000 1.72
+++ sys/protosw.h 17 Sep 2025 22:02:42 -0000
@@ -86,6 +86,7 @@ struct pr_usrreqs {
struct mbuf *);
int (*pru_sockaddr)(struct socket *, struct mbuf *);
int (*pru_peeraddr)(struct socket *, struct mbuf *);
+ int (*pru_flowid)(struct socket *);
int (*pru_connect2)(struct socket *, struct socket *);
};
@@ -392,6 +393,12 @@ static inline int
pru_peeraddr(struct socket *so, struct mbuf *addr)
{
return (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, addr);
+}
+
+static inline int
+pru_flowid(struct socket *so)
+{
+ return (*so->so_proto->pr_usrreqs->pru_flowid)(so);
}
static inline int
Index: sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.159 socketvar.h
--- sys/socketvar.h 25 Jul 2025 08:58:44 -0000 1.159
+++ sys/socketvar.h 17 Sep 2025 22:02:42 -0000
@@ -74,6 +74,7 @@ struct sosplice {
struct timeval ssp_idletv; /* [I] idle timeout */
struct timeout ssp_idleto;
struct task ssp_task; /* task for somove */
+ struct taskq *ssp_queue; /* [a] softnet queue where we add */
};
/*