Download raw body.
running TCP input in parallel
Hi,
To run tcp_input() in parallel efficently, we have to lock the
socket in a smart way. I have measured multiple variants.
http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/perform.html
The relevant TCP graph is here.
http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/gnuplot/tcp.html
http://bluhm.genua.de/perform/results/2025-04-16T09:33:58Z/gnuplot/tcp6.html
First column (left) is no locking at all, just exclusive net lock.
Third column is socket lock in addition to exclusive net lock. You
see 6% degradation due to locking overhead. This has already been
commited.
Fourth column (right) is simple switch from exclusive net lock to
shared net lock and relying on socket lock. Grabbing the socket
lock for each packet is expensive. Especially tcp_input() and
soreceive() are fighting for the lock. Single stream performance
goes down by 25%, but multi stream goes up by 140%.
The second column contains the diff below. The idea is that
tcp_input() moves all TCP packets from softnet input queue into TCP
input queue. This queue has storage per softnet thread and can be
accessed without lock. After running all protocol input functions,
but in the same shared netlock context, process the TCP input queue.
tcp_input_mlist() keeps a pointer to the current socket.
tcp_input_solocked() tries to keep the lock on the socket. If
consecutive TCP packets belong to the same socket, the lock is not
released. Only when the TCP stream changes, we unlock the old and
lock a new socket. This batch processing of locked sockets gives
5% increase in single stream and 160% for multi stream throughput.
I have several positive test reports.
ok?
bluhm
Index: net/if.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
diff -u -p -r1.729 if.c
--- net/if.c 19 Mar 2025 23:29:49 -0000 1.729
+++ net/if.c 16 Apr 2025 14:49:20 -0000
@@ -1001,10 +1001,21 @@ if_input_process(struct ifnet *ifp, stru
*/
sn = net_sn(idx);
+ ml_init(&sn->sn_netstack.ns_tcp_ml);
+#ifdef INET6
+ ml_init(&sn->sn_netstack.ns_tcp6_ml);
+#endif
NET_LOCK_SHARED();
+
while ((m = ml_dequeue(ml)) != NULL)
(*ifp->if_input)(ifp, m, &sn->sn_netstack);
+
+ tcp_input_mlist(&sn->sn_netstack.ns_tcp_ml, AF_INET);
+#ifdef INET6
+ tcp_input_mlist(&sn->sn_netstack.ns_tcp6_ml, AF_INET6);
+#endif
+
NET_UNLOCK_SHARED();
}
Index: net/if_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
diff -u -p -r1.136 if_var.h
--- net/if_var.h 2 Mar 2025 21:28:32 -0000 1.136
+++ net/if_var.h 16 Apr 2025 14:49:20 -0000
@@ -92,7 +92,9 @@ struct task;
struct cpumem;
struct netstack {
- struct route ns_route;
+ struct route ns_route;
+ struct mbuf_list ns_tcp_ml;
+ struct mbuf_list ns_tcp6_ml;
};
/*
Index: netinet/in_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
diff -u -p -r1.121 in_proto.c
--- netinet/in_proto.c 5 Jan 2025 12:36:48 -0000 1.121
+++ netinet/in_proto.c 16 Apr 2025 14:49:20 -0000
@@ -197,7 +197,8 @@ const struct protosw inetsw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_TCP,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|
+ PR_MPINPUT,
.pr_input = tcp_input,
.pr_ctlinput = tcp_ctlinput,
.pr_ctloutput = tcp_ctloutput,
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.435 tcp_input.c
--- netinet/tcp_input.c 16 Apr 2025 12:51:11 -0000 1.435
+++ netinet/tcp_input.c 16 Apr 2025 14:49:20 -0000
@@ -100,9 +100,6 @@
#include <net/pfvar.h>
#endif
-int tcp_mss_adv(struct rtentry *, int);
-int tcp_flush_queue(struct tcpcb *);
-
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
@@ -177,6 +174,9 @@ do { \
if_put(ifp); \
} while (0)
+int tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **);
+int tcp_mss_adv(struct rtentry *, int);
+int tcp_flush_queue(struct tcpcb *);
void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
@@ -347,12 +347,53 @@ tcp_flush_queue(struct tcpcb *tp)
return (flags);
}
+int
+tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
+{
+ if (ns == NULL)
+ return tcp_input_solocked(mp, offp, proto, af, NULL);
+ (*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp);
+ switch (af) {
+ case AF_INET:
+ ml_enqueue(&ns->ns_tcp_ml, *mp);
+ break;
+#ifdef INET6
+ case AF_INET6:
+ ml_enqueue(&ns->ns_tcp6_ml, *mp);
+ break;
+#endif
+ default:
+ m_freemp(mp);
+ }
+ *mp = NULL;
+ return IPPROTO_DONE;
+}
+
+void
+tcp_input_mlist(struct mbuf_list *ml, int af)
+{
+ struct socket *so = NULL;
+ struct mbuf *m;
+
+ while ((m = ml_dequeue(ml)) != NULL) {
+ int off, nxt;
+
+ off = (long)m->m_pkthdr.ph_cookie;
+ m->m_pkthdr.ph_cookie = NULL;
+ nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so);
+ KASSERT(nxt == IPPROTO_DONE);
+ }
+
+ in_pcbsounlock_rele(NULL, so);
+}
+
/*
* TCP input routine, follows pages 65-76 of the
* protocol specification dated September, 1981 very closely.
*/
int
-tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
+tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af,
+ struct socket ** solocked)
{
struct mbuf *m = *mp;
int iphlen = *offp;
@@ -603,7 +644,21 @@ findpcb:
tcpstat_inc(tcps_noport);
goto dropwithreset_ratelim;
}
- so = in_pcbsolock_ref(inp);
+ /*
+ * Avoid needless lock and unlock operation when handling multiple
+ * TCP packets from the same stream consecutively.
+ */
+ if (solocked != NULL && *solocked != NULL &&
+ sotoinpcb(*solocked) == inp) {
+ so = *solocked;
+ *solocked = NULL;
+ } else {
+ if (solocked != NULL && *solocked != NULL) {
+ in_pcbsounlock_rele(NULL, *solocked);
+ *solocked = NULL;
+ }
+ so = in_pcbsolock_ref(inp);
+ }
if (so == NULL) {
tcpstat_inc(tcps_noport);
goto dropwithreset_ratelim;
@@ -846,7 +901,10 @@ findpcb:
tcpstat_inc(tcps_dropsyn);
goto drop;
}
- in_pcbsounlock_rele(inp, so);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -1022,7 +1080,10 @@ findpcb:
if (so->so_snd.sb_cc ||
tp->t_flags & TF_NEEDOUTPUT)
(void) tcp_output(tp);
- in_pcbsounlock_rele(inp, so);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -1073,7 +1134,10 @@ findpcb:
tp->t_flags &= ~TF_BLOCKOUTPUT;
if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
- in_pcbsounlock_rele(inp, so);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
}
@@ -2073,7 +2137,10 @@ dodata: /* XXX */
*/
if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
- in_pcbsounlock_rele(inp, so);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
@@ -2103,7 +2170,10 @@ dropafterack:
m_freem(m);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
- in_pcbsounlock_rele(inp, so);
+ if (solocked != NULL)
+ *solocked = so;
+ else
+ in_pcbsounlock_rele(inp, so);
in_pcbunref(inp);
return IPPROTO_DONE;
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.186 tcp_var.h
--- netinet/tcp_var.h 2 Mar 2025 21:28:32 -0000 1.186
+++ netinet/tcp_var.h 16 Apr 2025 14:49:20 -0000
@@ -718,6 +718,7 @@ int tcp_dooptions(struct tcpcb *, u_cha
struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t);
void tcp_init(void);
int tcp_input(struct mbuf **, int *, int, int, struct netstack *);
+void tcp_input_mlist(struct mbuf_list *, int);
int tcp_mss(struct tcpcb *, int);
void tcp_mss_update(struct tcpcb *);
u_int tcp_hdrsz(struct tcpcb *);
Index: netinet6/in6_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v
diff -u -p -r1.124 in6_proto.c
--- netinet6/in6_proto.c 5 Jan 2025 12:36:48 -0000 1.124
+++ netinet6/in6_proto.c 16 Apr 2025 14:49:20 -0000
@@ -147,7 +147,8 @@ const struct protosw inet6sw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_TCP,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|
+ PR_MPINPUT,
.pr_input = tcp_input,
.pr_ctlinput = tcp6_ctlinput,
.pr_ctloutput = tcp_ctloutput,
running TCP input in parallel