Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
TCP input socket lock caching
To:
tech@openbsd.org
Date:
Tue, 6 May 2025 22:00:30 +0200

Download raw body.

Thread
Hi,

Parallel TCP input is running for a few days now and looks quite
stable.

Final step is to implement caching of the socket lock.  Without
large receive offloading (LRO) in the driver layer, it is very
likely that consecutive TCP segments are in the input queue.  This
leads to contention of the socket lock between TCP input and socket
receive syscall from userland.

http://bluhm.genua.de/perform/results/2025-05-05T00%3A52%3A28Z/2025-05-05T00%3A00%3A00Z/btrace/iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0345%3A%3A35_-w1m_-t10_-R-btrace-kstack.0.svg?s=rw_do_enter

With the diff below, ip_deliver() moves all TCP packets that are
in the softnet queue temporarily to a TCP queue.  This queue is per
softnet thread so no locking is needed.  Finally in the same shared
netlock context, tcp_input_mlist() processes all TCP packets.  It
keeps a pointer to the socket lock.  tcp_input_solocked() switches
the lock only when the TCP stream changes.  A bunch of packets are
processed and placed into the socket receive buffer under the same
lock.  Then soreceive() can copy huge chunks to userland.  The
contention of the socket lock is gone.

http://bluhm.genua.de/perform/results/2025-05-05T00%3A52%3A28Z/patch-sys-tcp-input-parallel.0/btrace/iperf3_-6_-cfdd7%3Ae83e%3A66bc%3A0345%3A%3A35_-w1m_-t10_-R-btrace-kstack.0.svg?s=rw_do_enter

On a 4 core machine I see between 12% and 22% improvement with 10
parallel TCP streams.  When testing only with a single TCP stream,
throughput increases between 38% and 100%.

I have sent out this diff before as part of a larger unlocking effort.

ok?

bluhm

Index: net/if.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
diff -u -p -r1.731 if.c
--- net/if.c	1 May 2025 11:19:46 -0000	1.731
+++ net/if.c	6 May 2025 19:21:02 -0000
@@ -999,10 +999,21 @@ if_input_process(struct ifnet *ifp, stru
 	 */
 
 	sn = net_sn(idx);
+	ml_init(&sn->sn_netstack.ns_tcp_ml);
+#ifdef INET6
+	ml_init(&sn->sn_netstack.ns_tcp6_ml);
+#endif
 
 	NET_LOCK_SHARED();
+
 	while ((m = ml_dequeue(ml)) != NULL)
 		(*ifp->if_input)(ifp, m, &sn->sn_netstack);
+
+	tcp_input_mlist(&sn->sn_netstack.ns_tcp_ml, AF_INET);
+#ifdef INET6
+	tcp_input_mlist(&sn->sn_netstack.ns_tcp6_ml, AF_INET6);
+#endif
+
 	NET_UNLOCK_SHARED();
 }
 
Index: net/if_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_var.h,v
diff -u -p -r1.136 if_var.h
--- net/if_var.h	2 Mar 2025 21:28:32 -0000	1.136
+++ net/if_var.h	6 May 2025 19:21:02 -0000
@@ -92,7 +92,9 @@ struct task;
 struct cpumem;
 
 struct netstack {
-	struct route	ns_route;
+	struct route		ns_route;
+	struct mbuf_list	ns_tcp_ml;
+	struct mbuf_list	ns_tcp6_ml;
 };
 
 /*
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.443 tcp_input.c
--- netinet/tcp_input.c	4 May 2025 23:05:17 -0000	1.443
+++ netinet/tcp_input.c	6 May 2025 19:21:02 -0000
@@ -101,9 +101,6 @@
 #include <net/pfvar.h>
 #endif
 
-int tcp_mss_adv(struct rtentry *, int);
-int tcp_flush_queue(struct tcpcb *);
-
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
@@ -178,6 +175,9 @@ do { \
 	if_put(ifp); \
 } while (0)
 
+int	 tcp_input_solocked(struct mbuf **, int *, int, int, struct socket **);
+int	 tcp_mss_adv(struct rtentry *, int);
+int	 tcp_flush_queue(struct tcpcb *);
 void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void	 tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
 
@@ -348,12 +348,53 @@ tcp_flush_queue(struct tcpcb *tp)
 	return (flags);
 }
 
+int
+tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
+{
+	if (ns == NULL)
+		return tcp_input_solocked(mp, offp, proto, af, NULL);
+	(*mp)->m_pkthdr.ph_cookie = (void *)(long)(*offp);
+	switch (af) {
+	case AF_INET:
+		ml_enqueue(&ns->ns_tcp_ml, *mp);
+		break;
+#ifdef INET6
+	case AF_INET6:
+		ml_enqueue(&ns->ns_tcp6_ml, *mp);
+		break;
+#endif
+	default:
+		m_freemp(mp);
+	}
+	*mp = NULL;
+	return IPPROTO_DONE;
+}
+
+void
+tcp_input_mlist(struct mbuf_list *ml, int af)
+{
+	struct socket *so = NULL;
+	struct mbuf *m;
+
+	while ((m = ml_dequeue(ml)) != NULL) {
+		int off, nxt;
+
+		off = (long)m->m_pkthdr.ph_cookie;
+		m->m_pkthdr.ph_cookie = NULL;
+		nxt = tcp_input_solocked(&m, &off, IPPROTO_TCP, af, &so);
+		KASSERT(nxt == IPPROTO_DONE);
+	}
+
+	in_pcbsounlock_rele(NULL, so);
+}
+
 /*
  * TCP input routine, follows pages 65-76 of the
  * protocol specification dated September, 1981 very closely.
  */
 int
-tcp_input(struct mbuf **mp, int *offp, int proto, int af, struct netstack *ns)
+tcp_input_solocked(struct mbuf **mp, int *offp, int proto, int af,
+    struct socket **solocked)
 {
 	struct mbuf *m = *mp;
 	int iphlen = *offp;
@@ -604,7 +645,21 @@ findpcb:
 		tcpstat_inc(tcps_noport);
 		goto dropwithreset_ratelim;
 	}
-	so = in_pcbsolock_ref(inp);
+	/*
+	 * Avoid needless lock and unlock operation when handling multiple
+	 * TCP packets from the same stream consecutively.
+	 */
+	if (solocked != NULL && *solocked != NULL &&
+	    sotoinpcb(*solocked) == inp) {
+		so = *solocked;
+		*solocked = NULL;
+	} else {
+		if (solocked != NULL && *solocked != NULL) {
+			in_pcbsounlock_rele(NULL, *solocked);
+			*solocked = NULL;
+		}
+		so = in_pcbsolock_ref(inp);
+	}
 	if (so == NULL) {
 		tcpstat_inc(tcps_noport);
 		goto dropwithreset_ratelim;
@@ -847,7 +902,10 @@ findpcb:
 					tcpstat_inc(tcps_dropsyn);
 					goto drop;
 				}
-				in_pcbsounlock_rele(inp, so);
+				if (solocked != NULL)
+					*solocked = so;
+				else
+					in_pcbsounlock_rele(inp, so);
 				in_pcbunref(inp);
 				return IPPROTO_DONE;
 			}
@@ -1023,7 +1081,10 @@ findpcb:
 				if (so->so_snd.sb_cc ||
 				    tp->t_flags & TF_NEEDOUTPUT)
 					(void) tcp_output(tp);
-				in_pcbsounlock_rele(inp, so);
+				if (solocked != NULL)
+					*solocked = so;
+				else
+					in_pcbsounlock_rele(inp, so);
 				in_pcbunref(inp);
 				return IPPROTO_DONE;
 			}
@@ -1074,7 +1135,10 @@ findpcb:
 			tp->t_flags &= ~TF_BLOCKOUTPUT;
 			if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
 				(void) tcp_output(tp);
-			in_pcbsounlock_rele(inp, so);
+			if (solocked != NULL)
+				*solocked = so;
+			else
+				in_pcbsounlock_rele(inp, so);
 			in_pcbunref(inp);
 			return IPPROTO_DONE;
 		}
@@ -2074,7 +2138,10 @@ dodata:							/* XXX */
 	 */
 	if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
 		(void) tcp_output(tp);
-	in_pcbsounlock_rele(inp, so);
+	if (solocked != NULL)
+		*solocked = so;
+	else
+		in_pcbsounlock_rele(inp, so);
 	in_pcbunref(inp);
 	return IPPROTO_DONE;
 
@@ -2104,7 +2171,10 @@ dropafterack:
 	m_freem(m);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
-	in_pcbsounlock_rele(inp, so);
+	if (solocked != NULL)
+		*solocked = so;
+	else
+		in_pcbsounlock_rele(inp, so);
 	in_pcbunref(inp);
 	return IPPROTO_DONE;
 
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.190 tcp_var.h
--- netinet/tcp_var.h	29 Apr 2025 20:31:42 -0000	1.190
+++ netinet/tcp_var.h	6 May 2025 19:21:02 -0000
@@ -718,6 +718,7 @@ int	 tcp_dooptions(struct tcpcb *, u_cha
 		struct mbuf *, int, struct tcp_opt_info *, u_int, uint64_t);
 void	 tcp_init(void);
 int	 tcp_input(struct mbuf **, int *, int, int, struct netstack *);
+void	 tcp_input_mlist(struct mbuf_list *, int);
 int	 tcp_mss(struct tcpcb *, int);
 void	 tcp_mss_update(struct tcpcb *);
 void	 tcp_softlro_glue(struct mbuf_list *, struct mbuf *, struct ifnet *);