Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
tcp receive unlock
To:
tech@openbsd.org
Date:
Sun, 10 Nov 2024 14:12:24 +0100

Download raw body.

Thread
  • Alexander Bluhm:

    tcp receive unlock

Hi,

This diff contains everything I think is needed to access the socket
buffer from the read/write system calls without the exclusive netlock.

The idea is to hold the socket lock while running tcp_output().

Please test.  Don't expect a huge performance increase, as TCP input
is still exclusively locked.  But TCP syscalls should behave a bit
more smoothly.

bluhm

Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.345 uipc_socket.c
--- kern/uipc_socket.c	8 Nov 2024 21:47:03 -0000	1.345
+++ kern/uipc_socket.c	10 Nov 2024 12:40:45 -0000
@@ -924,9 +924,11 @@ soreceive(struct socket *so, struct mbuf
 		flags = 0;
 	if (flags & MSG_OOB) {
 		m = m_get(M_WAIT, MT_DATA);
-		solock(so);
+		if (dosolock)
+			solock_shared(so);
 		error = pru_rcvoob(so, m, flags & MSG_PEEK);
-		sounlock(so);
+		if (dosolock)
+			sounlock_shared(so);
 		if (error)
 			goto bad;
 		do {
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.159 in_pcb.h
--- netinet/in_pcb.h	5 Nov 2024 10:49:23 -0000	1.159
+++ netinet/in_pcb.h	10 Nov 2024 12:40:45 -0000
@@ -140,7 +140,7 @@ struct inpcb {
 	u_int16_t inp_lport;		/* [t] local port */
 	struct	  socket *inp_socket;	/* [I] back pointer to socket */
 	caddr_t	  inp_ppcb;		/* pointer to per-protocol pcb */
-	struct    route inp_route;	/* cached route */
+	struct    route inp_route;	/* [s] cached route */
 	struct    refcnt inp_refcnt;	/* refcount PCB, delay memory free */
 	int	  inp_flags;		/* generic IP/datagram flags */
 	union {				/* Header prototype. */
Index: netinet/in_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
diff -u -p -r1.113 in_proto.c
--- netinet/in_proto.c	22 Aug 2024 10:58:31 -0000	1.113
+++ netinet/in_proto.c	10 Nov 2024 12:40:45 -0000
@@ -197,7 +197,7 @@ const struct protosw inetsw[] = {
   .pr_type	= SOCK_STREAM,
   .pr_domain	= &inetdomain,
   .pr_protocol	= IPPROTO_TCP,
-  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|PR_MPSOCKET,
   .pr_input	= tcp_input,
   .pr_ctlinput	= tcp_ctlinput,
   .pr_ctloutput	= tcp_ctloutput,
Index: netinet/ip_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
diff -u -p -r1.401 ip_input.c
--- netinet/ip_input.c	6 Aug 2024 16:56:09 -0000	1.401
+++ netinet/ip_input.c	10 Nov 2024 12:40:45 -0000
@@ -83,23 +83,16 @@
 #include <netinet/ip_carp.h>
 #endif
 
-/*
- * Locks used to protect global variables in this file:
- *	I	immutable after creation
- *	a	atomic operations
- *	N	net lock
- */
-
 /* values controllable via sysctl */
-int	ip_forwarding = 0;			/* [a] */
+int	ip_forwarding = 0;
 int	ipmforwarding = 0;
 int	ipmultipath = 0;
-int	ip_sendredirects = 1;			/* [a] */
+int	ip_sendredirects = 1;
 int	ip_dosourceroute = 0;
 int	ip_defttl = IPDEFTTL;
 int	ip_mtudisc = 1;
 int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
-int	ip_directedbcast = 0;			/* [a] */
+int	ip_directedbcast = 0;
 
 /* Protects `ipq' and `ip_frags'. */
 struct mutex	ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);
Index: netinet/ip_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
diff -u -p -r1.120 ip_var.h
--- netinet/ip_var.h	12 Jul 2024 19:50:35 -0000	1.120
+++ netinet/ip_var.h	10 Nov 2024 12:40:45 -0000
@@ -36,6 +36,13 @@
 #define _NETINET_IP_VAR_H_
 
 /*
+ * Locks used to protect global variables in this file:
+ *	I	immutable after creation
+ *	a	atomic operations
+ *	N	net lock
+ */
+
+/*
  * Structure stored in mbuf in inpcb.ip_options
  * and passed to ip_output when ip options are in use.
  * The actual length of the options (including ipopt_dst)
@@ -216,19 +223,20 @@ extern int ip_defttl;			/* default IP tt
 
 #define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
 
-extern int ip_mtudisc;			/* mtu discovery */
+extern int ip_mtudisc;			/* [N] mtu discovery */
 extern int ip_mtudisc_timeout;		/* seconds to timeout mtu discovery */
 
 extern int ipport_firstauto;		/* min port for port allocation */
 extern int ipport_lastauto;		/* max port for port allocation */
 extern int ipport_hifirstauto;		/* min dynamic/private port number */
 extern int ipport_hilastauto;		/* max dynamic/private port number */
-extern int ip_forwarding;		/* enable IP forwarding */
+extern int ip_forwarding;		/* [a] enable IP forwarding */
 #ifdef MROUTING
 extern int ipmforwarding;		/* enable multicast forwarding */
 #endif
 extern int ipmultipath;			/* enable multipath routing */
-extern int ip_directedbcast;		/* accept all broadcast packets */
+extern int ip_sendredirects;		/* [a] send icmp redirect while forwd */
+extern int ip_directedbcast;		/* [a] accept all broadcast packets */
 extern unsigned int la_hold_total;
 
 extern const struct pr_usrreqs rip_usrreqs;
Index: netinet/tcp_fsm.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_fsm.h,v
diff -u -p -r1.9 tcp_fsm.h
--- netinet/tcp_fsm.h	5 Feb 2018 14:53:26 -0000	1.9
+++ netinet/tcp_fsm.h	10 Nov 2024 12:40:45 -0000
@@ -68,7 +68,7 @@
  * determined by state, with the proviso that TH_FIN is sent only
  * if all data queued for output is included in the segment.
  */
-u_char	tcp_outflags[TCP_NSTATES] = {
+const u_char tcp_outflags[TCP_NSTATES] = {
     TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
     TH_ACK, TH_ACK,
     TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK,
@@ -76,7 +76,7 @@ u_char	tcp_outflags[TCP_NSTATES] = {
 #endif /* TCPOUTFLAGS */
 
 #ifdef	TCPSTATES
-const char *tcpstates[] = {
+const char *const tcpstates[] = {
 	"CLOSED",	"LISTEN",	"SYN_SENT",	"SYN_RCVD",
 	"ESTABLISHED",	"CLOSE_WAIT",	"FIN_WAIT_1",	"CLOSING",
 	"LAST_ACK",	"FIN_WAIT_2",	"TIME_WAIT",
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.408 tcp_input.c
--- netinet/tcp_input.c	8 Nov 2024 21:40:39 -0000	1.408
+++ netinet/tcp_input.c	10 Nov 2024 12:40:45 -0000
@@ -2790,11 +2790,11 @@ tcp_xmit_timer(struct tcpcb *tp, int32_t
 int
 tcp_mss(struct tcpcb *tp, int offer)
 {
+	struct inpcb *inp;
 	struct rtentry *rt;
 	struct ifnet *ifp = NULL;
-	int mss, mssopt;
-	int iphlen;
-	struct inpcb *inp;
+	int mss, mssopt, iphlen, do_rfc3390;
+	u_int rtmtu;
 
 	inp = tp->t_inpcb;
 
@@ -2827,12 +2827,13 @@ tcp_mss(struct tcpcb *tp, int offer)
 	 * if there's an mtu associated with the route and we support
 	 * path MTU discovery for the underlying protocol family, use it.
 	 */
-	if (rt->rt_mtu) {
+	rtmtu = atomic_load_int(&rt->rt_mtu);
+	if (rtmtu) {
 		/*
 		 * One may wish to lower MSS to take into account options,
 		 * especially security-related options.
 		 */
-		if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
+		if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph: if path MTU is
 			 * smaller than 1280, use 1280 as packet size and
@@ -2841,8 +2842,7 @@ tcp_mss(struct tcpcb *tp, int offer)
 			mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
 			    sizeof(struct tcphdr);
 		} else {
-			mss = rt->rt_mtu - iphlen -
-			    sizeof(struct tcphdr);
+			mss = rtmtu - iphlen - sizeof(struct tcphdr);
 		}
 	} else if (ifp->if_flags & IFF_LOOPBACK) {
 		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
@@ -2902,6 +2902,7 @@ tcp_mss(struct tcpcb *tp, int offer)
 		mss -= TCPOLEN_SIGLEN;
 #endif
 
+	do_rfc3390 = atomic_load_int(&tcp_do_rfc3390);
 	if (offer == -1) {
 		/* mss changed due to Path MTU discovery */
 		tp->t_flags &= ~TF_PMTUD_PEND;
@@ -2916,10 +2917,10 @@ tcp_mss(struct tcpcb *tp, int offer)
 			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
 			    mss, mss);
 		}
-	} else if (tcp_do_rfc3390 == 2) {
+	} else if (do_rfc3390 == 2) {
 		/* increase initial window  */
 		tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
-	} else if (tcp_do_rfc3390) {
+	} else if (do_rfc3390) {
 		/* increase initial window  */
 		tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
 	} else
@@ -2986,8 +2987,10 @@ tcp_mss_update(struct tcpcb *tp)
 	if (rt == NULL)
 		return;
 
+	mtx_enter(&so->so_snd.sb_mtx);
 	bufsize = so->so_snd.sb_hiwat;
 	if (bufsize < mss) {
+		mtx_leave(&so->so_snd.sb_mtx);
 		mss = bufsize;
 		/* Update t_maxseg and t_maxopd */
 		tcp_mss(tp, mss);
@@ -2996,8 +2999,10 @@ tcp_mss_update(struct tcpcb *tp)
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		(void)sbreserve(so, &so->so_snd, bufsize);
+		mtx_leave(&so->so_snd.sb_mtx);
 	}
 
+	mtx_enter(&so->so_rcv.sb_mtx);
 	bufsize = so->so_rcv.sb_hiwat;
 	if (bufsize > mss) {
 		bufsize = roundup(bufsize, mss);
@@ -3005,7 +3010,7 @@ tcp_mss_update(struct tcpcb *tp)
 			bufsize = sb_max;
 		(void)sbreserve(so, &so->so_rcv, bufsize);
 	}
-
+	mtx_leave(&so->so_rcv.sb_mtx);
 }
 
 /*
@@ -3052,32 +3057,36 @@ tcp_newreno_partialack(struct tcpcb *tp,
 int
 tcp_mss_adv(struct mbuf *m, int af)
 {
-	int mss = 0;
-	int iphlen;
-	struct ifnet *ifp = NULL;
+	struct ifnet *ifp;
+	int iphlen, mss, mssdflt;
 
-	if (m && (m->m_flags & M_PKTHDR))
-		ifp = if_get(m->m_pkthdr.ph_ifidx);
+	mssdflt = atomic_load_int(&tcp_mssdflt);
+
+	if (m == NULL || (m->m_flags & M_PKTHDR) == 0)
+		return mssdflt;
+
+	ifp = if_get(m->m_pkthdr.ph_ifidx);
+	if (ifp == NULL)
+		return mssdflt;
 
 	switch (af) {
 	case AF_INET:
-		if (ifp != NULL)
-			mss = ifp->if_mtu;
 		iphlen = sizeof(struct ip);
 		break;
 #ifdef INET6
 	case AF_INET6:
-		if (ifp != NULL)
-			mss = ifp->if_mtu;
 		iphlen = sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		unhandled_af(af);
 	}
+	mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 	if_put(ifp);
-	mss = mss - iphlen - sizeof(struct tcphdr);
-	return (max(mss, tcp_mssdflt));
+
+	if (mss < mssdflt)
+		return mssdflt;
+	return mss;
 }
 
 /*
Index: netinet/tcp_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
diff -u -p -r1.145 tcp_output.c
--- netinet/tcp_output.c	14 May 2024 09:39:02 -0000	1.145
+++ netinet/tcp_output.c	10 Nov 2024 12:40:45 -0000
@@ -103,8 +103,6 @@
 extern struct mbuf *m_copypack();
 #endif
 
-extern int tcprexmtthresh;
-
 #ifdef TCP_SACK_DEBUG
 void tcp_print_holes(struct tcpcb *tp);
 
@@ -350,7 +348,7 @@ again:
 	txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
 
 	if (len > txmaxseg) {
-		if (tcp_do_tso &&
+		if (atomic_load_int(&tcp_do_tso) &&
 		    tp->t_inpcb->inp_options == NULL &&
 		    tp->t_inpcb->inp_outputopts6 == NULL &&
 #ifdef TCP_SIGNATURE
Index: netinet/tcp_timer.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
diff -u -p -r1.76 tcp_timer.c
--- netinet/tcp_timer.c	28 Jan 2024 20:34:25 -0000	1.76
+++ netinet/tcp_timer.c	10 Nov 2024 12:40:45 -0000
@@ -167,10 +167,10 @@ tcp_canceltimers(struct tcpcb *tp)
 		TCP_TIMER_DISARM(tp, i);
 }
 
-int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
+const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
 
-int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
+const int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
 
 /*
  * TCP timer processing.
Index: netinet/tcp_timer.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
diff -u -p -r1.21 tcp_timer.h
--- netinet/tcp_timer.h	29 Jan 2024 22:47:13 -0000	1.21
+++ netinet/tcp_timer.h	10 Nov 2024 12:40:45 -0000
@@ -154,13 +154,12 @@ typedef void (*tcp_timer_func_t)(void *)
 extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
 
 extern int tcp_delack_msecs;		/* delayed ACK timeout in millisecs */
-extern int tcptv_keep_init;
 extern int tcp_always_keepalive;	/* assume SO_KEEPALIVE is always set */
 extern int tcp_keepidle;		/* time before keepalive probes begin */
 extern int tcp_keepintvl;		/* time between keepalive probes */
 extern int tcp_maxidle;			/* time to drop after starting probes */
 extern int tcp_ttl;			/* time to live for TCP segs */
-extern int tcp_backoff[];
+extern const int tcp_backoff[];
 
 void	tcp_timer_init(void);
 #endif /* _KERNEL */
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
+++ netinet/tcp_var.h	10 Nov 2024 12:40:45 -0000
@@ -228,6 +228,7 @@ struct tcp_opt_info {
 /*
  * Locks used to protect global data and struct members:
  *	I	immutable after creation
+ *	a	atomic operations
  *	N	net lock
  *	S	syn_cache_mtx		tcp syn cache global mutex
  */
@@ -678,16 +679,17 @@ extern	const struct pr_usrreqs tcp6_usrr
 extern	struct pool tcpcb_pool;
 extern	struct inpcbtable tcbtable, tcb6table;	/* queue of active tcpcb's */
 extern	int tcp_do_rfc1323;	/* enabled/disabled? */
+extern	int tcprexmtthresh;	/* [I] */
 extern	int tcptv_keep_init;	/* [N] time to keep alive initial SYN packet */
-extern	int tcp_mssdflt;	/* default maximum segment size */
+extern	int tcp_mssdflt;	/* [N] default maximum segment size */
 extern	int tcp_rst_ppslim;	/* maximum outgoing RST packet per second */
 extern	int tcp_ack_on_push;	/* ACK immediately on PUSH */
 extern	int tcp_do_sack;	/* SACK enabled/disabled */
 extern	struct pool sackhl_pool;
 extern	int tcp_sackhole_limit;	/* max entries for tcp sack queues */
-extern	int tcp_do_ecn;		/* RFC3168 ECN enabled/disabled? */
-extern	int tcp_do_rfc3390;	/* RFC3390 Increasing TCP's Initial Window */
-extern	int tcp_do_tso;		/* enable TSO for TCP output packets */
+extern	int tcp_do_ecn;		/* [N] RFC3168 ECN enabled/disabled? */
+extern	int tcp_do_rfc3390;	/* [a] RFC3390 Increasing TCP Initial Window */
+extern	int tcp_do_tso;		/* [a] enable TSO for TCP output packets */
 
 extern	struct pool tcpqe_pool;
 extern	int tcp_reass_limit;	/* max entries for tcp reass queues */
Index: netinet6/in6_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v
diff -u -p -r1.119 in6_proto.c
--- netinet6/in6_proto.c	20 Aug 2024 07:46:27 -0000	1.119
+++ netinet6/in6_proto.c	10 Nov 2024 12:40:45 -0000
@@ -147,7 +147,7 @@ const struct protosw inet6sw[] = {
   .pr_type	= SOCK_STREAM,
   .pr_domain	= &inet6domain,
   .pr_protocol	= IPPROTO_TCP,
-  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+  .pr_flags	= PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|PR_MPSOCKET,
   .pr_input	= tcp_input,
   .pr_ctlinput	= tcp6_ctlinput,
   .pr_ctloutput	= tcp_ctloutput,
Index: sys/mbuf.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.265 mbuf.h
--- sys/mbuf.h	5 Nov 2024 13:15:13 -0000	1.265
+++ sys/mbuf.h	10 Nov 2024 12:40:45 -0000
@@ -411,7 +411,7 @@ struct mbuf_queue {
 struct pool;
 
 extern	long nmbclust;			/* limit on the # of clusters */
-extern	int max_linkhdr;		/* largest link-level header */
+extern	int max_linkhdr;		/* [I] largest link-level header */
 extern	int max_protohdr;		/* largest protocol header */
 extern	int max_hdr;			/* largest link+protocol header */
 extern	struct cpumem *mbstat;		/* mbuf statistics counter */
Index: sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.134 socketvar.h
--- sys/socketvar.h	9 Sep 2024 07:38:45 -0000	1.134
+++ sys/socketvar.h	10 Nov 2024 12:40:45 -0000
@@ -376,7 +376,7 @@ sbassertlocked(struct sockbuf *sb)
 	}								\
 } while (/*CONSTCOND*/0)
 
-extern u_long sb_max;
+extern u_long sb_max;			/* [I] */
 
 extern struct pool	socket_pool;