Download raw body.
tcp receive unlock
Hi,
This diff contains everything I think is needed to access the socket
buffer from the read/write system calls without the exclusive netlock.
The idea is to hold the socket lock while running tcp_output().
Please test. Don't expect a huge performance increase, as TCP input
is still exclusively locked. But TCP syscalls should behave a bit
more smoothly.
bluhm
Index: kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
diff -u -p -r1.345 uipc_socket.c
--- kern/uipc_socket.c 8 Nov 2024 21:47:03 -0000 1.345
+++ kern/uipc_socket.c 10 Nov 2024 12:40:45 -0000
@@ -924,9 +924,11 @@ soreceive(struct socket *so, struct mbuf
flags = 0;
if (flags & MSG_OOB) {
m = m_get(M_WAIT, MT_DATA);
- solock(so);
+ if (dosolock)
+ solock_shared(so);
error = pru_rcvoob(so, m, flags & MSG_PEEK);
- sounlock(so);
+ if (dosolock)
+ sounlock_shared(so);
if (error)
goto bad;
do {
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.159 in_pcb.h
--- netinet/in_pcb.h 5 Nov 2024 10:49:23 -0000 1.159
+++ netinet/in_pcb.h 10 Nov 2024 12:40:45 -0000
@@ -140,7 +140,7 @@ struct inpcb {
u_int16_t inp_lport; /* [t] local port */
struct socket *inp_socket; /* [I] back pointer to socket */
caddr_t inp_ppcb; /* pointer to per-protocol pcb */
- struct route inp_route; /* cached route */
+ struct route inp_route; /* [s] cached route */
struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
int inp_flags; /* generic IP/datagram flags */
union { /* Header prototype. */
Index: netinet/in_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
diff -u -p -r1.113 in_proto.c
--- netinet/in_proto.c 22 Aug 2024 10:58:31 -0000 1.113
+++ netinet/in_proto.c 10 Nov 2024 12:40:45 -0000
@@ -197,7 +197,7 @@ const struct protosw inetsw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_TCP,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|PR_MPSOCKET,
.pr_input = tcp_input,
.pr_ctlinput = tcp_ctlinput,
.pr_ctloutput = tcp_ctloutput,
Index: netinet/ip_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
diff -u -p -r1.401 ip_input.c
--- netinet/ip_input.c 6 Aug 2024 16:56:09 -0000 1.401
+++ netinet/ip_input.c 10 Nov 2024 12:40:45 -0000
@@ -83,23 +83,16 @@
#include <netinet/ip_carp.h>
#endif
-/*
- * Locks used to protect global variables in this file:
- * I immutable after creation
- * a atomic operations
- * N net lock
- */
-
/* values controllable via sysctl */
-int ip_forwarding = 0; /* [a] */
+int ip_forwarding = 0;
int ipmforwarding = 0;
int ipmultipath = 0;
-int ip_sendredirects = 1; /* [a] */
+int ip_sendredirects = 1;
int ip_dosourceroute = 0;
int ip_defttl = IPDEFTTL;
int ip_mtudisc = 1;
int ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
-int ip_directedbcast = 0; /* [a] */
+int ip_directedbcast = 0;
/* Protects `ipq' and `ip_frags'. */
struct mutex ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);
Index: netinet/ip_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
diff -u -p -r1.120 ip_var.h
--- netinet/ip_var.h 12 Jul 2024 19:50:35 -0000 1.120
+++ netinet/ip_var.h 10 Nov 2024 12:40:45 -0000
@@ -36,6 +36,13 @@
#define _NETINET_IP_VAR_H_
/*
+ * Locks used to protect global variables in this file:
+ * I immutable after creation
+ * a atomic operations
+ * N net lock
+ */
+
+/*
* Structure stored in mbuf in inpcb.ip_options
* and passed to ip_output when ip options are in use.
* The actual length of the options (including ipopt_dst)
@@ -216,19 +223,20 @@ extern int ip_defttl; /* default IP tt
#define IPMTUDISCTIMEOUT (10 * 60) /* as per RFC 1191 */
-extern int ip_mtudisc; /* mtu discovery */
+extern int ip_mtudisc; /* [N] mtu discovery */
extern int ip_mtudisc_timeout; /* seconds to timeout mtu discovery */
extern int ipport_firstauto; /* min port for port allocation */
extern int ipport_lastauto; /* max port for port allocation */
extern int ipport_hifirstauto; /* min dynamic/private port number */
extern int ipport_hilastauto; /* max dynamic/private port number */
-extern int ip_forwarding; /* enable IP forwarding */
+extern int ip_forwarding; /* [a] enable IP forwarding */
#ifdef MROUTING
extern int ipmforwarding; /* enable multicast forwarding */
#endif
extern int ipmultipath; /* enable multipath routing */
-extern int ip_directedbcast; /* accept all broadcast packets */
+extern int ip_sendredirects; /* [a] send icmp redirect while forwd */
+extern int ip_directedbcast; /* [a] accept all broadcast packets */
extern unsigned int la_hold_total;
extern const struct pr_usrreqs rip_usrreqs;
Index: netinet/tcp_fsm.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_fsm.h,v
diff -u -p -r1.9 tcp_fsm.h
--- netinet/tcp_fsm.h 5 Feb 2018 14:53:26 -0000 1.9
+++ netinet/tcp_fsm.h 10 Nov 2024 12:40:45 -0000
@@ -68,7 +68,7 @@
* determined by state, with the proviso that TH_FIN is sent only
* if all data queued for output is included in the segment.
*/
-u_char tcp_outflags[TCP_NSTATES] = {
+const u_char tcp_outflags[TCP_NSTATES] = {
TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK,
TH_ACK, TH_ACK,
TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK,
@@ -76,7 +76,7 @@ u_char tcp_outflags[TCP_NSTATES] = {
#endif /* TCPOUTFLAGS */
#ifdef TCPSTATES
-const char *tcpstates[] = {
+const char *const tcpstates[] = {
"CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD",
"ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING",
"LAST_ACK", "FIN_WAIT_2", "TIME_WAIT",
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.408 tcp_input.c
--- netinet/tcp_input.c 8 Nov 2024 21:40:39 -0000 1.408
+++ netinet/tcp_input.c 10 Nov 2024 12:40:45 -0000
@@ -2790,11 +2790,11 @@ tcp_xmit_timer(struct tcpcb *tp, int32_t
int
tcp_mss(struct tcpcb *tp, int offer)
{
+ struct inpcb *inp;
struct rtentry *rt;
struct ifnet *ifp = NULL;
- int mss, mssopt;
- int iphlen;
- struct inpcb *inp;
+ int mss, mssopt, iphlen, do_rfc3390;
+ u_int rtmtu;
inp = tp->t_inpcb;
@@ -2827,12 +2827,13 @@ tcp_mss(struct tcpcb *tp, int offer)
* if there's an mtu associated with the route and we support
* path MTU discovery for the underlying protocol family, use it.
*/
- if (rt->rt_mtu) {
+ rtmtu = atomic_load_int(&rt->rt_mtu);
+ if (rtmtu) {
/*
* One may wish to lower MSS to take into account options,
* especially security-related options.
*/
- if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
+ if (tp->pf == AF_INET6 && rtmtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph: if path MTU is
* smaller than 1280, use 1280 as packet size and
@@ -2841,8 +2842,7 @@ tcp_mss(struct tcpcb *tp, int offer)
mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
sizeof(struct tcphdr);
} else {
- mss = rt->rt_mtu - iphlen -
- sizeof(struct tcphdr);
+ mss = rtmtu - iphlen - sizeof(struct tcphdr);
}
} else if (ifp->if_flags & IFF_LOOPBACK) {
mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
@@ -2902,6 +2902,7 @@ tcp_mss(struct tcpcb *tp, int offer)
mss -= TCPOLEN_SIGLEN;
#endif
+ do_rfc3390 = atomic_load_int(&tcp_do_rfc3390);
if (offer == -1) {
/* mss changed due to Path MTU discovery */
tp->t_flags &= ~TF_PMTUD_PEND;
@@ -2916,10 +2917,10 @@ tcp_mss(struct tcpcb *tp, int offer)
tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
mss, mss);
}
- } else if (tcp_do_rfc3390 == 2) {
+ } else if (do_rfc3390 == 2) {
/* increase initial window */
tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
- } else if (tcp_do_rfc3390) {
+ } else if (do_rfc3390) {
/* increase initial window */
tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
} else
@@ -2986,8 +2987,10 @@ tcp_mss_update(struct tcpcb *tp)
if (rt == NULL)
return;
+ mtx_enter(&so->so_snd.sb_mtx);
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss) {
+ mtx_leave(&so->so_snd.sb_mtx);
mss = bufsize;
/* Update t_maxseg and t_maxopd */
tcp_mss(tp, mss);
@@ -2996,8 +2999,10 @@ tcp_mss_update(struct tcpcb *tp)
if (bufsize > sb_max)
bufsize = sb_max;
(void)sbreserve(so, &so->so_snd, bufsize);
+ mtx_leave(&so->so_snd.sb_mtx);
}
+ mtx_enter(&so->so_rcv.sb_mtx);
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -3005,7 +3010,7 @@ tcp_mss_update(struct tcpcb *tp)
bufsize = sb_max;
(void)sbreserve(so, &so->so_rcv, bufsize);
}
-
+ mtx_leave(&so->so_rcv.sb_mtx);
}
/*
@@ -3052,32 +3057,36 @@ tcp_newreno_partialack(struct tcpcb *tp,
int
tcp_mss_adv(struct mbuf *m, int af)
{
- int mss = 0;
- int iphlen;
- struct ifnet *ifp = NULL;
+ struct ifnet *ifp;
+ int iphlen, mss, mssdflt;
- if (m && (m->m_flags & M_PKTHDR))
- ifp = if_get(m->m_pkthdr.ph_ifidx);
+ mssdflt = atomic_load_int(&tcp_mssdflt);
+
+ if (m == NULL || (m->m_flags & M_PKTHDR) == 0)
+ return mssdflt;
+
+ ifp = if_get(m->m_pkthdr.ph_ifidx);
+ if (ifp == NULL)
+ return mssdflt;
switch (af) {
case AF_INET:
- if (ifp != NULL)
- mss = ifp->if_mtu;
iphlen = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
- if (ifp != NULL)
- mss = ifp->if_mtu;
iphlen = sizeof(struct ip6_hdr);
break;
#endif
default:
unhandled_af(af);
}
+ mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
if_put(ifp);
- mss = mss - iphlen - sizeof(struct tcphdr);
- return (max(mss, tcp_mssdflt));
+
+ if (mss < mssdflt)
+ return mssdflt;
+ return mss;
}
/*
Index: netinet/tcp_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
diff -u -p -r1.145 tcp_output.c
--- netinet/tcp_output.c 14 May 2024 09:39:02 -0000 1.145
+++ netinet/tcp_output.c 10 Nov 2024 12:40:45 -0000
@@ -103,8 +103,6 @@
extern struct mbuf *m_copypack();
#endif
-extern int tcprexmtthresh;
-
#ifdef TCP_SACK_DEBUG
void tcp_print_holes(struct tcpcb *tp);
@@ -350,7 +348,7 @@ again:
txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
if (len > txmaxseg) {
- if (tcp_do_tso &&
+ if (atomic_load_int(&tcp_do_tso) &&
tp->t_inpcb->inp_options == NULL &&
tp->t_inpcb->inp_outputopts6 == NULL &&
#ifdef TCP_SIGNATURE
Index: netinet/tcp_timer.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.c,v
diff -u -p -r1.76 tcp_timer.c
--- netinet/tcp_timer.c 28 Jan 2024 20:34:25 -0000 1.76
+++ netinet/tcp_timer.c 10 Nov 2024 12:40:45 -0000
@@ -167,10 +167,10 @@ tcp_canceltimers(struct tcpcb *tp)
TCP_TIMER_DISARM(tp, i);
}
-int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
+const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
-int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
+const int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
/*
* TCP timer processing.
Index: netinet/tcp_timer.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_timer.h,v
diff -u -p -r1.21 tcp_timer.h
--- netinet/tcp_timer.h 29 Jan 2024 22:47:13 -0000 1.21
+++ netinet/tcp_timer.h 10 Nov 2024 12:40:45 -0000
@@ -154,13 +154,12 @@ typedef void (*tcp_timer_func_t)(void *)
extern const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS];
extern int tcp_delack_msecs; /* delayed ACK timeout in millisecs */
-extern int tcptv_keep_init;
extern int tcp_always_keepalive; /* assume SO_KEEPALIVE is always set */
extern int tcp_keepidle; /* time before keepalive probes begin */
extern int tcp_keepintvl; /* time between keepalive probes */
extern int tcp_maxidle; /* time to drop after starting probes */
extern int tcp_ttl; /* time to live for TCP segs */
-extern int tcp_backoff[];
+extern const int tcp_backoff[];
void tcp_timer_init(void);
#endif /* _KERNEL */
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h 13 May 2024 01:15:53 -0000 1.178
+++ netinet/tcp_var.h 10 Nov 2024 12:40:45 -0000
@@ -228,6 +228,7 @@ struct tcp_opt_info {
/*
* Locks used to protect global data and struct members:
* I immutable after creation
+ * a atomic operations
* N net lock
* S syn_cache_mtx tcp syn cache global mutex
*/
@@ -678,16 +679,17 @@ extern const struct pr_usrreqs tcp6_usrr
extern struct pool tcpcb_pool;
extern struct inpcbtable tcbtable, tcb6table; /* queue of active tcpcb's */
extern int tcp_do_rfc1323; /* enabled/disabled? */
+extern int tcprexmtthresh; /* [I] */
extern int tcptv_keep_init; /* [N] time to keep alive initial SYN packet */
-extern int tcp_mssdflt; /* default maximum segment size */
+extern int tcp_mssdflt; /* [N] default maximum segment size */
extern int tcp_rst_ppslim; /* maximum outgoing RST packet per second */
extern int tcp_ack_on_push; /* ACK immediately on PUSH */
extern int tcp_do_sack; /* SACK enabled/disabled */
extern struct pool sackhl_pool;
extern int tcp_sackhole_limit; /* max entries for tcp sack queues */
-extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */
-extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial Window */
-extern int tcp_do_tso; /* enable TSO for TCP output packets */
+extern int tcp_do_ecn; /* [N] RFC3168 ECN enabled/disabled? */
+extern int tcp_do_rfc3390; /* [a] RFC3390 Increasing TCP Initial Window */
+extern int tcp_do_tso; /* [a] enable TSO for TCP output packets */
extern struct pool tcpqe_pool;
extern int tcp_reass_limit; /* max entries for tcp reass queues */
Index: netinet6/in6_proto.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v
diff -u -p -r1.119 in6_proto.c
--- netinet6/in6_proto.c 20 Aug 2024 07:46:27 -0000 1.119
+++ netinet6/in6_proto.c 10 Nov 2024 12:40:45 -0000
@@ -147,7 +147,7 @@ const struct protosw inet6sw[] = {
.pr_type = SOCK_STREAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_TCP,
- .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE|PR_MPSOCKET,
.pr_input = tcp_input,
.pr_ctlinput = tcp6_ctlinput,
.pr_ctloutput = tcp_ctloutput,
Index: sys/mbuf.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
diff -u -p -r1.265 mbuf.h
--- sys/mbuf.h 5 Nov 2024 13:15:13 -0000 1.265
+++ sys/mbuf.h 10 Nov 2024 12:40:45 -0000
@@ -411,7 +411,7 @@ struct mbuf_queue {
struct pool;
extern long nmbclust; /* limit on the # of clusters */
-extern int max_linkhdr; /* largest link-level header */
+extern int max_linkhdr; /* [I] largest link-level header */
extern int max_protohdr; /* largest protocol header */
extern int max_hdr; /* largest link+protocol header */
extern struct cpumem *mbstat; /* mbuf statistics counter */
Index: sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
diff -u -p -r1.134 socketvar.h
--- sys/socketvar.h 9 Sep 2024 07:38:45 -0000 1.134
+++ sys/socketvar.h 10 Nov 2024 12:40:45 -0000
@@ -376,7 +376,7 @@ sbassertlocked(struct sockbuf *sb)
} \
} while (/*CONSTCOND*/0)
-extern u_long sb_max;
+extern u_long sb_max; /* [I] */
extern struct pool socket_pool;
tcp receive unlock