Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
tcp syn cache shared netlock
To:
tech@openbsd.org
Date:
Wed, 7 Aug 2024 14:24:30 +0200

Download raw body.

Thread
  • Alexander Bluhm:

    tcp syn cache shared netlock

Hi,

Running TCP SYN cache timer with shared net lock can be done by
using the socket lock of the listen socket.

All SYN cache functions grab this lock in addition with shares net
lock.  The rest of TCP stack still has exclusive net lock, so it
cannot interfere.  The SYN cache entry must now refence count the
inpcb of the listen socket.

ok?

bluhm

Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.406 tcp_input.c
--- netinet/tcp_input.c	7 Jun 2024 08:02:17 -0000	1.406
+++ netinet/tcp_input.c	7 Aug 2024 12:10:42 -0000
@@ -3146,7 +3146,8 @@ syn_cache_rm(struct syn_cache *sc)
 	KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
 	SET(sc->sc_dynflags, SCF_DEAD);
 	TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
-	sc->sc_tp = NULL;
+	in_pcbunref(sc->sc_inplisten);
+	sc->sc_inplisten = NULL;
 	LIST_REMOVE(sc, sc_tpq);
 	refcnt_rele(&sc->sc_refcnt);
 	sc->sc_buckethead->sch_length--;
@@ -3340,6 +3341,7 @@ void
 syn_cache_timer(void *arg)
 {
 	struct syn_cache *sc = arg;
+	struct inpcb *inp;
 	uint64_t now;
 	int lastref;
 
@@ -3368,14 +3370,20 @@ syn_cache_timer(void *arg)
 	    TCPTV_REXMTMAX);
 	if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
 		refcnt_take(&sc->sc_refcnt);
+	inp = in_pcbref(sc->sc_inplisten);
+	if (inp == NULL)
+		goto freeit;
 	mtx_leave(&syn_cache_mtx);
 
-	NET_LOCK();
 	now = tcp_now();
+	NET_LOCK_SHARED();
+	rw_enter_write(&inp->inp_socket->so_lock);
 	(void) syn_cache_respond(sc, NULL, now);
+	rw_exit_write(&inp->inp_socket->so_lock);
+	NET_UNLOCK_SHARED();
 	tcpstat_inc(tcps_sc_retransmitted);
-	NET_UNLOCK();
 
+	in_pcbunref(sc->sc_inplisten);
 	syn_cache_put(sc);
 	return;
 
@@ -3405,10 +3413,7 @@ syn_cache_cleanup(struct tcpcb *tp)
 
 	mtx_enter(&syn_cache_mtx);
 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
-#ifdef DIAGNOSTIC
-		if (sc->sc_tp != tp)
-			panic("invalid sc_tp in syn_cache_cleanup");
-#endif
+		KASSERT(sc->sc_inplisten == tp->t_inpcb);
 		syn_cache_rm(sc);
 		syn_cache_put(sc);
 	}
@@ -3490,11 +3495,13 @@ syn_cache_get(struct sockaddr *src, stru
 	u_int rtableid;
 
 	NET_ASSERT_LOCKED();
+	rw_enter_write(&so->so_lock);
 
 	mtx_enter(&syn_cache_mtx);
 	sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
 	if (sc == NULL) {
 		mtx_leave(&syn_cache_mtx);
+		rw_exit_write(&so->so_lock);
 		return (NULL);
 	}
 
@@ -3508,6 +3515,7 @@ syn_cache_get(struct sockaddr *src, stru
 		refcnt_take(&sc->sc_refcnt);
 		mtx_leave(&syn_cache_mtx);
 		(void) syn_cache_respond(sc, m, now);
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		return ((struct socket *)(-1));
 	}
@@ -3651,6 +3659,7 @@ syn_cache_get(struct sockaddr *src, stru
 		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 	tp->last_ack_sent = tp->rcv_nxt;
 
+	rw_exit_write(&oso->so_lock);
 	tcpstat_inc(tcps_sc_completed);
 	syn_cache_put(sc);
 	return (so);
@@ -3659,6 +3668,7 @@ resetandabort:
 	tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
 	    m->m_pkthdr.ph_rtableid, now);
 abort:
+	rw_exit_write(&oso->so_lock);
 	m_freem(m);
 	if (so != NULL)
 		soabort(so);
@@ -3766,7 +3776,7 @@ syn_cache_add(struct sockaddr *src, stru
 	struct mbuf *ipopts;
 
 	NET_ASSERT_LOCKED();
-
+	rw_enter_write(&so->so_lock);
 	tp = sototcpcb(so);
 
 	/*
@@ -3797,8 +3807,10 @@ syn_cache_add(struct sockaddr *src, stru
 #endif
 		tb.t_state = TCPS_LISTEN;
 		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
-		    sotoinpcb(so)->inp_rtableid, now))
+		    sotoinpcb(so)->inp_rtableid, now)) {
+			rw_exit_write(&so->so_lock);
 			return (-1);
+		}
 	}
 
 	switch (src->sa_family) {
@@ -3836,6 +3848,7 @@ syn_cache_add(struct sockaddr *src, stru
 			tcpstat_inc(tcps_sndacks);
 			tcpstat_inc(tcps_sndtotal);
 		}
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		return (0);
 	}
@@ -3843,6 +3856,7 @@ syn_cache_add(struct sockaddr *src, stru
 
 	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
 	if (sc == NULL) {
+		rw_exit_write(&so->so_lock);
 		m_free(ipopts);
 		return (-1);
 	}
@@ -3919,19 +3933,22 @@ syn_cache_add(struct sockaddr *src, stru
 	if (tb.t_flags & TF_SIGNATURE)
 		SET(sc->sc_fixflags, SCF_SIGNATURE);
 #endif
-	sc->sc_tp = tp;
+	sc->sc_inplisten = in_pcbref(tp->t_inpcb);
 	if (syn_cache_respond(sc, m, now) == 0) {
 		mtx_enter(&syn_cache_mtx);
 		/*
-		 * XXXSMP Currently exclusive netlock prevents another insert
-		 * after our syn_cache_lookup() and before syn_cache_insert().
-		 * Double insert should be handled and not rely on netlock.
+		 * Socket lock prevents another insert after our
+		 * syn_cache_lookup() and before syn_cache_insert().
 		 */
 		syn_cache_insert(sc, tp);
 		mtx_leave(&syn_cache_mtx);
+		rw_exit_write(&so->so_lock);
 		tcpstat_inc(tcps_sndacks);
 		tcpstat_inc(tcps_sndtotal);
 	} else {
+		in_pcbunref(sc->sc_inplisten);
+		sc->sc_inplisten = NULL;
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		tcpstat_inc(tcps_sc_dropped);
 	}
@@ -4128,7 +4145,7 @@ syn_cache_respond(struct syn_cache *sc, 
 
 	/* use IPsec policy and ttl from listening socket, on SYN ACK */
 	mtx_enter(&syn_cache_mtx);
-	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
+	inp = sc->sc_inplisten;
 	mtx_leave(&syn_cache_mtx);
 
 	/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
+++ netinet/tcp_var.h	7 Aug 2024 11:48:15 -0000
@@ -230,6 +230,7 @@ struct tcp_opt_info {
  *	I	immutable after creation
  *	N	net lock
  *	S	syn_cache_mtx		tcp syn cache global mutex
+ *	s	so_lock			socket lock of listen socket
  */
 
 extern struct mutex syn_cache_mtx;
@@ -247,11 +248,11 @@ struct syn_cache {
 	TAILQ_ENTRY(syn_cache) sc_bucketq;	/* [S] link on bucket list */
 	struct refcnt sc_refcnt;		/* ref count list and timer */
 	struct timeout sc_timer;		/* rexmt timer */
-	struct route sc_route;			/* [N] cached route */
+	struct route sc_route;			/* [s] cached route */
 	long sc_win;				/* [I] advertised window */
 	struct syn_cache_head *sc_buckethead;	/* [S] our bucket index */
 	struct syn_cache_set *sc_set;		/* [S] our syn cache set */
-	u_int64_t sc_timestamp;		/* [N] timestamp from SYN */
+	u_int64_t sc_timestamp;		/* [s] timestamp from SYN */
 	u_int32_t sc_hash;		/* [S] */
 	u_int32_t sc_modulate;		/* [I] our timestamp modulator */
 	union syn_cache_sa sc_src;	/* [I] */
@@ -272,13 +273,13 @@ struct syn_cache {
 #define SCF_ECN_PERMIT	0x0040U		/* permit ecn */
 #define SCF_SIGNATURE	0x0080U		/* enforce tcp signatures */
 
-	struct mbuf *sc_ipopts;			/* [N] IP options */
+	struct mbuf *sc_ipopts;			/* [s] IP options */
 	u_int16_t sc_peermaxseg;		/* [I] */
 	u_int16_t sc_ourmaxseg;			/* [I] */
 	u_int     sc_request_r_scale	: 4,	/* [I] */
 		  sc_requested_s_scale	: 4;	/* [I] */
 
-	struct tcpcb *sc_tp;		/* [S] tcb for listening socket */
+	struct inpcb *sc_inplisten;	/* [S] inpcb for listening socket */
 	LIST_ENTRY(syn_cache) sc_tpq;	/* [S] list of entries by same tp */
 };