Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
Re: tcp syn cache shared netlock
To:
Vitaliy Makkoveev <mvs@openbsd.org>
Cc:
tech@openbsd.org
Date:
Fri, 27 Sep 2024 14:27:50 +0200

Download raw body.

Thread
  • Alexander Bluhm:

    tcp syn cache shared netlock

On Thu, Aug 15, 2024 at 06:44:18PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Aug 15, 2024 at 04:56:56PM +0200, Alexander Bluhm wrote:
> > On Thu, Aug 15, 2024 at 05:31:53PM +0300, Vitaliy Makkoveev wrote:
> > > On Wed, Aug 14, 2024 at 10:37:48PM +0200, Alexander Bluhm wrote:
> > > > Hi Vitaliy,
> > > > 
> > > > Can we use the socket lock of the listen socket for the syn cache?
> > > > Using this lock to protect the various TCP sockets looks like the
> > > > next reasonable step.
> > > > 
> > > > What do you think?
> > > > 
> > > > bluhm
> > > 
> > > Hi Alexander,
> > > 
> > > Sometimes, I think about per-`inp' lock for TCP pcb. A lot of work
> > > happening independent of sockets layer, so no reason to intersections
> > > with them. It looks reasonable to me, because I want to take `so_lock'
> > > before shared netlock and push the second to the pcb layer. This is just
> > > a thought, I haven't made any diffs yet.
> > > 
> > > Your diff looks reasonable, but what about `inp_lock'?
> > 
> > We had an inp_mutex before, that was no longer used.  All the magic
> > happens in sb_mtx, so I removed it.
> > 
> > Now I need a rwlock as we have to hold it over ip_output().  And
> > that may sleep waiting for pf lock.
> > 
> > so_state and so_options are accessed from both socket and TCP layer.
> > My conclusion is that a common lock makes sense.  UDP does not call
> > soisdisconnected(), so we get away with less locking.
> > 
> > Of course we can introduce more fine grained locks.  But I want to
> > lock individual TCP pcb and sockets now and run them in parallel.
> > Too many different locks make things complicated.  Grabbing a lock
> > also takes some time.  The next quick win should be running TCP
> > with shared netlock and use a rwlock for each (tcpcb, inpcb, socket)
> > combined.
> > 
> 
> I assume hypothetical `inp_lock' as rwlock, we often use _lock postfix
> for them, instead of _mtx for mutexes.
> 
> Could TCP pcb exist without associated socket? Hypothetically it could
> because in_pcbdetach() does in_pcbunref(). In this case you can't use
> socket lock. Are you 100% sure, concurrent in_pcbdetach() can't kill
> `inp_socket' before NET_LOCK_SHARED()?

This is a valid point.  As a quick fix we can use the exclusive
netlock for sofree().  In in_pcbdetach() set inp->inp_socket = NULL,
and in syn_cache_timer() check inp->inp_socket != NULL while holding
shared netlock.

Of corse we have to reconsider this when we make socket destruction
MP safe.  But for now it is good enough.  I want to continue unlocking
TCP with this hack.  Currently I think we should use
inp->inp_socket->so_lock to protect common structures of the socket
and TCP state.  We will see how that works.

ok?

bluhm

Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.303 in_pcb.c
--- netinet/in_pcb.c	12 Jul 2024 19:50:35 -0000	1.303
+++ netinet/in_pcb.c	26 Sep 2024 10:56:12 -0000
@@ -591,6 +591,7 @@ in_pcbdetach(struct inpcb *inp)
 	 * points.
 	 */
 	sofree(so, 1);
+	inp->inp_socket = NULL;
 	if (inp->inp_route.ro_rt) {
 		rtfree(inp->inp_route.ro_rt);
 		inp->inp_route.ro_rt = NULL;
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.158 in_pcb.h
--- netinet/in_pcb.h	12 Jul 2024 19:50:35 -0000	1.158
+++ netinet/in_pcb.h	26 Sep 2024 10:56:12 -0000
@@ -138,7 +138,7 @@ struct inpcb {
 #define	inp_laddr6	inp_laddru.iau_addr6
 	u_int16_t inp_fport;		/* [t] foreign port */
 	u_int16_t inp_lport;		/* [t] local port */
-	struct	  socket *inp_socket;	/* [I] back pointer to socket */
+	struct	  socket *inp_socket;	/* [N] back pointer to socket */
 	caddr_t	  inp_ppcb;		/* pointer to per-protocol pcb */
 	struct    route inp_route;	/* cached route */
 	struct    refcnt inp_refcnt;	/* refcount PCB, delay memory free */
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.407 tcp_input.c
--- netinet/tcp_input.c	26 Aug 2024 13:55:14 -0000	1.407
+++ netinet/tcp_input.c	26 Sep 2024 10:56:12 -0000
@@ -3147,7 +3147,8 @@ syn_cache_rm(struct syn_cache *sc)
 	KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
 	SET(sc->sc_dynflags, SCF_DEAD);
 	TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
-	sc->sc_tp = NULL;
+	in_pcbunref(sc->sc_inplisten);
+	sc->sc_inplisten = NULL;
 	LIST_REMOVE(sc, sc_tpq);
 	refcnt_rele(&sc->sc_refcnt);
 	sc->sc_buckethead->sch_length--;
@@ -3341,6 +3342,7 @@ void
 syn_cache_timer(void *arg)
 {
 	struct syn_cache *sc = arg;
+	struct inpcb *inp;
 	uint64_t now;
 	int lastref;
 
@@ -3369,14 +3371,23 @@ syn_cache_timer(void *arg)
 	    TCPTV_REXMTMAX);
 	if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
 		refcnt_take(&sc->sc_refcnt);
+	inp = in_pcbref(sc->sc_inplisten);
+	if (inp == NULL)
+		goto freeit;
 	mtx_leave(&syn_cache_mtx);
 
-	NET_LOCK();
 	now = tcp_now();
-	(void) syn_cache_respond(sc, NULL, now);
-	tcpstat_inc(tcps_sc_retransmitted);
-	NET_UNLOCK();
+	NET_LOCK_SHARED();
+	if (inp->inp_socket != NULL) {
+		/* only repond if socket has not been freed */
+		rw_enter_write(&inp->inp_socket->so_lock);
+		(void) syn_cache_respond(sc, NULL, now);
+		rw_exit_write(&inp->inp_socket->so_lock);
+		tcpstat_inc(tcps_sc_retransmitted);
+	}
+	NET_UNLOCK_SHARED();
 
+	in_pcbunref(sc->sc_inplisten);
 	syn_cache_put(sc);
 	return;
 
@@ -3406,10 +3417,7 @@ syn_cache_cleanup(struct tcpcb *tp)
 
 	mtx_enter(&syn_cache_mtx);
 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
-#ifdef DIAGNOSTIC
-		if (sc->sc_tp != tp)
-			panic("invalid sc_tp in syn_cache_cleanup");
-#endif
+		KASSERT(sc->sc_inplisten == tp->t_inpcb);
 		syn_cache_rm(sc);
 		syn_cache_put(sc);
 	}
@@ -3491,11 +3499,13 @@ syn_cache_get(struct sockaddr *src, stru
 	u_int rtableid;
 
 	NET_ASSERT_LOCKED();
+	rw_enter_write(&so->so_lock);
 
 	mtx_enter(&syn_cache_mtx);
 	sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
 	if (sc == NULL) {
 		mtx_leave(&syn_cache_mtx);
+		rw_exit_write(&so->so_lock);
 		return (NULL);
 	}
 
@@ -3509,6 +3519,7 @@ syn_cache_get(struct sockaddr *src, stru
 		refcnt_take(&sc->sc_refcnt);
 		mtx_leave(&syn_cache_mtx);
 		(void) syn_cache_respond(sc, m, now);
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		return ((struct socket *)(-1));
 	}
@@ -3652,6 +3663,7 @@ syn_cache_get(struct sockaddr *src, stru
 		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 	tp->last_ack_sent = tp->rcv_nxt;
 
+	rw_exit_write(&oso->so_lock);
 	tcpstat_inc(tcps_sc_completed);
 	syn_cache_put(sc);
 	return (so);
@@ -3660,6 +3672,7 @@ resetandabort:
 	tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
 	    m->m_pkthdr.ph_rtableid, now);
 abort:
+	rw_exit_write(&oso->so_lock);
 	m_freem(m);
 	if (so != NULL)
 		soabort(so);
@@ -3767,7 +3780,7 @@ syn_cache_add(struct sockaddr *src, stru
 	struct mbuf *ipopts;
 
 	NET_ASSERT_LOCKED();
-
+	rw_enter_write(&so->so_lock);
 	tp = sototcpcb(so);
 
 	/*
@@ -3798,8 +3811,10 @@ syn_cache_add(struct sockaddr *src, stru
 #endif
 		tb.t_state = TCPS_LISTEN;
 		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
-		    sotoinpcb(so)->inp_rtableid, now))
+		    sotoinpcb(so)->inp_rtableid, now)) {
+			rw_exit_write(&so->so_lock);
 			return (-1);
+		}
 	}
 
 	switch (src->sa_family) {
@@ -3837,6 +3852,7 @@ syn_cache_add(struct sockaddr *src, stru
 			tcpstat_inc(tcps_sndacks);
 			tcpstat_inc(tcps_sndtotal);
 		}
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		return (0);
 	}
@@ -3844,6 +3860,7 @@ syn_cache_add(struct sockaddr *src, stru
 
 	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
 	if (sc == NULL) {
+		rw_exit_write(&so->so_lock);
 		m_free(ipopts);
 		return (-1);
 	}
@@ -3920,19 +3937,22 @@ syn_cache_add(struct sockaddr *src, stru
 	if (tb.t_flags & TF_SIGNATURE)
 		SET(sc->sc_fixflags, SCF_SIGNATURE);
 #endif
-	sc->sc_tp = tp;
+	sc->sc_inplisten = in_pcbref(tp->t_inpcb);
 	if (syn_cache_respond(sc, m, now) == 0) {
 		mtx_enter(&syn_cache_mtx);
 		/*
-		 * XXXSMP Currently exclusive netlock prevents another insert
-		 * after our syn_cache_lookup() and before syn_cache_insert().
-		 * Double insert should be handled and not rely on netlock.
+		 * Socket lock prevents another insert after our
+		 * syn_cache_lookup() and before syn_cache_insert().
 		 */
 		syn_cache_insert(sc, tp);
 		mtx_leave(&syn_cache_mtx);
+		rw_exit_write(&so->so_lock);
 		tcpstat_inc(tcps_sndacks);
 		tcpstat_inc(tcps_sndtotal);
 	} else {
+		in_pcbunref(sc->sc_inplisten);
+		sc->sc_inplisten = NULL;
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		tcpstat_inc(tcps_sc_dropped);
 	}
@@ -4129,7 +4149,7 @@ syn_cache_respond(struct syn_cache *sc, 
 
 	/* use IPsec policy and ttl from listening socket, on SYN ACK */
 	mtx_enter(&syn_cache_mtx);
-	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
+	inp = sc->sc_inplisten;
 	mtx_leave(&syn_cache_mtx);
 
 	/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
+++ netinet/tcp_var.h	26 Sep 2024 10:56:12 -0000
@@ -230,6 +230,7 @@ struct tcp_opt_info {
  *	I	immutable after creation
  *	N	net lock
  *	S	syn_cache_mtx		tcp syn cache global mutex
+ *	s	so_lock			socket lock of listen socket
  */
 
 extern struct mutex syn_cache_mtx;
@@ -247,11 +248,11 @@ struct syn_cache {
 	TAILQ_ENTRY(syn_cache) sc_bucketq;	/* [S] link on bucket list */
 	struct refcnt sc_refcnt;		/* ref count list and timer */
 	struct timeout sc_timer;		/* rexmt timer */
-	struct route sc_route;			/* [N] cached route */
+	struct route sc_route;			/* [s] cached route */
 	long sc_win;				/* [I] advertised window */
 	struct syn_cache_head *sc_buckethead;	/* [S] our bucket index */
 	struct syn_cache_set *sc_set;		/* [S] our syn cache set */
-	u_int64_t sc_timestamp;		/* [N] timestamp from SYN */
+	u_int64_t sc_timestamp;		/* [s] timestamp from SYN */
 	u_int32_t sc_hash;		/* [S] */
 	u_int32_t sc_modulate;		/* [I] our timestamp modulator */
 	union syn_cache_sa sc_src;	/* [I] */
@@ -272,13 +273,13 @@ struct syn_cache {
 #define SCF_ECN_PERMIT	0x0040U		/* permit ecn */
 #define SCF_SIGNATURE	0x0080U		/* enforce tcp signatures */
 
-	struct mbuf *sc_ipopts;			/* [N] IP options */
+	struct mbuf *sc_ipopts;			/* [s] IP options */
 	u_int16_t sc_peermaxseg;		/* [I] */
 	u_int16_t sc_ourmaxseg;			/* [I] */
 	u_int     sc_request_r_scale	: 4,	/* [I] */
 		  sc_requested_s_scale	: 4;	/* [I] */
 
-	struct tcpcb *sc_tp;		/* [S] tcb for listening socket */
+	struct inpcb *sc_inplisten;	/* [S] inpcb for listening socket */
 	LIST_ENTRY(syn_cache) sc_tpq;	/* [S] list of entries by same tp */
 };