From: Alexander Bluhm
Subject: Re: tcp syn cache shared netlock
To: Vitaliy Makkoveev
Cc: tech@openbsd.org
Date: Fri, 27 Sep 2024 14:27:50 +0200

On Thu, Aug 15, 2024 at 06:44:18PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Aug 15, 2024 at 04:56:56PM +0200, Alexander Bluhm wrote:
> > On Thu, Aug 15, 2024 at 05:31:53PM +0300, Vitaliy Makkoveev wrote:
> > > On Wed, Aug 14, 2024 at 10:37:48PM +0200, Alexander Bluhm wrote:
> > > > Hi Vitaliy,
> > > >
> > > > Can we use the socket lock of the listen socket for the syn cache?
> > > > Using this lock to protect the various TCP sockets looks like the
> > > > next reasonable step.
> > > >
> > > > What do you think?
> > > >
> > > > bluhm
> > >
> > > Hi Alexander,
> > >
> > > Sometimes, I think about per-`inp' lock for TCP pcb. A lot of work
> > > happening independent of sockets layer, so no reason to intersections
> > > with them. It looks reasonable to me, because I want to take `so_lock'
> > > before shared netlock and push the second to the pcb layer. This is just
> > > a thought, I haven't made any diffs yet.
> > >
> > > Your diff looks reasonable, but what about `inp_lock'?
> >
> > We had an inp_mutex before, that was no longer used. All the magic
> > happens in sb_mtx, so I removed it.
> >
> > Now I need a rwlock as we have to hold it over ip_output(). And
> > that may sleep waiting for pf lock.
> >
> > so_state and so_options are accessed from both socket and TCP layer.
> > My conclusion is that a common lock makes sense. UDP does not call
> > soisdisconnected(), so we get away with less locking.
> >
> > Of course we can introduce more fine grained locks. But I want to
> > lock individual TCP pcb and sockets now and run them in parallel.
> > Too many different locks make things complicated. Grabbing a lock
> > also takes some time. The next quick win should be running TCP
> > with shared netlock and use a rwlock for each (tcpcb, inpcb, socket)
> > combined.
> >
>
> I assume hypothetical `inp_lock' as rwlock, we often use _lock postfix
> for them, instead of _mtx for mutexes.
>
> Could TCP pcb exist without associated socket? Hypothetically it could
> because in_pcbdetach() does in_pcbunref(). In this case you can't use
> socket lock. Are you 100% sure, concurrent in_pcbdetach() can't kill
> `inp_socket' before NET_LOCK_SHARED()?

This is a valid point. As a quick fix we can use the exclusive
netlock for sofree(). In in_pcbdetach() set inp->inp_socket = NULL,
and in syn_cache_timer() check inp->inp_socket != NULL while holding
the shared netlock.

Of course we have to reconsider this when we make socket destruction
MP safe. But for now it is good enough. I want to continue unlocking
TCP with this hack.

Currently I think we should use inp->inp_socket->so_lock to protect
common structures of the socket and TCP state. We will see how that
works.

ok?

bluhm

Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.303 in_pcb.c
--- netinet/in_pcb.c	12 Jul 2024 19:50:35 -0000	1.303
+++ netinet/in_pcb.c	26 Sep 2024 10:56:12 -0000
@@ -591,6 +591,7 @@ in_pcbdetach(struct inpcb *inp)
 	 * points.
 	 */
 	sofree(so, 1);
+	inp->inp_socket = NULL;
 	if (inp->inp_route.ro_rt) {
 		rtfree(inp->inp_route.ro_rt);
 		inp->inp_route.ro_rt = NULL;
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.158 in_pcb.h
--- netinet/in_pcb.h	12 Jul 2024 19:50:35 -0000	1.158
+++ netinet/in_pcb.h	26 Sep 2024 10:56:12 -0000
@@ -138,7 +138,7 @@ struct inpcb {
 #define inp_laddr6	inp_laddru.iau_addr6
 	u_int16_t inp_fport;		/* [t] foreign port */
 	u_int16_t inp_lport;		/* [t] local port */
-	struct socket *inp_socket;	/* [I] back pointer to socket */
+	struct socket *inp_socket;	/* [N] back pointer to socket */
 	caddr_t inp_ppcb;		/* pointer to per-protocol pcb */
 	struct route inp_route;		/* cached route */
 	struct refcnt inp_refcnt;	/* refcount PCB, delay memory free */
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.407 tcp_input.c
--- netinet/tcp_input.c	26 Aug 2024 13:55:14 -0000	1.407
+++ netinet/tcp_input.c	26 Sep 2024 10:56:12 -0000
@@ -3147,7 +3147,8 @@ syn_cache_rm(struct syn_cache *sc)
 	KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
 	SET(sc->sc_dynflags, SCF_DEAD);
 	TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
-	sc->sc_tp = NULL;
+	in_pcbunref(sc->sc_inplisten);
+	sc->sc_inplisten = NULL;
 	LIST_REMOVE(sc, sc_tpq);
 	refcnt_rele(&sc->sc_refcnt);
 	sc->sc_buckethead->sch_length--;
@@ -3341,6 +3342,7 @@ void
 syn_cache_timer(void *arg)
 {
 	struct syn_cache *sc = arg;
+	struct inpcb *inp;
 	uint64_t now;
 	int lastref;
 
@@ -3369,14 +3371,23 @@ syn_cache_timer(void *arg)
 	    TCPTV_REXMTMAX);
 	if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
 		refcnt_take(&sc->sc_refcnt);
+	inp = in_pcbref(sc->sc_inplisten);
+	if (inp == NULL)
+		goto freeit;
 	mtx_leave(&syn_cache_mtx);
 
-	NET_LOCK();
 	now = tcp_now();
-	(void) syn_cache_respond(sc, NULL, now);
-	tcpstat_inc(tcps_sc_retransmitted);
-	NET_UNLOCK();
+	NET_LOCK_SHARED();
+	if (inp->inp_socket != NULL) {
+		/* only respond if socket has not been freed */
+		rw_enter_write(&inp->inp_socket->so_lock);
+		(void) syn_cache_respond(sc, NULL, now);
+		rw_exit_write(&inp->inp_socket->so_lock);
+		tcpstat_inc(tcps_sc_retransmitted);
+	}
+	NET_UNLOCK_SHARED();
+	in_pcbunref(sc->sc_inplisten);
 
 	syn_cache_put(sc);
 	return;
 
@@ -3406,10 +3417,7 @@ syn_cache_cleanup(struct tcpcb *tp)
 
 	mtx_enter(&syn_cache_mtx);
 	LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
-#ifdef DIAGNOSTIC
-		if (sc->sc_tp != tp)
-			panic("invalid sc_tp in syn_cache_cleanup");
-#endif
+		KASSERT(sc->sc_inplisten == tp->t_inpcb);
 		syn_cache_rm(sc);
 		syn_cache_put(sc);
 	}
@@ -3491,11 +3499,13 @@ syn_cache_get(struct sockaddr *src, stru
 	u_int rtableid;
 
 	NET_ASSERT_LOCKED();
+	rw_enter_write(&so->so_lock);
 
 	mtx_enter(&syn_cache_mtx);
 	sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
 	if (sc == NULL) {
 		mtx_leave(&syn_cache_mtx);
+		rw_exit_write(&so->so_lock);
 		return (NULL);
 	}
 
@@ -3509,6 +3519,7 @@ syn_cache_get(struct sockaddr *src, stru
 		refcnt_take(&sc->sc_refcnt);
 		mtx_leave(&syn_cache_mtx);
 		(void) syn_cache_respond(sc, m, now);
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		return ((struct socket *)(-1));
 	}
@@ -3652,6 +3663,7 @@ syn_cache_get(struct sockaddr *src, stru
 	tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 	tp->last_ack_sent = tp->rcv_nxt;
 
+	rw_exit_write(&oso->so_lock);
 	tcpstat_inc(tcps_sc_completed);
 	syn_cache_put(sc);
 	return (so);
@@ -3660,6 +3672,7 @@ resetandabort:
 	tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
 	    m->m_pkthdr.ph_rtableid, now);
 abort:
+	rw_exit_write(&oso->so_lock);
 	m_freem(m);
 	if (so != NULL)
 		soabort(so);
@@ -3767,7 +3780,7 @@ syn_cache_add(struct sockaddr *src, stru
 	struct mbuf *ipopts;
 
 	NET_ASSERT_LOCKED();
-
+	rw_enter_write(&so->so_lock);
 	tp = sototcpcb(so);
 
 	/*
@@ -3798,8 +3811,10 @@ syn_cache_add(struct sockaddr *src, stru
 #endif
 		tb.t_state = TCPS_LISTEN;
 		if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
-		    sotoinpcb(so)->inp_rtableid, now))
+		    sotoinpcb(so)->inp_rtableid, now)) {
+			rw_exit_write(&so->so_lock);
 			return (-1);
+		}
 	}
 
 	switch (src->sa_family) {
@@ -3837,6 +3852,7 @@ syn_cache_add(struct sockaddr *src, stru
 			tcpstat_inc(tcps_sndacks);
 			tcpstat_inc(tcps_sndtotal);
 		}
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		return (0);
 	}
@@ -3844,6 +3860,7 @@ syn_cache_add(struct sockaddr *src, stru
 
 	sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
 	if (sc == NULL) {
+		rw_exit_write(&so->so_lock);
 		m_free(ipopts);
 		return (-1);
 	}
@@ -3920,19 +3937,22 @@ syn_cache_add(struct sockaddr *src, stru
 	if (tb.t_flags & TF_SIGNATURE)
 		SET(sc->sc_fixflags, SCF_SIGNATURE);
 #endif
-	sc->sc_tp = tp;
+	sc->sc_inplisten = in_pcbref(tp->t_inpcb);
 	if (syn_cache_respond(sc, m, now) == 0) {
 		mtx_enter(&syn_cache_mtx);
 		/*
-		 * XXXSMP Currently exclusive netlock prevents another insert
-		 * after our syn_cache_lookup() and before syn_cache_insert().
-		 * Double insert should be handled and not rely on netlock.
+		 * Socket lock prevents another insert after our
+		 * syn_cache_lookup() and before syn_cache_insert().
 		 */
 		syn_cache_insert(sc, tp);
 		mtx_leave(&syn_cache_mtx);
+		rw_exit_write(&so->so_lock);
 		tcpstat_inc(tcps_sndacks);
 		tcpstat_inc(tcps_sndtotal);
 	} else {
+		in_pcbunref(sc->sc_inplisten);
+		sc->sc_inplisten = NULL;
+		rw_exit_write(&so->so_lock);
 		syn_cache_put(sc);
 		tcpstat_inc(tcps_sc_dropped);
 	}
@@ -4129,7 +4149,7 @@ syn_cache_respond(struct syn_cache *sc,
 
 	/* use IPsec policy and ttl from listening socket, on SYN ACK */
 	mtx_enter(&syn_cache_mtx);
-	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
+	inp = sc->sc_inplisten;
 	mtx_leave(&syn_cache_mtx);
 
 	/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h	13 May 2024 01:15:53 -0000	1.178
+++ netinet/tcp_var.h	26 Sep 2024 10:56:12 -0000
@@ -230,6 +230,7 @@ struct tcp_opt_info {
  *	I	immutable after creation
  *	N	net lock
  *	S	syn_cache_mtx		tcp syn cache global mutex
+ *	s	so_lock			socket lock of listen socket
  */
 
 extern struct mutex syn_cache_mtx;
@@ -247,11 +248,11 @@ struct syn_cache {
 	TAILQ_ENTRY(syn_cache) sc_bucketq;	/* [S] link on bucket list */
 	struct refcnt sc_refcnt;		/* ref count list and timer */
 	struct timeout sc_timer;		/* rexmt timer */
-	struct route sc_route;			/* [N] cached route */
+	struct route sc_route;			/* [s] cached route */
 	long sc_win;				/* [I] advertised window */
 	struct syn_cache_head *sc_buckethead;	/* [S] our bucket index */
 	struct syn_cache_set *sc_set;		/* [S] our syn cache set */
-	u_int64_t sc_timestamp;			/* [N] timestamp from SYN */
+	u_int64_t sc_timestamp;			/* [s] timestamp from SYN */
 	u_int32_t sc_hash;			/* [S] */
 	u_int32_t sc_modulate;			/* [I] our timestamp modulator */
 	union syn_cache_sa sc_src;		/* [I] */
@@ -272,13 +273,13 @@ struct syn_cache {
 #define SCF_ECN_PERMIT	0x0040U		/* permit ecn */
 #define SCF_SIGNATURE	0x0080U		/* enforce tcp signatures */
 
-	struct mbuf *sc_ipopts;			/* [N] IP options */
+	struct mbuf *sc_ipopts;			/* [s] IP options */
 	u_int16_t sc_peermaxseg;		/* [I] */
 	u_int16_t sc_ourmaxseg;			/* [I] */
 	u_int sc_request_r_scale : 4,		/* [I] */
 	    sc_requested_s_scale : 4;		/* [I] */
 
-	struct tcpcb *sc_tp;			/* [S] tcb for listening socket */
+	struct inpcb *sc_inplisten;		/* [S] inpcb for listening socket */
 	LIST_ENTRY(syn_cache) sc_tpq;		/* [S] list of entries by same tp */
 };
 
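To make the intent of the quick fix easier to follow, here is a
condensed sketch of both sides of the synchronization. This is
illustrative only and not part of the diff: the function names
detach_side_sketch() and timer_side_sketch() are made up, the bodies
keep only the locking steps, and the usual tcp_input.c kernel
includes are assumed.

/*
 * Writer side: in_pcbdetach() runs with the exclusive netlock, so a
 * reader under the shared netlock either sees the socket fully alive
 * or sees inp_socket already cleared, never a half-destroyed socket.
 */
void
detach_side_sketch(struct inpcb *inp)
{
	NET_ASSERT_LOCKED_EXCLUSIVE();
	sofree(inp->inp_socket, 1);
	/* Readers check this pointer under the shared netlock. */
	inp->inp_socket = NULL;
}

/*
 * Reader side: the syn cache timer holds a reference on the listening
 * inpcb and only takes so_lock if the socket is still attached.
 */
void
timer_side_sketch(struct syn_cache *sc, uint64_t now)
{
	struct inpcb *inp;

	mtx_enter(&syn_cache_mtx);
	/* NULL after syn_cache_rm() has detached the entry. */
	inp = in_pcbref(sc->sc_inplisten);
	mtx_leave(&syn_cache_mtx);
	if (inp == NULL)
		return;

	NET_LOCK_SHARED();
	if (inp->inp_socket != NULL) {
		/* Socket still exists; serialize output on its rwlock. */
		rw_enter_write(&inp->inp_socket->so_lock);
		(void) syn_cache_respond(sc, NULL, now);
		rw_exit_write(&inp->inp_socket->so_lock);
	}
	NET_UNLOCK_SHARED();

	in_pcbunref(inp);
}

The real in_pcbdetach() and syn_cache_timer() in the diff do more
(route cleanup, retransmit back-off, the freeit path), but the
lifetime rule is the same: the inpcb reference keeps the pcb
allocated, and the inp_socket check under the shared netlock makes
the so_lock dereference safe.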