tcp syn cache shared netlock
On Thu, Aug 15, 2024 at 06:44:18PM +0300, Vitaliy Makkoveev wrote:
> On Thu, Aug 15, 2024 at 04:56:56PM +0200, Alexander Bluhm wrote:
> > On Thu, Aug 15, 2024 at 05:31:53PM +0300, Vitaliy Makkoveev wrote:
> > > On Wed, Aug 14, 2024 at 10:37:48PM +0200, Alexander Bluhm wrote:
> > > > Hi Vitaliy,
> > > >
> > > > Can we use the socket lock of the listen socket for the syn cache?
> > > > Using this lock to protect the various TCP sockets looks like the
> > > > next reasonable step.
> > > >
> > > > What do you think?
> > > >
> > > > bluhm
> > >
> > > Hi Alexander,
> > >
> > > Sometimes I think about a per-`inp' lock for the TCP pcb. A lot of
> > > work happens independently of the socket layer, so there is no reason
> > > to intersect with it. It looks reasonable to me, because I want to
> > > take `so_lock' before the shared netlock and push the latter down to
> > > the pcb layer. This is just a thought, I haven't made any diffs yet.
> > >
> > > Your diff looks reasonable, but what about `inp_lock'?
> >
> > We had an inp_mutex before, but it was no longer used. All the
> > magic happens in sb_mtx, so I removed it.
> >
> > Now I need an rwlock, as we have to hold it over ip_output(), and
> > that may sleep waiting for the pf lock.
> >
> > so_state and so_options are accessed from both the socket and the
> > TCP layer. My conclusion is that a common lock makes sense. UDP
> > does not call soisdisconnected(), so we get away with less locking.
> >
> > Of course we can introduce more fine-grained locks. But I want to
> > lock individual TCP pcbs and sockets now and run them in parallel.
> > Too many different locks make things complicated. Grabbing a lock
> > also takes some time. The next quick win should be to run TCP with
> > the shared netlock and use one rwlock for each (tcpcb, inpcb,
> > socket) combination.
> >
>
> I assume the hypothetical `inp_lock' is an rwlock; we often use the
> _lock postfix for rwlocks, and _mtx for mutexes.
>
> Could a TCP pcb exist without an associated socket? Hypothetically it
> could, because in_pcbdetach() does in_pcbunref(). In that case you
> can't use the socket lock. Are you 100% sure a concurrent
> in_pcbdetach() can't kill `inp_socket' before NET_LOCK_SHARED()?
This is a valid point. As a quick fix we can use the exclusive
netlock for sofree(). In in_pcbdetach() set inp->inp_socket = NULL,
and in syn_cache_timer() check inp->inp_socket != NULL while holding
shared netlock.
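A minimal sketch of that interlock (hypothetical helper functions,
condensed from the diff below; the NET_LOCK() macros, sofree() and
the so_lock rwlock are the existing kernel interfaces):

	/* destruction path, serialized by the exclusive netlock */
	void
	pcb_detach_sketch(struct inpcb *inp)
	{
		NET_ASSERT_LOCKED_EXCLUSIVE();
		sofree(inp->inp_socket, 1);
		/* publish that the socket is gone */
		inp->inp_socket = NULL;
	}

	/* timer path, running with the shared netlock only */
	void
	timer_sketch(struct inpcb *inp)
	{
		NET_LOCK_SHARED();
		if (inp->inp_socket != NULL) {
			/* shared netlock excludes in_pcbdetach() */
			rw_enter_write(&inp->inp_socket->so_lock);
			/* ... send the SYN ACK here ... */
			rw_exit_write(&inp->inp_socket->so_lock);
		}
		NET_UNLOCK_SHARED();
	}

A reader holding the shared netlock sees either the socket pointer
or NULL, never a freed socket, because the NULL assignment only
happens with the netlock held exclusively.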
Of course we have to reconsider this when we make socket destruction
MP safe. But for now it is good enough. I want to continue unlocking
TCP with this hack. Currently I think we should use
inp->inp_socket->so_lock to protect the common structures of the
socket and the TCP state. We will see how that works.
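On the timer side the pcb itself is pinned with its reference count,
as in the diff below (condensed; sc_inplisten is only stable while
syn_cache_mtx is held, so the reference must be taken there):

	struct inpcb *inp;

	mtx_enter(&syn_cache_mtx);
	/* NULL if syn_cache_rm() has already detached the entry */
	inp = in_pcbref(sc->sc_inplisten);
	mtx_leave(&syn_cache_mtx);
	if (inp != NULL) {
		/* the pcb cannot be freed, but the socket may be gone */
		NET_LOCK_SHARED();
		if (inp->inp_socket != NULL) {
			/* respond under inp->inp_socket->so_lock */
		}
		NET_UNLOCK_SHARED();
		in_pcbunref(inp);
	}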
ok?
bluhm
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
diff -u -p -r1.303 in_pcb.c
--- netinet/in_pcb.c 12 Jul 2024 19:50:35 -0000 1.303
+++ netinet/in_pcb.c 26 Sep 2024 10:56:12 -0000
@@ -591,6 +591,7 @@ in_pcbdetach(struct inpcb *inp)
* points.
*/
sofree(so, 1);
+ inp->inp_socket = NULL;
if (inp->inp_route.ro_rt) {
rtfree(inp->inp_route.ro_rt);
inp->inp_route.ro_rt = NULL;
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
diff -u -p -r1.158 in_pcb.h
--- netinet/in_pcb.h 12 Jul 2024 19:50:35 -0000 1.158
+++ netinet/in_pcb.h 26 Sep 2024 10:56:12 -0000
@@ -138,7 +138,7 @@ struct inpcb {
#define inp_laddr6 inp_laddru.iau_addr6
u_int16_t inp_fport; /* [t] foreign port */
u_int16_t inp_lport; /* [t] local port */
- struct socket *inp_socket; /* [I] back pointer to socket */
+ struct socket *inp_socket; /* [N] back pointer to socket */
caddr_t inp_ppcb; /* pointer to per-protocol pcb */
struct route inp_route; /* cached route */
struct refcnt inp_refcnt; /* refcount PCB, delay memory free */
Index: netinet/tcp_input.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v
diff -u -p -r1.407 tcp_input.c
--- netinet/tcp_input.c 26 Aug 2024 13:55:14 -0000 1.407
+++ netinet/tcp_input.c 26 Sep 2024 10:56:12 -0000
@@ -3147,7 +3147,8 @@ syn_cache_rm(struct syn_cache *sc)
KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD));
SET(sc->sc_dynflags, SCF_DEAD);
TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
- sc->sc_tp = NULL;
+ in_pcbunref(sc->sc_inplisten);
+ sc->sc_inplisten = NULL;
LIST_REMOVE(sc, sc_tpq);
refcnt_rele(&sc->sc_refcnt);
sc->sc_buckethead->sch_length--;
@@ -3341,6 +3342,7 @@ void
syn_cache_timer(void *arg)
{
struct syn_cache *sc = arg;
+ struct inpcb *inp;
uint64_t now;
int lastref;
@@ -3369,14 +3371,23 @@ syn_cache_timer(void *arg)
TCPTV_REXMTMAX);
if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur))
refcnt_take(&sc->sc_refcnt);
+ inp = in_pcbref(sc->sc_inplisten);
+ if (inp == NULL)
+ goto freeit;
mtx_leave(&syn_cache_mtx);
- NET_LOCK();
now = tcp_now();
- (void) syn_cache_respond(sc, NULL, now);
- tcpstat_inc(tcps_sc_retransmitted);
- NET_UNLOCK();
+ NET_LOCK_SHARED();
+ if (inp->inp_socket != NULL) {
+ /* only respond if the socket has not been freed */
+ rw_enter_write(&inp->inp_socket->so_lock);
+ (void) syn_cache_respond(sc, NULL, now);
+ rw_exit_write(&inp->inp_socket->so_lock);
+ tcpstat_inc(tcps_sc_retransmitted);
+ }
+ NET_UNLOCK_SHARED();
+ in_pcbunref(inp);
syn_cache_put(sc);
return;
@@ -3406,10 +3417,7 @@ syn_cache_cleanup(struct tcpcb *tp)
mtx_enter(&syn_cache_mtx);
LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
-#ifdef DIAGNOSTIC
- if (sc->sc_tp != tp)
- panic("invalid sc_tp in syn_cache_cleanup");
-#endif
+ KASSERT(sc->sc_inplisten == tp->t_inpcb);
syn_cache_rm(sc);
syn_cache_put(sc);
}
@@ -3491,11 +3499,13 @@ syn_cache_get(struct sockaddr *src, stru
u_int rtableid;
NET_ASSERT_LOCKED();
+ rw_enter_write(&so->so_lock);
mtx_enter(&syn_cache_mtx);
sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
if (sc == NULL) {
mtx_leave(&syn_cache_mtx);
+ rw_exit_write(&so->so_lock);
return (NULL);
}
@@ -3509,6 +3519,7 @@ syn_cache_get(struct sockaddr *src, stru
refcnt_take(&sc->sc_refcnt);
mtx_leave(&syn_cache_mtx);
(void) syn_cache_respond(sc, m, now);
+ rw_exit_write(&so->so_lock);
syn_cache_put(sc);
return ((struct socket *)(-1));
}
@@ -3652,6 +3663,7 @@ syn_cache_get(struct sockaddr *src, stru
tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
tp->last_ack_sent = tp->rcv_nxt;
+ rw_exit_write(&oso->so_lock);
tcpstat_inc(tcps_sc_completed);
syn_cache_put(sc);
return (so);
@@ -3660,6 +3672,7 @@ resetandabort:
tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
m->m_pkthdr.ph_rtableid, now);
abort:
+ rw_exit_write(&oso->so_lock);
m_freem(m);
if (so != NULL)
soabort(so);
@@ -3767,7 +3780,7 @@ syn_cache_add(struct sockaddr *src, stru
struct mbuf *ipopts;
NET_ASSERT_LOCKED();
-
+ rw_enter_write(&so->so_lock);
tp = sototcpcb(so);
/*
@@ -3798,8 +3811,10 @@ syn_cache_add(struct sockaddr *src, stru
#endif
tb.t_state = TCPS_LISTEN;
if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
- sotoinpcb(so)->inp_rtableid, now))
+ sotoinpcb(so)->inp_rtableid, now)) {
+ rw_exit_write(&so->so_lock);
return (-1);
+ }
}
switch (src->sa_family) {
@@ -3837,6 +3852,7 @@ syn_cache_add(struct sockaddr *src, stru
tcpstat_inc(tcps_sndacks);
tcpstat_inc(tcps_sndtotal);
}
+ rw_exit_write(&so->so_lock);
syn_cache_put(sc);
return (0);
}
@@ -3844,6 +3860,7 @@ syn_cache_add(struct sockaddr *src, stru
sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
if (sc == NULL) {
+ rw_exit_write(&so->so_lock);
m_free(ipopts);
return (-1);
}
@@ -3920,19 +3937,22 @@ syn_cache_add(struct sockaddr *src, stru
if (tb.t_flags & TF_SIGNATURE)
SET(sc->sc_fixflags, SCF_SIGNATURE);
#endif
- sc->sc_tp = tp;
+ sc->sc_inplisten = in_pcbref(tp->t_inpcb);
if (syn_cache_respond(sc, m, now) == 0) {
mtx_enter(&syn_cache_mtx);
/*
- * XXXSMP Currently exclusive netlock prevents another insert
- * after our syn_cache_lookup() and before syn_cache_insert().
- * Double insert should be handled and not rely on netlock.
+ * Socket lock prevents another insert after our
+ * syn_cache_lookup() and before syn_cache_insert().
*/
syn_cache_insert(sc, tp);
mtx_leave(&syn_cache_mtx);
+ rw_exit_write(&so->so_lock);
tcpstat_inc(tcps_sndacks);
tcpstat_inc(tcps_sndtotal);
} else {
+ in_pcbunref(sc->sc_inplisten);
+ sc->sc_inplisten = NULL;
+ rw_exit_write(&so->so_lock);
syn_cache_put(sc);
tcpstat_inc(tcps_sc_dropped);
}
@@ -4129,7 +4149,7 @@ syn_cache_respond(struct syn_cache *sc,
/* use IPsec policy and ttl from listening socket, on SYN ACK */
mtx_enter(&syn_cache_mtx);
- inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
+ inp = sc->sc_inplisten;
mtx_leave(&syn_cache_mtx);
/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
diff -u -p -r1.178 tcp_var.h
--- netinet/tcp_var.h 13 May 2024 01:15:53 -0000 1.178
+++ netinet/tcp_var.h 26 Sep 2024 10:56:12 -0000
@@ -230,6 +230,7 @@ struct tcp_opt_info {
* I immutable after creation
* N net lock
* S syn_cache_mtx tcp syn cache global mutex
+ * s so_lock socket lock of listen socket
*/
extern struct mutex syn_cache_mtx;
@@ -247,11 +248,11 @@ struct syn_cache {
TAILQ_ENTRY(syn_cache) sc_bucketq; /* [S] link on bucket list */
struct refcnt sc_refcnt; /* ref count list and timer */
struct timeout sc_timer; /* rexmt timer */
- struct route sc_route; /* [N] cached route */
+ struct route sc_route; /* [s] cached route */
long sc_win; /* [I] advertised window */
struct syn_cache_head *sc_buckethead; /* [S] our bucket index */
struct syn_cache_set *sc_set; /* [S] our syn cache set */
- u_int64_t sc_timestamp; /* [N] timestamp from SYN */
+ u_int64_t sc_timestamp; /* [s] timestamp from SYN */
u_int32_t sc_hash; /* [S] */
u_int32_t sc_modulate; /* [I] our timestamp modulator */
union syn_cache_sa sc_src; /* [I] */
@@ -272,13 +273,13 @@ struct syn_cache {
#define SCF_ECN_PERMIT 0x0040U /* permit ecn */
#define SCF_SIGNATURE 0x0080U /* enforce tcp signatures */
- struct mbuf *sc_ipopts; /* [N] IP options */
+ struct mbuf *sc_ipopts; /* [s] IP options */
u_int16_t sc_peermaxseg; /* [I] */
u_int16_t sc_ourmaxseg; /* [I] */
u_int sc_request_r_scale : 4, /* [I] */
sc_requested_s_scale : 4; /* [I] */
- struct tcpcb *sc_tp; /* [S] tcb for listening socket */
+ struct inpcb *sc_inplisten; /* [S] inpcb for listening socket */
LIST_ENTRY(syn_cache) sc_tpq; /* [S] list of entries by same tp */
};